Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 17:27:16 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

440
888
54
2
3,087
1,882
332
0.37
16.44
27
6.15

Classes

Class Line # Actions
AlignmentUtils 82 882 326
0.804824680.5%
AlignmentUtils.DnaVariant 98 6 6
0.00%
 

Contributing tests

This file is covered by 138 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41   
42    import jalview.api.AlignCalcWorkerI;
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.GeneLociI;
55    import jalview.datamodel.IncompleteCodonException;
56    import jalview.datamodel.Mapping;
57    import jalview.datamodel.PDBEntry;
58    import jalview.datamodel.SeqCigar;
59    import jalview.datamodel.Sequence;
60    import jalview.datamodel.SequenceFeature;
61    import jalview.datamodel.SequenceGroup;
62    import jalview.datamodel.SequenceI;
63    import jalview.datamodel.features.SequenceFeatures;
64    import jalview.gui.AlignmentPanel;
65    import jalview.io.gff.SequenceOntologyI;
66    import jalview.schemes.ResidueProperties;
67    import jalview.util.Comparison;
68    import jalview.util.Constants;
69    import jalview.util.DBRefUtils;
70    import jalview.util.IntRangeComparator;
71    import jalview.util.MapList;
72    import jalview.util.MappingUtils;
73    import jalview.workers.SecondaryStructureConsensusThread;
74   
75    /**
76    * grab bag of useful alignment manipulation operations Expect these to be
77    * refactored elsewhere at some point.
78    *
79    * @author jimp
80    *
81    */
 
82    public class AlignmentUtils
83    {
84    private static final int CODON_LENGTH = 3;
85   
86    private static final String SEQUENCE_VARIANT = "sequence_variant:";
87   
88    /*
89    * the 'id' attribute is provided for variant features fetched from
90    * Ensembl using its REST service with JSON format
91    */
92    public static final String VARIANT_ID = "id";
93   
94    /**
95    * A data model to hold the 'normal' base value at a position, and an optional
96    * sequence variant feature
97    */
 
98    static final class DnaVariant
99    {
100    final String base;
101   
102    SequenceFeature variant;
103   
 
104  0 toggle DnaVariant(String nuc)
105    {
106  0 base = nuc;
107  0 variant = null;
108    }
109   
 
110  0 toggle DnaVariant(String nuc, SequenceFeature var)
111    {
112  0 base = nuc;
113  0 variant = var;
114    }
115   
 
116  0 toggle public String getSource()
117    {
118  0 return variant == null ? null : variant.getFeatureGroup();
119    }
120   
121    /**
122    * toString for aid in the debugger only
123    */
 
124  0 toggle @Override
125    public String toString()
126    {
127  0 return base + ":" + (variant == null ? "" : variant.getDescription());
128    }
129    }
130   
131    /**
132    * given an existing alignment, create a new alignment including all, or up to
133    * flankSize additional symbols from each sequence's dataset sequence
134    *
135    * @param core
136    * @param flankSize
137    * @return AlignmentI
138    */
 
139  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
140    {
141  27 List<SequenceI> sq = new ArrayList<>();
142  27 int maxoffset = 0;
143  27 for (SequenceI s : core.getSequences())
144    {
145  131 SequenceI newSeq = s.deriveSequence();
146  131 final int newSeqStart = newSeq.getStart() - 1;
147  131 if (newSeqStart > maxoffset
148    && newSeq.getDatasetSequence().getStart() < s.getStart())
149    {
150  131 maxoffset = newSeqStart;
151    }
152  131 sq.add(newSeq);
153    }
154  27 if (flankSize > -1)
155    {
156  25 maxoffset = Math.min(maxoffset, flankSize);
157    }
158   
159    /*
160    * now add offset left and right to create an expanded alignment
161    */
162  27 for (SequenceI s : sq)
163    {
164  131 SequenceI ds = s;
165  262 while (ds.getDatasetSequence() != null)
166    {
167  131 ds = ds.getDatasetSequence();
168    }
169  131 int s_end = s.findPosition(s.getStart() + s.getLength());
170    // find available flanking residues for sequence
171  131 int ustream_ds = s.getStart() - ds.getStart();
172  131 int dstream_ds = ds.getEnd() - s_end;
173   
174    // build new flanked sequence
175   
176    // compute gap padding to start of flanking sequence
177  131 int offset = maxoffset - ustream_ds;
178   
179    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
180  131 if (flankSize >= 0)
181    {
182  125 if (flankSize < ustream_ds)
183    {
184    // take up to flankSize residues
185  40 offset = maxoffset - flankSize;
186  40 ustream_ds = flankSize;
187    }
188  125 if (flankSize <= dstream_ds)
189    {
190  116 dstream_ds = flankSize - 1;
191    }
192    }
193    // TODO use Character.toLowerCase to avoid creating String objects?
194  131 char[] upstream = new String(ds
195    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
196    .toLowerCase(Locale.ROOT).toCharArray();
197  131 char[] downstream = new String(
198    ds.getSequence(s_end - 1, s_end + dstream_ds))
199    .toLowerCase(Locale.ROOT).toCharArray();
200  131 char[] coreseq = s.getSequence();
201  131 char[] nseq = new char[offset + upstream.length + downstream.length
202    + coreseq.length];
203  131 char c = core.getGapCharacter();
204   
205  131 int p = 0;
206  461 for (; p < offset; p++)
207    {
208  330 nseq[p] = c;
209    }
210   
211  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
212  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
213    coreseq.length);
214  131 System.arraycopy(downstream, 0, nseq,
215    p + coreseq.length + upstream.length, downstream.length);
216  131 s.setSequence(new String(nseq));
217  131 s.setStart(s.getStart() - ustream_ds);
218  131 s.setEnd(s_end + downstream.length);
219    }
220  27 AlignmentI newAl = new jalview.datamodel.Alignment(
221    sq.toArray(new SequenceI[0]));
222  27 for (SequenceI s : sq)
223    {
224  131 if (s.getAnnotation() != null)
225    {
226  1 for (AlignmentAnnotation aa : s.getAnnotation())
227    {
228  1 aa.adjustForAlignment(); // JAL-1712 fix
229  1 newAl.addAnnotation(aa);
230    }
231    }
232    }
233  27 newAl.setDataset(core.getDataset());
234  27 return newAl;
235    }
236   
237    /**
238    * Returns the index (zero-based position) of a sequence in an alignment, or
239    * -1 if not found.
240    *
241    * @param al
242    * @param seq
243    * @return
244    */
 
245  58491 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
246    {
247  58491 int result = -1;
248  58491 int pos = 0;
249  58491 for (SequenceI alSeq : al.getSequences())
250    {
251  126222796 if (alSeq == seq)
252    {
253  58452 result = pos;
254  58452 break;
255    }
256  126164423 pos++;
257    }
258  58491 return result;
259    }
260   
261    /**
262    * Returns a map of lists of sequences in the alignment, keyed by sequence
263    * name. For use in mapping between different alignment views of the same
264    * sequences.
265    *
266    * @see jalview.datamodel.AlignmentI#getSequencesByName()
267    */
 
268  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
269    AlignmentI al)
270    {
271  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
272  1 for (SequenceI seq : al.getSequences())
273    {
274  3 String name = seq.getName();
275  3 if (name != null)
276    {
277  3 List<SequenceI> seqs = theMap.get(name);
278  3 if (seqs == null)
279    {
280  2 seqs = new ArrayList<>();
281  2 theMap.put(name, seqs);
282    }
283  3 seqs.add(seq);
284    }
285    }
286  1 return theMap;
287    }
288   
289    /**
290    * Build mapping of protein to cDNA alignment. Mappings are made between
291    * sequences where the cDNA translates to the protein sequence. Any new
292    * mappings are added to the protein alignment. Returns true if any mappings
293    * either already exist or were added, else false.
294    *
295    * @param proteinAlignment
296    * @param cdnaAlignment
297    * @return
298    */
 
299  4 toggle public static boolean mapProteinAlignmentToCdna(
300    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
301    {
302  4 if (proteinAlignment == null || cdnaAlignment == null)
303    {
304  0 return false;
305    }
306   
307  4 Set<SequenceI> mappedDna = new HashSet<>();
308  4 Set<SequenceI> mappedProtein = new HashSet<>();
309   
310    /*
311    * First pass - map sequences where cross-references exist. This include
312    * 1-to-many mappings to support, for example, variant cDNA.
313    */
314  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
315    cdnaAlignment, mappedDna, mappedProtein, true);
316   
317    /*
318    * Second pass - map sequences where no cross-references exist. This only
319    * does 1-to-1 mappings and assumes corresponding sequences are in the same
320    * order in the alignments.
321    */
322  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
323    mappedDna, mappedProtein, false);
324  4 return mappingPerformed;
325    }
326   
327    /**
328    * Make mappings between compatible sequences (where the cDNA translation
329    * matches the protein).
330    *
331    * @param proteinAlignment
332    * @param cdnaAlignment
333    * @param mappedDna
334    * a set of mapped DNA sequences (to add to)
335    * @param mappedProtein
336    * a set of mapped Protein sequences (to add to)
337    * @param xrefsOnly
338    * if true, only map sequences where xrefs exist
339    * @return
340    */
 
341  8 toggle protected static boolean mapProteinToCdna(
342    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
343    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
344    boolean xrefsOnly)
345    {
346  8 boolean mappingExistsOrAdded = false;
347  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
348  8 for (SequenceI aaSeq : thisSeqs)
349    {
350  22 boolean proteinMapped = false;
351  22 AlignedCodonFrame acf = new AlignedCodonFrame();
352   
353  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
354    {
355    /*
356    * Always try to map if sequences have xref to each other; this supports
357    * variant cDNA or alternative splicing for a protein sequence.
358    *
359    * If no xrefs, try to map progressively, assuming that alignments have
360    * mappable sequences in corresponding order. These are not
361    * many-to-many, as that would risk mixing species with similar cDNA
362    * sequences.
363    */
364  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
365    {
366  39 continue;
367    }
368   
369    /*
370    * Don't map non-xrefd sequences more than once each. This heuristic
371    * allows us to pair up similar sequences in ordered alignments.
372    */
373  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
374    || mappedDna.contains(cdnaSeq)))
375    {
376  29 continue;
377    }
378  18 if (mappingExists(proteinAlignment.getCodonFrames(),
379    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
380    {
381  0 mappingExistsOrAdded = true;
382    }
383    else
384    {
385  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
386  18 if (map != null)
387    {
388  12 acf.addMap(cdnaSeq, aaSeq, map);
389  12 mappingExistsOrAdded = true;
390  12 proteinMapped = true;
391  12 mappedDna.add(cdnaSeq);
392  12 mappedProtein.add(aaSeq);
393    }
394    }
395    }
396  22 if (proteinMapped)
397    {
398  11 proteinAlignment.addCodonFrame(acf);
399    }
400    }
401  8 return mappingExistsOrAdded;
402    }
403   
404    /**
405    * Answers true if the mappings include one between the given (dataset)
406    * sequences.
407    */
 
408  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
409    SequenceI aaSeq, SequenceI cdnaSeq)
410    {
411  18 if (mappings != null)
412    {
413  18 for (AlignedCodonFrame acf : mappings)
414    {
415  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
416    {
417  0 return true;
418    }
419    }
420    }
421  18 return false;
422    }
423   
424    /**
425    * Builds a mapping (if possible) of a cDNA to a protein sequence.
426    * <ul>
427    * <li>first checks if the cdna translates exactly to the protein
428    * sequence</li>
429    * <li>else checks for translation after removing a STOP codon</li>
430    * <li>else checks for translation after removing a START codon</li>
431    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
432    * </ul>
433    * Returns null if no mapping is determined.
434    *
435    * @param proteinSeq
436    * the aligned protein sequence
437    * @param cdnaSeq
438    * the aligned cdna sequence
439    * @return
440    */
 
441  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
442    SequenceI cdnaSeq)
443    {
444    /*
445    * Here we handle either dataset sequence set (desktop) or absent (applet).
446    * Use only the char[] form of the sequence to avoid creating possibly large
447    * String objects.
448    */
449  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
450  31 char[] aaSeqChars = proteinDataset != null
451    ? proteinDataset.getSequence()
452    : proteinSeq.getSequence();
453  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
454  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
455    : cdnaSeq.getSequence();
456  31 if (aaSeqChars == null || cdnaSeqChars == null)
457    {
458  0 return null;
459    }
460   
461    /*
462    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
463    */
464  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
465  31 int cdnaLength = cdnaSeqChars.length;
466  31 int cdnaStart = cdnaSeq.getStart();
467  31 int cdnaEnd = cdnaSeq.getEnd();
468  31 final int proteinStart = proteinSeq.getStart();
469  31 final int proteinEnd = proteinSeq.getEnd();
470   
471    /*
472    * If lengths don't match, try ignoring stop codon (if present)
473    */
474  31 if (cdnaLength != mappedLength && cdnaLength > 2)
475    {
476  16 String lastCodon = String.valueOf(cdnaSeqChars,
477    cdnaLength - CODON_LENGTH, CODON_LENGTH)
478    .toUpperCase(Locale.ROOT);
479  16 for (String stop : ResidueProperties.STOP_CODONS)
480    {
481  47 if (lastCodon.equals(stop))
482    {
483  2 cdnaEnd -= CODON_LENGTH;
484  2 cdnaLength -= CODON_LENGTH;
485  2 break;
486    }
487    }
488    }
489   
490    /*
491    * If lengths still don't match, try ignoring start codon.
492    */
493  31 int startOffset = 0;
494  31 if (cdnaLength != mappedLength && cdnaLength > 2
495    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
496    .toUpperCase(Locale.ROOT)
497    .equals(ResidueProperties.START))
498    {
499  5 startOffset += CODON_LENGTH;
500  5 cdnaStart += CODON_LENGTH;
501  5 cdnaLength -= CODON_LENGTH;
502    }
503   
504  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
505    {
506    /*
507    * protein is translation of dna (+/- start/stop codons)
508    */
509  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
510    new int[]
511    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
512  15 return map;
513    }
514   
515    /*
516    * translation failed - try mapping CDS annotated regions of dna
517    */
518  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
519    }
520   
521    /**
522    * Test whether the given cdna sequence, starting at the given offset,
523    * translates to the given amino acid sequence, using the standard translation
524    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
525    *
526    * @param cdnaSeqChars
527    * @param cdnaStart
528    * @param aaSeqChars
529    * @return
530    */
 
531  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
532    char[] aaSeqChars)
533    {
534  51 if (cdnaSeqChars == null || aaSeqChars == null)
535    {
536  3 return false;
537    }
538   
539  48 int aaPos = 0;
540  48 int dnaPos = cdnaStart;
541  161 for (; dnaPos < cdnaSeqChars.length - 2
542    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
543    {
544  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
545  130 final String translated = ResidueProperties.codonTranslate(codon);
546   
547    /*
548    * allow * in protein to match untranslatable in dna
549    */
550  130 final char aaRes = aaSeqChars[aaPos];
551  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
552    && aaRes == '*')
553    {
554  4 continue;
555    }
556  126 if (translated == null || !(aaRes == translated.charAt(0)))
557    {
558    // debug
559    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
560    // + ": "
561    // + codon + "(" + translated + ") != " + aaRes));
562  17 return false;
563    }
564    }
565   
566    /*
567    * check we matched all of the protein sequence
568    */
569  31 if (aaPos != aaSeqChars.length)
570    {
571  2 return false;
572    }
573   
574    /*
575    * check we matched all of the dna except
576    * for optional trailing STOP codon
577    */
578  29 if (dnaPos == cdnaSeqChars.length)
579    {
580  17 return true;
581    }
582  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
583    {
584  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
585  11 if (ResidueProperties.STOP
586    .equals(ResidueProperties.codonTranslate(codon)))
587    {
588  9 return true;
589    }
590    }
591  3 return false;
592    }
593   
594    /**
595    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
596    * currently assumes that we are aligning cDNA to match protein.
597    *
598    * @param seq
599    * the sequence to be realigned
600    * @param al
601    * the alignment whose sequence alignment is to be 'copied'
602    * @param gap
603    * character string represent a gap in the realigned sequence
604    * @param preserveUnmappedGaps
605    * @param preserveMappedGaps
606    * @return true if the sequence was realigned, false if it could not be
607    */
 
608  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
609    String gap, boolean preserveMappedGaps,
610    boolean preserveUnmappedGaps)
611    {
612    /*
613    * Get any mappings from the source alignment to the target (dataset)
614    * sequence.
615    */
616    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
617    // all mappings. Would it help to constrain this?
618  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
619  8 if (mappings == null || mappings.isEmpty())
620    {
621  0 return false;
622    }
623   
624    /*
625    * Locate the aligned source sequence whose dataset sequence is mapped. We
626    * just take the first match here (as we can't align like more than one
627    * sequence).
628    */
629  8 SequenceI alignFrom = null;
630  8 AlignedCodonFrame mapping = null;
631  8 for (AlignedCodonFrame mp : mappings)
632    {
633  8 alignFrom = mp.findAlignedSequence(seq, al);
634  8 if (alignFrom != null)
635    {
636  4 mapping = mp;
637  4 break;
638    }
639    }
640   
641  8 if (alignFrom == null)
642    {
643  4 return false;
644    }
645  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
646    preserveMappedGaps, preserveUnmappedGaps);
647  4 return true;
648    }
649   
650    /**
651    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
652    * match residues and codons. Flags control whether existing gaps in unmapped
653    * (intron) and mapped (exon) regions are preserved or not. Gaps between
654    * intron and exon are only retained if both flags are set.
655    *
656    * @param alignTo
657    * @param alignFrom
658    * @param mapping
659    * @param myGap
660    * @param sourceGap
661    * @param preserveUnmappedGaps
662    * @param preserveMappedGaps
663    */
 
664  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
665    AlignedCodonFrame mapping, String myGap, char sourceGap,
666    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
667    {
668    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
669   
670    // aligned and dataset sequence positions, all base zero
671  19 int thisSeqPos = 0;
672  19 int sourceDsPos = 0;
673   
674  19 int basesWritten = 0;
675  19 char myGapChar = myGap.charAt(0);
676  19 int ratio = myGap.length();
677   
678  19 int fromOffset = alignFrom.getStart() - 1;
679  19 int toOffset = alignTo.getStart() - 1;
680  19 int sourceGapMappedLength = 0;
681  19 boolean inExon = false;
682  19 final int toLength = alignTo.getLength();
683  19 final int fromLength = alignFrom.getLength();
684  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
685   
686    /*
687    * Traverse the 'model' aligned sequence
688    */
689  205 for (int i = 0; i < fromLength; i++)
690    {
691  186 char sourceChar = alignFrom.getCharAt(i);
692  186 if (sourceChar == sourceGap)
693    {
694  44 sourceGapMappedLength += ratio;
695  44 continue;
696    }
697   
698    /*
699    * Found a non-gap character. Locate its mapped region if any.
700    */
701  142 sourceDsPos++;
702    // Note mapping positions are base 1, our sequence positions base 0
703  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
704    sourceDsPos + fromOffset);
705  142 if (mappedPos == null)
706    {
707    /*
708    * unmapped position; treat like a gap
709    */
710  94 sourceGapMappedLength += ratio;
711    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
712    // residue "
713    // + sourceDsPos + "(" + sourceChar + ")");
714    // return;
715  94 continue;
716    }
717   
718  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
719  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
720  48 StringBuilder trailingCopiedGap = new StringBuilder();
721   
722    /*
723    * Copy dna sequence up to and including this codon. Optionally, include
724    * gaps before the codon starts (in introns) and/or after the codon starts
725    * (in exons).
726    *
727    * Note this only works for 'linear' splicing, not reverse or interleaved.
728    * But then 'align dna as protein' doesn't make much sense otherwise.
729    */
730  48 int intronLength = 0;
731  294 while (basesWritten + toOffset < mappedCodonEnd
732    && thisSeqPos < toLength)
733    {
734  246 final char c = alignTo.getCharAt(thisSeqPos++);
735  246 if (c != myGapChar)
736    {
737  146 basesWritten++;
738  146 int sourcePosition = basesWritten + toOffset;
739  146 if (sourcePosition < mappedCodonStart)
740    {
741    /*
742    * Found an unmapped (intron) base. First add in any preceding gaps
743    * (if wanted).
744    */
745  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
746    {
747  17 thisAligned.append(trailingCopiedGap.toString());
748  17 intronLength += trailingCopiedGap.length();
749  17 trailingCopiedGap = new StringBuilder();
750    }
751  48 intronLength++;
752  48 inExon = false;
753    }
754    else
755    {
756  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
757  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
758    preserveUnmappedGaps, sourceGapMappedLength, inExon,
759    trailingCopiedGap.length(), intronLength, startOfCodon);
760  215 for (int k = 0; k < gapsToAdd; k++)
761    {
762  117 thisAligned.append(myGapChar);
763    }
764  98 sourceGapMappedLength = 0;
765  98 inExon = true;
766    }
767  146 thisAligned.append(c);
768  146 trailingCopiedGap = new StringBuilder();
769    }
770    else
771    {
772  100 if (inExon && preserveMappedGaps)
773    {
774  32 trailingCopiedGap.append(myGapChar);
775    }
776  68 else if (!inExon && preserveUnmappedGaps)
777    {
778  27 trailingCopiedGap.append(myGapChar);
779    }
780    }
781    }
782    }
783   
784    /*
785    * At end of model aligned sequence. Copy any remaining target sequence, optionally
786    * including (intron) gaps.
787    */
788  129 while (thisSeqPos < toLength)
789    {
790  110 final char c = alignTo.getCharAt(thisSeqPos++);
791  110 if (c != myGapChar || preserveUnmappedGaps)
792    {
793  102 thisAligned.append(c);
794    }
795  110 sourceGapMappedLength--;
796    }
797   
798    /*
799    * finally add gaps to pad for any trailing source gaps or
800    * unmapped characters
801    */
802  19 if (preserveUnmappedGaps)
803    {
804  24 while (sourceGapMappedLength > 0)
805    {
806  12 thisAligned.append(myGapChar);
807  12 sourceGapMappedLength--;
808    }
809    }
810   
811    /*
812    * All done aligning, set the aligned sequence.
813    */
814  19 alignTo.setSequence(new String(thisAligned));
815    }
816   
817    /**
818    * Helper method to work out how many gaps to insert when realigning.
819    *
820    * @param preserveMappedGaps
821    * @param preserveUnmappedGaps
822    * @param sourceGapMappedLength
823    * @param inExon
824    * @param trailingCopiedGap
825    * @param intronLength
826    * @param startOfCodon
827    * @return
828    */
 
829  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
830    boolean preserveUnmappedGaps, int sourceGapMappedLength,
831    boolean inExon, int trailingGapLength, int intronLength,
832    final boolean startOfCodon)
833    {
834  98 int gapsToAdd = 0;
835  98 if (startOfCodon)
836    {
837    /*
838    * Reached start of codon. Ignore trailing gaps in intron unless we are
839    * preserving gaps in both exon and intron. Ignore them anyway if the
840    * protein alignment introduces a gap at least as large as the intronic
841    * region.
842    */
843  40 if (inExon && !preserveMappedGaps)
844    {
845  4 trailingGapLength = 0;
846    }
847  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
848    {
849  19 trailingGapLength = 0;
850    }
851  40 if (inExon)
852    {
853  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
854    }
855    else
856    {
857  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
858    {
859  20 gapsToAdd = sourceGapMappedLength - intronLength;
860    }
861    else
862    {
863  6 gapsToAdd = Math.min(
864    intronLength + trailingGapLength - sourceGapMappedLength,
865    trailingGapLength);
866    }
867    }
868    }
869    else
870    {
871    /*
872    * second or third base of codon; check for any gaps in dna
873    */
874  58 if (!preserveMappedGaps)
875    {
876  32 trailingGapLength = 0;
877    }
878  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
879    }
880  98 return gapsToAdd;
881    }
882   
883    /**
884    * Realigns the given protein to match the alignment of the dna, using codon
885    * mappings to translate aligned codon positions to protein residues.
886    *
887    * @param protein
888    * the alignment whose sequences are realigned by this method
889    * @param dna
890    * the dna alignment whose alignment we are 'copying'
891    * @return the number of sequences that were realigned
892    */
 
893  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
894    {
895  5 if (protein.isNucleotide() || !dna.isNucleotide())
896    {
897  0 jalview.bin.Console
898    .errPrintln("Wrong alignment type in alignProteinAsDna");
899  0 return 0;
900    }
901  5 List<SequenceI> unmappedProtein = new ArrayList<>();
902  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
903    protein, dna, unmappedProtein);
904  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
905    }
906   
907    /**
908    * Realigns the given dna to match the alignment of the protein, using codon
909    * mappings to translate aligned peptide positions to codons.
910    *
911    * Always produces a padded CDS alignment.
912    *
913    * @param dna
914    * the alignment whose sequences are realigned by this method
915    * @param protein
916    * the protein alignment whose alignment we are 'copying'
917    * @return the number of sequences that were realigned
918    */
 
919  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
920    {
921  4 if (protein.isNucleotide() || !dna.isNucleotide())
922    {
923  0 jalview.bin.Console
924    .errPrintln("Wrong alignment type in alignProteinAsDna");
925  0 return 0;
926    }
927    // todo: implement this
928  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
929  4 int alignedCount = 0;
930  4 int width = 0; // alignment width for padding CDS
931  4 for (SequenceI dnaSeq : dna.getSequences())
932    {
933  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
934    dna.getGapCharacter()))
935    {
936  5 alignedCount++;
937    }
938  5 width = Math.max(dnaSeq.getLength(), width);
939    }
940  4 int oldwidth;
941  4 int diff;
942  4 for (SequenceI dnaSeq : dna.getSequences())
943    {
944  5 oldwidth = dnaSeq.getLength();
945  5 diff = width - oldwidth;
946  5 if (diff > 0)
947    {
948  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
949    }
950    }
951  4 return alignedCount;
952    }
953   
954    /**
955    * Helper method to align (if possible) the dna sequence to match the
956    * alignment of a mapped protein sequence. This is currently limited to
957    * handling coding sequence only.
958    *
959    * @param cdsSeq
960    * @param protein
961    * @param mappings
962    * @param gapChar
963    * @return
964    */
 
965  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
966    AlignmentI protein, List<AlignedCodonFrame> mappings,
967    char gapChar)
968    {
969  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
970  5 if (cdsDss == null)
971    {
972  0 System.err
973    .println("alignCdsSequenceAsProtein needs aligned sequence!");
974  0 return false;
975    }
976   
977  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
978    .findMappingsForSequence(cdsSeq, mappings);
979  5 for (AlignedCodonFrame mapping : dnaMappings)
980    {
981  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
982  5 if (peptide != null)
983    {
984  5 final int peptideLength = peptide.getLength();
985  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
986  5 if (map != null)
987    {
988  5 MapList mapList = map.getMap();
989  5 if (map.getTo() == peptide.getDatasetSequence())
990    {
991  5 mapList = mapList.getInverse();
992    }
993  5 final int cdsLength = cdsDss.getLength();
994  5 int mappedFromLength = MappingUtils
995    .getLength(mapList.getFromRanges());
996  5 int mappedToLength = MappingUtils
997    .getLength(mapList.getToRanges());
998  5 boolean addStopCodon = (cdsLength == mappedFromLength
999    * CODON_LENGTH + CODON_LENGTH)
1000    || (peptide.getDatasetSequence()
1001    .getLength() == mappedFromLength - 1);
1002  5 if (cdsLength != mappedToLength && !addStopCodon)
1003    {
1004  0 jalview.bin.Console.errPrintln(String.format(
1005    "Can't align cds as protein (length mismatch %d/%d): %s",
1006    cdsLength, mappedToLength, cdsSeq.getName()));
1007    }
1008   
1009    /*
1010    * pre-fill the aligned cds sequence with gaps
1011    */
1012  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1013  5 + (addStopCodon ? CODON_LENGTH : 0)];
1014  5 Arrays.fill(alignedCds, gapChar);
1015   
1016    /*
1017    * walk over the aligned peptide sequence and insert mapped
1018    * codons for residues in the aligned cds sequence
1019    */
1020  5 int copiedBases = 0;
1021  5 int cdsStart = cdsDss.getStart();
1022  5 int proteinPos = peptide.getStart() - 1;
1023  5 int cdsCol = 0;
1024   
1025  33 for (int col = 0; col < peptideLength; col++)
1026    {
1027  28 char residue = peptide.getCharAt(col);
1028   
1029  28 if (Comparison.isGap(residue))
1030    {
1031  13 cdsCol += CODON_LENGTH;
1032    }
1033    else
1034    {
1035  15 proteinPos++;
1036  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1037  15 if (codon == null)
1038    {
1039    // e.g. incomplete start codon, X in peptide
1040  0 cdsCol += CODON_LENGTH;
1041    }
1042    else
1043    {
1044  60 for (int j = codon[0]; j <= codon[1]; j++)
1045    {
1046  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1047  45 alignedCds[cdsCol++] = mappedBase;
1048  45 copiedBases++;
1049    }
1050    }
1051    }
1052    }
1053   
1054    /*
1055    * append stop codon if not mapped from protein,
1056    * closing it up to the end of the mapped sequence
1057    */
1058  5 if (copiedBases == cdsLength - CODON_LENGTH)
1059    {
1060  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1061    {
1062  0 if (!Comparison.isGap(alignedCds[i]))
1063    {
1064  0 cdsCol = i + 1; // gap just after end of sequence
1065  0 break;
1066    }
1067    }
1068  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1069    {
1070  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1071    }
1072    }
1073  5 cdsSeq.setSequence(new String(alignedCds));
1074  5 return true;
1075    }
1076    }
1077    }
1078  0 return false;
1079    }
1080   
1081    /**
1082    * Builds a map whose key is an aligned codon position (3 alignment column
1083    * numbers base 0), and whose value is a map from protein sequence to each
1084    * protein's peptide residue for that codon. The map generates an ordering of
1085    * the codons, and allows us to read off the peptides at each position in
1086    * order to assemble 'aligned' protein sequences.
1087    *
1088    * @param protein
1089    * the protein alignment
1090    * @param dna
1091    * the coding dna alignment
1092    * @param unmappedProtein
1093    * any unmapped proteins are added to this list
1094    * @return
1095    */
 
1096  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1097    AlignmentI protein, AlignmentI dna,
1098    List<SequenceI> unmappedProtein)
1099    {
1100    /*
1101    * maintain a list of any proteins with no mappings - these will be
1102    * rendered 'as is' in the protein alignment as we can't align them
1103    */
1104  5 unmappedProtein.addAll(protein.getSequences());
1105   
1106  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1107   
1108    /*
1109    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1110    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1111    * comparator keeps the codon positions ordered.
1112    */
1113  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1114    new CodonComparator());
1115   
1116  5 for (SequenceI dnaSeq : dna.getSequences())
1117    {
1118  30 for (AlignedCodonFrame mapping : mappings)
1119    {
1120  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1121  516 if (prot != null)
1122    {
1123  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1124  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1125    alignedCodons);
1126  32 unmappedProtein.remove(prot);
1127    }
1128    }
1129    }
1130   
1131    /*
1132    * Finally add any unmapped peptide start residues (e.g. for incomplete
1133    * codons) as if at the codon position before the second residue
1134    */
1135    // TODO resolve JAL-2022 so this fudge can be removed
1136  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1137  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1138   
1139  5 return alignedCodons;
1140    }
1141   
1142    /**
1143    * Scans for any protein mapped from position 2 (meaning unmapped start
1144    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1145    * preceding position in the alignment
1146    *
1147    * @param alignedCodons
1148    * the codon-to-peptide map
1149    * @param mappedSequenceCount
1150    * the number of distinct sequences in the map
1151    */
 
1152  5 toggle protected static void addUnmappedPeptideStarts(
1153    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1154    int mappedSequenceCount)
1155    {
1156    // TODO delete this ugly hack once JAL-2022 is resolved
1157    // i.e. we can model startPhase > 0 (incomplete start codon)
1158   
1159  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1160  5 AlignedCodon lastCodon = null;
1161  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1162   
1163  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1164    .entrySet())
1165    {
1166  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1167    .entrySet())
1168    {
1169  10671 SequenceI seq = sequenceCodon.getKey();
1170  10671 if (sequencesChecked.contains(seq))
1171    {
1172  10641 continue;
1173    }
1174  30 sequencesChecked.add(seq);
1175  30 AlignedCodon codon = sequenceCodon.getValue();
1176  30 if (codon.peptideCol > 1)
1177    {
1178  0 jalview.bin.Console.errPrintln(
1179    "Problem mapping protein with >1 unmapped start positions: "
1180    + seq.getName());
1181    }
1182  30 else if (codon.peptideCol == 1)
1183    {
1184    /*
1185    * first position (peptideCol == 0) was unmapped - add it
1186    */
1187  6 if (lastCodon != null)
1188    {
1189  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1190    lastCodon.pos2, lastCodon.pos3,
1191    String.valueOf(seq.getCharAt(0)), 0);
1192  5 toAdd.put(seq, firstPeptide);
1193    }
1194    else
1195    {
1196    /*
1197    * unmapped residue at start of alignment (no prior column) -
1198    * 'insert' at nominal codon [0, 0, 0]
1199    */
1200  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1201    String.valueOf(seq.getCharAt(0)), 0);
1202  1 toAdd.put(seq, firstPeptide);
1203    }
1204    }
1205  30 if (sequencesChecked.size() == mappedSequenceCount)
1206    {
1207    // no need to check past first mapped position in all sequences
1208  5 break;
1209    }
1210    }
1211  1913 lastCodon = entry.getKey();
1212    }
1213   
1214    /*
1215    * add any new codons safely after iterating over the map
1216    */
1217  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1218    {
1219  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1220    startCodon.getKey());
1221    }
1222    }
1223   
1224    /**
1225    * Update the aligned protein sequences to match the codon alignments given in
1226    * the map.
1227    *
1228    * @param protein
1229    * @param alignedCodons
1230    * an ordered map of codon positions (columns), with sequence/peptide
1231    * values present in each column
1232    * @param unmappedProtein
1233    * @return
1234    */
 
1235  5 toggle protected static int alignProteinAs(AlignmentI protein,
1236    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1237    List<SequenceI> unmappedProtein)
1238    {
1239    /*
1240    * prefill peptide sequences with gaps
1241    */
1242  5 int alignedWidth = alignedCodons.size();
1243  5 char[] gaps = new char[alignedWidth];
1244  5 Arrays.fill(gaps, protein.getGapCharacter());
1245  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1246  5 for (SequenceI seq : protein.getSequences())
1247    {
1248  31 if (!unmappedProtein.contains(seq))
1249    {
1250  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1251    }
1252    }
1253   
1254    /*
1255    * Traverse the codons left to right (as defined by CodonComparator)
1256    * and insert peptides in each column where the sequence is mapped.
1257    * This gives a peptide 'alignment' where residues are aligned if their
1258    * corresponding codons occupy the same columns in the cdna alignment.
1259    */
1260  5 int column = 0;
1261  5 for (AlignedCodon codon : alignedCodons.keySet())
1262    {
1263  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1264    .get(codon);
1265  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1266    {
1267  10682 char residue = entry.getValue().product.charAt(0);
1268  10682 peptides.get(entry.getKey())[column] = residue;
1269    }
1270  1914 column++;
1271    }
1272   
1273    /*
1274    * and finally set the constructed sequences
1275    */
1276  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1277    {
1278  30 entry.getKey().setSequence(new String(entry.getValue()));
1279    }
1280   
1281  5 return 0;
1282    }
1283   
1284    /**
1285    * Populate the map of aligned codons by traversing the given sequence
1286    * mapping, locating the aligned positions of mapped codons, and adding those
1287    * positions and their translation products to the map.
1288    *
1289    * @param dna
1290    * the aligned sequence we are mapping from
1291    * @param protein
1292    * the sequence to be aligned to the codons
1293    * @param gapChar
1294    * the gap character in the dna sequence
1295    * @param seqMap
1296    * a mapping to a sequence translation
1297    * @param alignedCodons
1298    * the map we are building up
1299    */
 
1300  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1301    char gapChar, Mapping seqMap,
1302    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1303    {
1304  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1305   
1306    /*
1307    * add codon positions, and their peptide translations, to the alignment
1308    * map, while remembering the first codon mapped
1309    */
1310  10716 while (codons.hasNext())
1311    {
1312  10684 try
1313    {
1314  10684 AlignedCodon codon = codons.next();
1315  10684 addCodonToMap(alignedCodons, codon, protein);
1316    } catch (IncompleteCodonException e)
1317    {
1318    // possible incomplete trailing codon - ignore
1319    } catch (NoSuchElementException e)
1320    {
1321    // possibly peptide lacking STOP
1322    }
1323    }
1324    }
1325   
1326    /**
1327    * Helper method to add a codon-to-peptide entry to the aligned codons map
1328    *
1329    * @param alignedCodons
1330    * @param codon
1331    * @param protein
1332    */
 
1333  10690 toggle protected static void addCodonToMap(
1334    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1335    AlignedCodon codon, SequenceI protein)
1336    {
1337  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1338  10690 if (seqProduct == null)
1339    {
1340  1914 seqProduct = new HashMap<>();
1341  1914 alignedCodons.put(codon, seqProduct);
1342    }
1343  10690 seqProduct.put(protein, codon);
1344    }
1345   
1346    /**
1347    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1348    * between at least one pair of sequences in the two alignments. Currently,
1349    * the logic is:
1350    * <ul>
1351    * <li>One alignment must be nucleotide, and the other protein</li>
1352    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1353    * <li>Mappable means the nucleotide translation matches the protein
1354    * sequence</li>
1355    * <li>The translation may ignore start and stop codons if present in the
1356    * nucleotide</li>
1357    * </ul>
1358    *
1359    * @param al1
1360    * @param al2
1361    * @return
1362    */
 
1363  28 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1364    {
1365  28 if (al1 == null || al2 == null)
1366    {
1367  3 return false;
1368    }
1369   
1370    /*
1371    * Require one nucleotide and one protein
1372    */
1373  25 if (al1.isNucleotide() == al2.isNucleotide())
1374    {
1375  14 return false;
1376    }
1377  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1378  11 AlignmentI protein = dna == al1 ? al2 : al1;
1379  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1380  11 for (SequenceI dnaSeq : dna.getSequences())
1381    {
1382  12 for (SequenceI proteinSeq : protein.getSequences())
1383    {
1384  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1385    {
1386  2 return true;
1387    }
1388    }
1389    }
1390  9 return false;
1391    }
1392   
1393    /**
1394    * Returns true if the dna sequence is mapped, or could be mapped, to the
1395    * protein sequence.
1396    *
1397    * @param dnaSeq
1398    * @param proteinSeq
1399    * @param mappings
1400    * @return
1401    */
 
1402  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1403    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1404    {
1405  12 if (dnaSeq == null || proteinSeq == null)
1406    {
1407  0 return false;
1408    }
1409   
1410  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1411    : dnaSeq.getDatasetSequence();
1412  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1413    ? proteinSeq
1414    : proteinSeq.getDatasetSequence();
1415   
1416  12 for (AlignedCodonFrame mapping : mappings)
1417    {
1418  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1419    {
1420    /*
1421    * already mapped
1422    */
1423  0 return true;
1424    }
1425    }
1426   
1427    /*
1428    * Just try to make a mapping (it is not yet stored), test whether
1429    * successful.
1430    */
1431  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1432    }
1433   
1434    /**
1435    * Finds any reference annotations associated with the sequences in
1436    * sequenceScope, that are not already added to the alignment, and adds them
1437    * to the 'candidates' map. Also populates a lookup table of annotation
1438    * labels, keyed by calcId, for use in constructing tooltips or the like.
1439    *
1440    * @param sequenceScope
1441    * the sequences to scan for reference annotations
1442    * @param labelForCalcId
1443    * (optional) map to populate with label for calcId
1444    * @param candidates
1445    * map to populate with annotations for sequence
1446    * @param al
1447    * the alignment to check for presence of annotations
1448    */
 
1449  86 toggle public static void findAddableReferenceAnnotations(
1450    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1451    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1452    AlignmentI al)
1453    {
1454  86 if (sequenceScope == null)
1455    {
1456  1 return;
1457    }
1458   
1459    /*
1460    * For each sequence in scope, make a list of any annotations on the
1461    * underlying dataset sequence which are not already on the alignment.
1462    *
1463    * Add to a map of { alignmentSequence, <List of annotations to add> }
1464    */
1465  85 for (SequenceI seq : sequenceScope)
1466    {
1467  82 SequenceI dataset = seq.getDatasetSequence();
1468  82 if (dataset == null)
1469    {
1470  0 continue;
1471    }
1472  82 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1473  82 if (datasetAnnotations == null)
1474    {
1475  35 continue;
1476    }
1477  47 final List<AlignmentAnnotation> result = new ArrayList<>();
1478  47 for (AlignmentAnnotation dsann : datasetAnnotations)
1479    {
1480    /*
1481    * Find matching annotations on the alignment. If none is found, then
1482    * add this annotation to the list of 'addable' annotations for this
1483    * sequence.
1484    */
1485  155 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1486    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1487  155 boolean found = false;
1488  155 if (matchedAlignmentAnnotations != null)
1489    {
1490  152 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1491    {
1492  135 if (dsann.description.equals(matched.description))
1493    {
1494  54 found = true;
1495  54 break;
1496    }
1497    }
1498    }
1499  155 if (!found)
1500    {
1501  101 result.add(dsann);
1502  101 if (labelForCalcId != null)
1503    {
1504  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1505    }
1506    }
1507    }
1508    /*
1509    * Save any addable annotations for this sequence
1510    */
1511  47 if (!result.isEmpty())
1512    {
1513  43 candidates.put(seq, result);
1514    }
1515    }
1516    }
1517   
1518    /**
1519    * Adds annotations to the top of the alignment annotations, in the same order
1520    * as their related sequences. If you already have an annotation and want to
1521    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1522    *
1523    * @param annotations
1524    * the annotations to add
1525    * @param alignment
1526    * the alignment to add them to
1527    * @param selectionGroup
1528    * current selection group - may be null, if provided then any added
1529    * annotation will be trimmed to just those columns in the selection
1530    * group
1531    */
 
1532  40 toggle public static void addReferenceAnnotations(
1533    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1534    final AlignmentI alignment, final SequenceGroup selectionGroup)
1535    {
1536  40 for (SequenceI seq : annotations.keySet())
1537    {
1538  39 for (AlignmentAnnotation ann : annotations.get(seq))
1539    {
1540  95 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1541    }
1542    }
1543    }
1544   
 
1545  0 toggle public static boolean isSSAnnotationPresent(
1546    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1547    {
1548   
1549  0 for (SequenceI seq : annotations.keySet())
1550    {
1551  0 if (isSecondaryStructurePresent(
1552    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1553    {
1554  0 return true;
1555    }
1556    }
1557  0 return false;
1558    }
1559   
1560    /**
1561    * Make a copy of a reference annotation {@code ann} and add it to an
1562    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1563    * the extent of {@code selectionGroup}
1564    *
1565    * @param alignment
1566    * @param seq
1567    * @param ann
1568    * @param selectionGroup
1569    * current selection group - may be null, if provided then any added
1570    * annotation will be trimmed to just those columns in the selection
1571    * group
1572    * @return annotation added to {@code seq and {@code alignment}
1573    */
 
1574  99 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1575    final AlignmentI alignment, final SequenceI seq,
1576    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1577    {
1578  99 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1579  99 int startRes = 0;
1580  99 int endRes = ann.annotations.length;
1581  99 if (selectionGroup != null)
1582    {
1583  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1584    seq.findPosition(selectionGroup.getStartRes())));
1585  2 endRes = -1 + Math.min(seq.getEnd(),
1586    seq.findPosition(selectionGroup.getEndRes()));
1587   
1588    }
1589  99 copyAnn.restrict(startRes, endRes + 0);
1590   
1591    /*
1592    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1593    * original annotation is already on the sequence.
1594    */
1595  99 if (!seq.hasAnnotation(ann))
1596    {
1597  99 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1598  99 if (cm != null)
1599    {
1600  25 seq.addContactListFor(copyAnn, cm);
1601    }
1602  99 seq.addAlignmentAnnotation(copyAnn);
1603    }
1604    // adjust for gaps
1605  99 copyAnn.adjustForAlignment();
1606    // add to the alignment and set visible
1607  99 alignment.addAnnotation(copyAnn);
1608  99 copyAnn.visible = true;
1609   
1610  99 return copyAnn;
1611    }
1612   
1613    /**
1614    * Set visibility of alignment annotations of specified types (labels), for
1615    * specified sequences. This supports controls like "Show all secondary
1616    * structure", "Hide all Temp factor", etc.
1617    *
1618    * @al the alignment to scan for annotations
1619    * @param types
1620    * the types (labels) of annotations to be updated
1621    * @param forSequences
1622    * if not null, only annotations linked to one of these sequences are
1623    * in scope for update; if null, acts on all sequence annotations
1624    * @param anyType
1625    * if this flag is true, 'types' is ignored (label not checked)
1626    * @param doShow
1627    * if true, set visibility on, else set off
1628    */
 
1629  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1630    Collection<String> types, List<SequenceI> forSequences,
1631    boolean anyType, boolean doShow)
1632    {
1633  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1634  5 if (anns != null)
1635    {
1636  5 for (AlignmentAnnotation aa : anns)
1637    {
1638  30 if (anyType || types.contains(aa.label))
1639    {
1640  21 if ((aa.sequenceRef != null) && (forSequences == null
1641    || forSequences.contains(aa.sequenceRef)))
1642    {
1643  11 aa.visible = doShow;
1644    }
1645    }
1646    }
1647    }
1648    }
1649   
 
1650  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1651    AlignmentI al, int graphType)
1652    {
1653  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1654  0 if (anns != null)
1655    {
1656  0 for (AlignmentAnnotation aa : anns)
1657    {
1658  0 if (aa.sequenceRef != null && aa.graph == graphType)
1659  0 return aa;
1660    }
1661    }
1662  0 return null;
1663    }
1664   
1665    /**
1666    * Returns true if either sequence has a cross-reference to the other
1667    *
1668    * @param seq1
1669    * @param seq2
1670    * @return
1671    */
 
1672  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1673    {
1674    // Note: moved here from class CrossRef as the latter class has dependencies
1675    // not availability to the applet's classpath
1676  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1677    }
1678   
1679    /**
1680    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1681    * that sequence name is structured as Source|AccessionId.
1682    *
1683    * @param seq1
1684    * @param seq2
1685    * @return
1686    */
 
1687  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1688    {
1689  108 if (seq1 == null || seq2 == null)
1690    {
1691  8 return false;
1692    }
1693  100 String name = seq2.getName();
1694  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1695  100 if (xrefs != null)
1696    {
1697  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1698    {
1699  24 DBRefEntry xref = xrefs.get(ix);
1700  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1701    // case-insensitive test, consistent with DBRefEntry.equalRef()
1702  24 if (xrefName.equalsIgnoreCase(name))
1703    {
1704  12 return true;
1705    }
1706    }
1707    }
1708  88 return false;
1709    }
1710   
1711    /**
1712    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1713    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1714    * added to the original alignment's dataset, which is shared by the new
1715    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1716    * added to the alignment dataset.
1717    *
1718    * @param dna
1719    * aligned nucleotide (dna or cds) sequences
1720    * @param dataset
1721    * the alignment dataset the sequences belong to
1722    * @param products
1723    * (optional) to restrict results to CDS that map to specified
1724    * protein products
1725    * @return an alignment whose sequences are the cds-only parts of the dna
1726    * sequences (or null if no mappings are found)
1727    */
 
1728  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1729    AlignmentI dataset, SequenceI[] products)
1730    {
1731  6 if (dataset == null || dataset.getDataset() != null)
1732    {
1733  0 throw new IllegalArgumentException(
1734    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1735    }
1736  6 List<SequenceI> foundSeqs = new ArrayList<>();
1737  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1738  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1739  6 HashSet<SequenceI> productSeqs = null;
1740  6 if (products != null)
1741    {
1742  3 productSeqs = new HashSet<>();
1743  3 for (SequenceI seq : products)
1744    {
1745  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1746    : seq.getDatasetSequence());
1747    }
1748    }
1749   
1750    /*
1751    * Construct CDS sequences from mappings on the alignment dataset.
1752    * The logic is:
1753    * - find the protein product(s) mapped to from each dna sequence
1754    * - if the mapping covers the whole dna sequence (give or take start/stop
1755    * codon), take the dna as the CDS sequence
1756    * - else search dataset mappings for a suitable dna sequence, i.e. one
1757    * whose whole sequence is mapped to the protein
1758    * - if no sequence found, construct one from the dna sequence and mapping
1759    * (and add it to dataset so it is found if this is repeated)
1760    */
1761  6 for (SequenceI dnaSeq : dna)
1762    {
1763  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1764    : dnaSeq.getDatasetSequence();
1765   
1766  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1767    .findMappingsForSequence(dnaSeq, mappings);
1768  52 for (AlignedCodonFrame mapping : seqMappings)
1769    {
1770  42 List<Mapping> mappingsFromSequence = mapping
1771    .getMappingsFromSequence(dnaSeq);
1772   
1773  42 for (Mapping aMapping : mappingsFromSequence)
1774    {
1775  44 MapList mapList = aMapping.getMap();
1776  44 if (mapList.getFromRatio() == 1)
1777    {
1778    /*
1779    * not a dna-to-protein mapping (likely dna-to-cds)
1780    */
1781  11 continue;
1782    }
1783   
1784    /*
1785    * skip if mapping is not to one of the target set of proteins
1786    */
1787  33 SequenceI proteinProduct = aMapping.getTo();
1788  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1789    {
1790  2 continue;
1791    }
1792   
1793    /*
1794    * try to locate the CDS from the dataset mappings;
1795    * guard against duplicate results (for the case that protein has
1796    * dbrefs to both dna and cds sequences)
1797    */
1798  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1799    seqMappings, aMapping);
1800  31 if (cdsSeq != null)
1801    {
1802  11 if (!foundSeqs.contains(cdsSeq))
1803    {
1804  11 foundSeqs.add(cdsSeq);
1805  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1806  11 cdsSeqs.add(derivedSequence);
1807  11 if (!dataset.getSequences().contains(cdsSeq))
1808    {
1809  0 dataset.addSequence(cdsSeq);
1810    }
1811    }
1812  11 continue;
1813    }
1814   
1815    /*
1816    * didn't find mapped CDS sequence - construct it and add
1817    * its dataset sequence to the dataset
1818    */
1819  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1820    dataset).deriveSequence();
1821    // cdsSeq has a name constructed as CDS|<dbref>
1822    // <dbref> will be either the accession for the coding sequence,
1823    // marked in the /via/ dbref to the protein product accession
1824    // or it will be the original nucleotide accession.
1825  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1826   
1827  20 cdsSeqs.add(cdsSeq);
1828   
1829    /*
1830    * build the mapping from CDS to protein
1831    */
1832  20 List<int[]> cdsRange = Collections
1833    .singletonList(new int[]
1834    { cdsSeq.getStart(),
1835    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1836  20 MapList cdsToProteinMap = new MapList(cdsRange,
1837    mapList.getToRanges(), mapList.getFromRatio(),
1838    mapList.getToRatio());
1839   
1840  20 if (!dataset.getSequences().contains(cdsSeqDss))
1841    {
1842    /*
1843    * if this sequence is a newly created one, add it to the dataset
1844    * and made a CDS to protein mapping (if sequence already exists,
1845    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1846    */
1847  20 dataset.addSequence(cdsSeqDss);
1848  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1849  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1850    cdsToProteinMap);
1851   
1852    /*
1853    * guard against duplicating the mapping if repeating this action
1854    */
1855  20 if (!mappings.contains(cdsToProteinMapping))
1856    {
1857  20 mappings.add(cdsToProteinMapping);
1858    }
1859    }
1860   
1861  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1862    proteinProduct, aMapping);
1863    /*
1864    * add another mapping from original 'from' range to CDS
1865    */
1866  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1867  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1868    cdsRange, 1, 1);
1869  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1870    dnaToCdsMap);
1871  20 if (!mappings.contains(dnaToCdsMapping))
1872    {
1873  20 mappings.add(dnaToCdsMapping);
1874    }
1875   
1876    /*
1877    * transfer dna chromosomal loci (if known) to the CDS
1878    * sequence (via the mapping)
1879    */
1880  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1881  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1882   
1883    /*
1884    * add DBRef with mapping from protein to CDS
1885    * (this enables Get Cross-References from protein alignment)
1886    * This is tricky because we can't have two DBRefs with the
1887    * same source and accession, so need a different accession for
1888    * the CDS from the dna sequence
1889    */
1890   
1891    // specific use case:
1892    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1893    // ENSG02, ENSG03, with transcripts and products similarly named.
1894    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1895   
1896    // JBPNote: ?? can't actually create an example that demonstrates we
1897    // need to
1898    // synthesize an xref.
1899   
1900  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1901  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1902    {
1903  13 DBRefEntry primRef = primrefs.get(ip);
1904    /*
1905    * create a cross-reference from CDS to the source sequence's
1906    * primary reference and vice versa
1907    */
1908  13 String source = primRef.getSource();
1909  13 String version = primRef.getVersion();
1910  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1911    source + ":" + version, primRef.getAccessionId());
1912  13 cdsCrossRef
1913    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1914  13 cdsSeqDss.addDBRef(cdsCrossRef);
1915   
1916  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1917    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1918    // problem here is that the cross-reference is synthesized -
1919    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1920    // 'CDS|emblcdsacc'
1921    // assuming cds version same as dna ?!?
1922   
1923  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1924    cdsSeq.getName());
1925    //
1926  13 proteinToCdsRef.setMap(
1927    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1928  13 proteinProduct.addDBRef(proteinToCdsRef);
1929    }
1930    /*
1931    * transfer any features on dna that overlap the CDS
1932    */
1933  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1934    SequenceOntologyI.CDS);
1935    }
1936    }
1937    }
1938   
1939  6 AlignmentI cds = new Alignment(
1940    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1941  6 cds.setDataset(dataset);
1942   
1943  6 return cds;
1944    }
1945   
1946    /**
1947    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1948    * toSeq, mediated by the given mapping between the sequences
1949    *
1950    * @param fromSeq
1951    * @param targetToFrom
1952    * Map
1953    * @param targetSeq
1954    */
 
1955  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
1956    MapList targetToFrom, SequenceI targetSeq)
1957    {
1958  23 if (targetSeq.getGeneLoci() != null)
1959    {
1960    // already have - don't override
1961  1 return;
1962    }
1963  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
1964  22 if (fromLoci == null)
1965    {
1966  10 return;
1967    }
1968   
1969  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
1970   
1971  12 if (newMap != null)
1972    {
1973  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
1974    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
1975    }
1976    }
1977   
1978    /**
1979    * A helper method that finds a CDS sequence in the alignment dataset that is
1980    * mapped to the given protein sequence, and either is, or has a mapping from,
1981    * the given dna sequence.
1982    *
1983    * @param mappings
1984    * set of all mappings on the dataset
1985    * @param dnaSeq
1986    * a dna (or cds) sequence we are searching from
1987    * @param seqMappings
1988    * the set of mappings involving dnaSeq
1989    * @param aMapping
1990    * a transcript-to-peptide mapping
1991    * @return
1992    */
 
1993  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
1994    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
1995    Mapping aMapping)
1996    {
1997    /*
1998    * TODO a better dna-cds-protein mapping data representation to allow easy
1999    * navigation; until then this clunky looping around lists of mappings
2000    */
2001  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2002    : dnaSeq.getDatasetSequence();
2003  38 SequenceI proteinProduct = aMapping.getTo();
2004   
2005    /*
2006    * is this mapping from the whole dna sequence (i.e. CDS)?
2007    * allowing for possible stop codon on dna but not peptide
2008    */
2009  38 int mappedFromLength = MappingUtils
2010    .getLength(aMapping.getMap().getFromRanges());
2011  38 int dnaLength = seqDss.getLength();
2012  38 if (mappedFromLength == dnaLength
2013    || mappedFromLength == dnaLength - CODON_LENGTH)
2014    {
2015    /*
2016    * if sequence has CDS features, this is a transcript with no UTR
2017    * - do not take this as the CDS sequence! (JAL-2789)
2018    */
2019  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2020    .isEmpty())
2021    {
2022  1 return seqDss;
2023    }
2024    }
2025   
2026    /*
2027    * looks like we found the dna-to-protein mapping; search for the
2028    * corresponding cds-to-protein mapping
2029    */
2030  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2031    .findMappingsForSequence(proteinProduct, mappings);
2032  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2033    {
2034  52 for (SequenceToSequenceMapping map : acf.getMappings())
2035    {
2036  276 Mapping mapping = map.getMapping();
2037  276 if (mapping != aMapping
2038    && mapping.getMap().getFromRatio() == CODON_LENGTH
2039    && proteinProduct == mapping.getTo()
2040    && seqDss != map.getFromSeq())
2041    {
2042  15 mappedFromLength = MappingUtils
2043    .getLength(mapping.getMap().getFromRanges());
2044  15 if (mappedFromLength == map.getFromSeq().getLength())
2045    {
2046    /*
2047    * found a 3:1 mapping to the protein product which covers
2048    * the whole dna sequence i.e. is from CDS; finally check the CDS
2049    * is mapped from the given dna start sequence
2050    */
2051  15 SequenceI cdsSeq = map.getFromSeq();
2052    // todo this test is weak if seqMappings contains multiple mappings;
2053    // we get away with it if transcript:cds relationship is 1:1
2054  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2055    .findMappingsForSequence(cdsSeq, seqMappings);
2056  15 if (!dnaToCdsMaps.isEmpty())
2057    {
2058  13 return cdsSeq;
2059    }
2060    }
2061    }
2062    }
2063    }
2064  24 return null;
2065    }
2066   
2067    /**
2068    * Helper method that makes a CDS sequence as defined by the mappings from the
2069    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2070    * forward or reverse strand).
2071    *
2072    * @param seq
2073    * @param mapping
2074    * @param dataset
2075    * - existing dataset. We check for sequences that look like the CDS
2076    * we are about to construct, if one exists already, then we will
2077    * just return that one.
2078    * @return CDS sequence (as a dataset sequence)
2079    */
 
2080  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2081    AlignmentI dataset)
2082    {
2083    /*
2084    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2085    * if set (e.g. EMBL protein_id), else sequence name appended
2086    */
2087  20 String mapFromId = mapping.getMappedFromId();
2088  20 final String seqId = "CDS|"
2089  20 + (mapFromId != null ? mapFromId : seq.getName());
2090   
2091  20 SequenceI newSeq = null;
2092   
2093    /*
2094    * construct CDS sequence by splicing mapped from ranges
2095    */
2096  20 char[] seqChars = seq.getSequence();
2097  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2098  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2099  20 char[] newSeqChars = new char[cdsWidth];
2100   
2101  20 int newPos = 0;
2102  20 for (int[] range : fromRanges)
2103    {
2104  32 if (range[0] <= range[1])
2105    {
2106    // forward strand mapping - just copy the range
2107  32 int length = range[1] - range[0] + 1;
2108  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2109    length);
2110  32 newPos += length;
2111    }
2112    else
2113    {
2114    // reverse strand mapping - copy and complement one by one
2115  0 for (int i = range[0]; i >= range[1]; i--)
2116    {
2117  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2118    }
2119    }
2120   
2121  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2122    }
2123   
2124  20 if (dataset != null)
2125    {
2126  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2127  20 if (matches != null)
2128    {
2129  20 boolean matched = false;
2130  20 for (SequenceI mtch : matches)
2131    {
2132  3 if (mtch.getStart() != newSeq.getStart())
2133    {
2134  0 continue;
2135    }
2136  3 if (mtch.getEnd() != newSeq.getEnd())
2137    {
2138  0 continue;
2139    }
2140  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2141    {
2142  3 continue;
2143    }
2144  0 if (!matched)
2145    {
2146  0 matched = true;
2147  0 newSeq = mtch;
2148    }
2149    else
2150    {
2151  0 Console.error(
2152    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2153    + mtch.toString());
2154    }
2155    }
2156    }
2157    }
2158    // newSeq.setDescription(mapFromId);
2159   
2160  20 return newSeq;
2161    }
2162   
2163    /**
2164    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2165    * the given mapping.
2166    *
2167    * @param cdsSeq
2168    * @param contig
2169    * @param proteinProduct
2170    * @param mapping
2171    * @return list of DBRefEntrys added
2172    */
 
2173  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2174    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2175    {
2176   
2177    // gather direct refs from contig congruent with mapping
2178  20 List<DBRefEntry> direct = new ArrayList<>();
2179  20 HashSet<String> directSources = new HashSet<>();
2180   
2181  20 List<DBRefEntry> refs = contig.getDBRefs();
2182  20 if (refs != null)
2183    {
2184  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2185    {
2186  279 DBRefEntry dbr = refs.get(ib);
2187  279 MapList map;
2188  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2189    {
2190    // check if map is the CDS mapping
2191  24 if (mapping.getMap().equals(map))
2192    {
2193  24 direct.add(dbr);
2194  24 directSources.add(dbr.getSource());
2195    }
2196    }
2197    }
2198    }
2199  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2200    proteinProduct.getDBRefs(),
2201    directSources.toArray(new String[0]));
2202  20 List<DBRefEntry> propagated = new ArrayList<>();
2203   
2204    // and generate appropriate mappings
2205  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2206    {
2207  24 DBRefEntry cdsref = direct.get(ic);
2208  24 Mapping m = cdsref.getMap();
2209    // clone maplist and mapping
2210  24 MapList cdsposmap = new MapList(
2211    Arrays.asList(new int[][]
2212    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2213    m.getMap().getToRanges(), 3, 1);
2214  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2215   
2216    // create dbref
2217  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2218    cdsref.getVersion(), cdsref.getAccessionId(),
2219    new Mapping(cdsmap.getTo(), cdsposmap));
2220   
2221    // and see if we can map to the protein product for this mapping.
2222    // onSource is the filtered set of accessions on protein that we are
2223    // tranferring, so we assume accession is the same.
2224  24 if (cdsmap.getTo() == null && onSource != null)
2225    {
2226  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2227    cdsref.getAccessionId());
2228  2 if (sourceRefs != null)
2229    {
2230  2 for (DBRefEntry srcref : sourceRefs)
2231    {
2232  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2233    {
2234    // we have found a complementary dbref on the protein product, so
2235    // update mapping's getTo
2236  2 newref.getMap().setTo(proteinProduct);
2237    }
2238    }
2239    }
2240    }
2241  24 cdsSeq.addDBRef(newref);
2242  24 propagated.add(newref);
2243    }
2244  20 return propagated;
2245    }
2246   
2247    /**
2248    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2249    * feature start/end ranges, optionally omitting specified feature types.
2250    * Returns the number of features copied.
2251    *
2252    * @param fromSeq
2253    * @param toSeq
2254    * @param mapping
2255    * the mapping from 'fromSeq' to 'toSeq'
2256    * @param select
2257    * if not null, only features of this type are copied (including
2258    * subtypes in the Sequence Ontology)
2259    * @param omitting
2260    */
 
2261  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2262    MapList mapping, String select, String... omitting)
2263    {
2264  23 SequenceI copyTo = toSeq;
2265  43 while (copyTo.getDatasetSequence() != null)
2266    {
2267  20 copyTo = copyTo.getDatasetSequence();
2268    }
2269  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2270    {
2271  0 return 0; // shared dataset sequence
2272    }
2273   
2274    /*
2275    * get features, optionally restricted by an ontology term
2276    */
2277  23 List<SequenceFeature> sfs = select == null
2278    ? fromSeq.getFeatures().getPositionalFeatures()
2279    : fromSeq.getFeatures().getFeaturesByOntology(select);
2280   
2281  23 int count = 0;
2282  23 for (SequenceFeature sf : sfs)
2283    {
2284  9610 String type = sf.getType();
2285  9610 boolean omit = false;
2286  9610 for (String toOmit : omitting)
2287    {
2288  9603 if (type.equals(toOmit))
2289    {
2290  134 omit = true;
2291    }
2292    }
2293  9610 if (omit)
2294    {
2295  134 continue;
2296    }
2297   
2298    /*
2299    * locate the mapped range - null if either start or end is
2300    * not mapped (no partial overlaps are calculated)
2301    */
2302  9476 int start = sf.getBegin();
2303  9476 int end = sf.getEnd();
2304  9476 int[] mappedTo = mapping.locateInTo(start, end);
2305    /*
2306    * if whole exon range doesn't map, try interpreting it
2307    * as 5' or 3' exon overlapping the CDS range
2308    */
2309  9476 if (mappedTo == null)
2310    {
2311  4447 mappedTo = mapping.locateInTo(end, end);
2312  4447 if (mappedTo != null)
2313    {
2314    /*
2315    * end of exon is in CDS range - 5' overlap
2316    * to a range from the start of the peptide
2317    */
2318  0 mappedTo[0] = 1;
2319    }
2320    }
2321  9476 if (mappedTo == null)
2322    {
2323  4447 mappedTo = mapping.locateInTo(start, start);
2324  4447 if (mappedTo != null)
2325    {
2326    /*
2327    * start of exon is in CDS range - 3' overlap
2328    * to a range up to the end of the peptide
2329    */
2330  0 mappedTo[1] = toSeq.getLength();
2331    }
2332    }
2333  9476 if (mappedTo != null)
2334    {
2335  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2336  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2337  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2338    sf.getFeatureGroup(), sf.getScore());
2339  5029 copyTo.addSequenceFeature(copy);
2340  5029 count++;
2341    }
2342    }
2343  23 return count;
2344    }
2345   
2346    /**
2347    * Returns a mapping from dna to protein by inspecting sequence features of
2348    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2349    * length is 3 times the peptide length (optionally after dropping a trailing
2350    * stop codon). This method does not check whether the CDS nucleotide sequence
2351    * translates to the peptide sequence.
2352    *
2353    * @param dnaSeq
2354    * @param proteinSeq
2355    * @return
2356    */
 
2357  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2358    SequenceI proteinSeq)
2359    {
2360  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2361  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2362   
2363    /*
2364    * if not a whole number of codons, truncate mapping
2365    */
2366  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2367  22 if (codonRemainder > 0)
2368    {
2369  2 mappedDnaLength -= codonRemainder;
2370  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2371    }
2372   
2373  22 int proteinLength = proteinSeq.getLength();
2374  22 int proteinStart = proteinSeq.getStart();
2375  22 int proteinEnd = proteinSeq.getEnd();
2376   
2377    /*
2378    * incomplete start codon may mean X at start of peptide
2379    * we ignore both for mapping purposes
2380    */
2381  22 if (proteinSeq.getCharAt(0) == 'X')
2382    {
2383    // todo JAL-2022 support startPhase > 0
2384  1 proteinStart++;
2385  1 proteinLength--;
2386    }
2387  22 List<int[]> proteinRange = new ArrayList<>();
2388   
2389    /*
2390    * dna length should map to protein (or protein plus stop codon)
2391    */
2392  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2393  22 if (codesForResidues == (proteinLength + 1))
2394    {
2395    // assuming extra codon is for STOP and not in peptide
2396    // todo: check trailing codon is indeed a STOP codon
2397  2 codesForResidues--;
2398  2 mappedDnaLength -= CODON_LENGTH;
2399  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2400    }
2401   
2402  22 if (codesForResidues == proteinLength)
2403    {
2404  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2405  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2406    }
2407  18 return null;
2408    }
2409   
2410    /**
2411    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2412    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2413    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2414    * position order, so this method is only valid for linear CDS in the same
2415    * sense as the protein product.
2416    *
2417    * @param dnaSeq
2418    * @return
2419    */
 
2420  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2421    {
2422  24 List<int[]> result = new ArrayList<>();
2423   
2424  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2425    .getFeaturesByOntology(SequenceOntologyI.CDS);
2426  24 if (sfs.isEmpty())
2427    {
2428  16 return result;
2429    }
2430  8 SequenceFeatures.sortFeatures(sfs, true);
2431   
2432  8 for (SequenceFeature sf : sfs)
2433    {
2434  16 int phase = 0;
2435  16 try
2436    {
2437  16 String s = sf.getPhase();
2438  16 if (s != null)
2439    {
2440  2 phase = Integer.parseInt(s);
2441    }
2442    } catch (NumberFormatException e)
2443    {
2444    // leave as zero
2445    }
2446    /*
2447    * phase > 0 on first codon means 5' incomplete - skip to the start
2448    * of the next codon; example ENST00000496384
2449    */
2450  16 int begin = sf.getBegin();
2451  16 int end = sf.getEnd();
2452  16 if (result.isEmpty() && phase > 0)
2453    {
2454  2 begin += phase;
2455  2 if (begin > end)
2456    {
2457    // shouldn't happen!
2458  0 System.err
2459    .println("Error: start phase extends beyond start CDS in "
2460    + dnaSeq.getName());
2461    }
2462    }
2463  16 result.add(new int[] { begin, end });
2464    }
2465   
2466    /*
2467    * Finally sort ranges by start position. This avoids a dependency on
2468    * keeping features in order on the sequence (if they are in order anyway,
2469    * the sort will have almost no work to do). The implicit assumption is CDS
2470    * ranges are assembled in order. Other cases should not use this method,
2471    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2472    */
2473  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2474  8 return result;
2475    }
2476   
2477    /**
2478    * Makes an alignment with a copy of the given sequences, adding in any
2479    * non-redundant sequences which are mapped to by the cross-referenced
2480    * sequences.
2481    *
2482    * @param seqs
2483    * @param xrefs
2484    * @param dataset
2485    * the alignment dataset shared by the new copy
2486    * @return
2487    */
 
2488  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2489    SequenceI[] xrefs, AlignmentI dataset)
2490    {
2491  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2492  0 copy.setDataset(dataset);
2493  0 boolean isProtein = !copy.isNucleotide();
2494  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2495  0 if (xrefs != null)
2496    {
2497    // BH 2019.01.25 recoded to remove iterators
2498   
2499  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2500    {
2501  0 SequenceI xref = xrefs[ix];
2502  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2503  0 if (dbrefs != null)
2504    {
2505  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2506    {
2507  0 DBRefEntry dbref = dbrefs.get(ir);
2508  0 Mapping map = dbref.getMap();
2509  0 SequenceI mto;
2510  0 if (map == null || (mto = map.getTo()) == null
2511    || mto.isProtein() != isProtein)
2512    {
2513  0 continue;
2514    }
2515  0 SequenceI mappedTo = mto;
2516  0 SequenceI match = matcher.findIdMatch(mappedTo);
2517  0 if (match == null)
2518    {
2519  0 matcher.add(mappedTo);
2520  0 copy.addSequence(mappedTo);
2521    }
2522    }
2523    }
2524    }
2525    }
2526  0 return copy;
2527    }
2528   
2529    /**
2530    * Try to align sequences in 'unaligned' to match the alignment of their
2531    * mapped regions in 'aligned'. For example, could use this to align CDS
2532    * sequences which are mapped to their parent cDNA sequences.
2533    *
2534    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2535    * dna-to-protein or protein-to-dna use alternative methods.
2536    *
2537    * @param unaligned
2538    * sequences to be aligned
2539    * @param aligned
2540    * holds aligned sequences and their mappings
2541    * @return
2542    */
 
2543  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2544    {
2545    /*
2546    * easy case - aligning a copy of aligned sequences
2547    */
2548  4 if (alignAsSameSequences(unaligned, aligned))
2549    {
2550  0 return unaligned.getHeight();
2551    }
2552   
2553    /*
2554    * fancy case - aligning via mappings between sequences
2555    */
2556  4 List<SequenceI> unmapped = new ArrayList<>();
2557  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2558    unaligned, aligned, unmapped);
2559  4 int width = columnMap.size();
2560  4 char gap = unaligned.getGapCharacter();
2561  4 int realignedCount = 0;
2562    // TODO: verify this loop scales sensibly for very wide/high alignments
2563   
2564  4 for (SequenceI seq : unaligned.getSequences())
2565    {
2566  26 if (!unmapped.contains(seq))
2567    {
2568  26 char[] newSeq = new char[width];
2569  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2570    // Integer iteration below
2571  26 int newCol = 0;
2572  26 int lastCol = 0;
2573   
2574    /*
2575    * traverse the map to find columns populated
2576    * by our sequence
2577    */
2578  26 for (Integer column : columnMap.keySet())
2579    {
2580  58976 Character c = columnMap.get(column).get(seq);
2581  58976 if (c != null)
2582    {
2583    /*
2584    * sequence has a character at this position
2585    *
2586    */
2587  31986 newSeq[newCol] = c;
2588  31986 lastCol = newCol;
2589    }
2590  58976 newCol++;
2591    }
2592   
2593    /*
2594    * trim trailing gaps
2595    */
2596  26 if (lastCol < width)
2597    {
2598  26 char[] tmp = new char[lastCol + 1];
2599  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2600  26 newSeq = tmp;
2601    }
2602    // TODO: optimise SequenceI to avoid char[]->String->char[]
2603  26 seq.setSequence(String.valueOf(newSeq));
2604  26 realignedCount++;
2605    }
2606    }
2607  4 return realignedCount;
2608    }
2609   
2610    /**
2611    * If unaligned and aligned sequences share the same dataset sequences, then
2612    * simply copies the aligned sequences to the unaligned sequences and returns
2613    * true; else returns false
2614    *
2615    * @param unaligned
2616    * - sequences to be aligned based on aligned
2617    * @param aligned
2618    * - 'guide' alignment containing sequences derived from same dataset
2619    * as unaligned
2620    * @return
2621    */
 
2622  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2623    AlignmentI aligned)
2624    {
2625  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2626    {
2627  0 return false; // should only pass alignments with datasets here
2628    }
2629   
2630    // map from dataset sequence to alignment sequence(s)
2631  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2632  8 for (SequenceI seq : aligned.getSequences())
2633    {
2634  59 SequenceI ds = seq.getDatasetSequence();
2635  59 if (alignedDatasets.get(ds) == null)
2636    {
2637  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2638    }
2639  59 alignedDatasets.get(ds).add(seq);
2640    }
2641   
2642    /*
2643    * first pass - check whether all sequences to be aligned share a
2644    * dataset sequence with an aligned sequence; also note the leftmost
2645    * ungapped column from which to copy
2646    */
2647  8 int leftmost = Integer.MAX_VALUE;
2648  8 for (SequenceI seq : unaligned.getSequences())
2649    {
2650  14 final SequenceI ds = seq.getDatasetSequence();
2651  14 if (!alignedDatasets.containsKey(ds))
2652    {
2653  5 return false;
2654    }
2655  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2656  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2657  9 leftmost = Math.min(leftmost, startCol);
2658    }
2659   
2660    /*
2661    * second pass - copy aligned sequences;
2662    * heuristic rule: pair off sequences in order for the case where
2663    * more than one shares the same dataset sequence
2664    */
2665  3 final char gapCharacter = aligned.getGapCharacter();
2666  3 for (SequenceI seq : unaligned.getSequences())
2667    {
2668  7 List<SequenceI> alignedSequences = alignedDatasets
2669    .get(seq.getDatasetSequence());
2670  7 if (alignedSequences.isEmpty())
2671    {
2672    /*
2673    * defensive check - shouldn't happen! (JAL-3536)
2674    */
2675  0 continue;
2676    }
2677  7 SequenceI alignedSeq = alignedSequences.get(0);
2678   
2679    /*
2680    * gap fill for leading (5') UTR if any
2681    */
2682    // TODO this copies intron columns - wrong!
2683  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2684  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2685  7 char[] seqchars = new char[endCol - leftmost + 1];
2686  7 Arrays.fill(seqchars, gapCharacter);
2687  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2688  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2689    toCopy.length);
2690  7 seq.setSequence(String.valueOf(seqchars));
2691  7 if (alignedSequences.size() > 0)
2692    {
2693    // pop off aligned sequences (except the last one)
2694  7 alignedSequences.remove(0);
2695    }
2696    }
2697   
2698    /*
2699    * finally remove gapped columns (e.g. introns)
2700    */
2701  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2702    unaligned.getWidth() - 1, unaligned);
2703   
2704  3 return true;
2705    }
2706   
2707    /**
2708    * Returns a map whose key is alignment column number (base 1), and whose
2709    * values are a map of sequence characters in that column.
2710    *
2711    * @param unaligned
2712    * @param aligned
2713    * @param unmapped
2714    * @return
2715    */
 
2716  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2717    AlignmentI unaligned, AlignmentI aligned,
2718    List<SequenceI> unmapped)
2719    {
2720    /*
2721    * Map will hold, for each aligned column position, a map of
2722    * {unalignedSequence, characterPerSequence} at that position.
2723    * TreeMap keeps the entries in ascending column order.
2724    */
2725  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2726   
2727    /*
2728    * record any sequences that have no mapping so can't be realigned
2729    */
2730  4 unmapped.addAll(unaligned.getSequences());
2731   
2732  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2733   
2734  4 for (SequenceI seq : unaligned.getSequences())
2735    {
2736  26 for (AlignedCodonFrame mapping : mappings)
2737    {
2738  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2739  510 if (fromSeq != null)
2740    {
2741  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2742  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2743    {
2744  26 unmapped.remove(seq);
2745    }
2746    }
2747    }
2748    }
2749  4 return map;
2750    }
2751   
2752    /**
2753    * Helper method that adds to a map the mapped column positions of a sequence.
2754    * <br>
2755    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2756    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2757    * sequence.
2758    *
2759    * @param seq
2760    * the sequence whose column positions we are recording
2761    * @param fromSeq
2762    * a sequence that is mapped to the first sequence
2763    * @param seqMap
2764    * the mapping from 'fromSeq' to 'seq'
2765    * @param map
2766    * a map to add the column positions (in fromSeq) of the mapped
2767    * positions of seq
2768    * @return
2769    */
 
2770  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2771    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2772    {
2773  28 if (seqMap == null)
2774    {
2775  0 return false;
2776    }
2777   
2778    /*
2779    * invert mapping if it is from unaligned to aligned sequence
2780    */
2781  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2782    {
2783  0 seqMap = new Mapping(seq.getDatasetSequence(),
2784    seqMap.getMap().getInverse());
2785    }
2786   
2787  28 int toStart = seq.getStart();
2788   
2789    /*
2790    * traverse [start, end, start, end...] ranges in fromSeq
2791    */
2792  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2793    {
2794  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2795    {
2796  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2797   
2798    /*
2799    * find the range mapped to (sequence positions base 1)
2800    */
2801  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2802    fromRange[i + 1]);
2803  31 if (range == null)
2804    {
2805  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2806    + " from " + fromSeq.getName());
2807  0 return false;
2808    }
2809  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2810  31 int mappedCharPos = range[0];
2811   
2812    /*
2813    * walk over the 'from' aligned sequence in forward or reverse
2814    * direction; when a non-gap is found, record the column position
2815    * of the next character of the mapped-to sequence; stop when all
2816    * the characters of the range have been counted
2817    */
2818  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2819    && fromCol >= 0)
2820    {
2821  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2822    {
2823    /*
2824    * mapped from sequence has a character in this column
2825    * record the column position for the mapped to character
2826    */
2827  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2828  31998 if (seqsMap == null)
2829    {
2830  5398 seqsMap = new HashMap<>();
2831  5398 map.put(fromCol, seqsMap);
2832    }
2833  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2834  31998 mappedCharPos++;
2835    }
2836  2794243 fromCol += (forward ? 1 : -1);
2837    }
2838    }
2839    }
2840  28 return true;
2841    }
2842   
2843    // strictly temporary hack until proper criteria for aligning protein to cds
2844    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2845  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2846    {
2847  4 for (SequenceI seq : alignment.getSequences())
2848    {
2849  88 String name = seq.getName();
2850  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2851    {
2852  0 return false;
2853    }
2854    }
2855  4 return true;
2856    }
2857   
 
2858  5 toggle public static List<String> getSecondaryStructureSources(
2859    AlignmentAnnotation[] annotations)
2860    {
2861   
2862  5 List<String> ssSources = new ArrayList<>();
2863  5 Set<String> addedLabels = new HashSet<>(); // to keep track of added labels
2864   
2865  5 for (AlignmentAnnotation annotation : annotations)
2866    {
2867  12 String label = annotation.label;
2868  12 if (Constants.SECONDARY_STRUCTURE_LABELS.containsKey(label)
2869    && !addedLabels.contains(label))
2870    {
2871  4 ssSources.add(Constants.SECONDARY_STRUCTURE_LABELS.get(label));
2872  4 addedLabels.add(label); // Add the label to the set
2873    }
2874    }
2875   
2876  5 return ssSources;
2877    }
2878   
 
2879  5 toggle public static boolean isSecondaryStructurePresent(
2880    AlignmentAnnotation[] annotations)
2881    {
2882  5 boolean ssPresent = false;
2883   
2884  5 for (AlignmentAnnotation aa : annotations)
2885    {
2886  5 if (ssPresent)
2887    {
2888  0 break;
2889    }
2890   
2891  5 if (Constants.SECONDARY_STRUCTURE_LABELS.containsKey(aa.label))
2892    {
2893  3 ssPresent = true;
2894  3 break;
2895    }
2896    }
2897   
2898  5 return ssPresent;
2899   
2900    }
2901   
 
2902  0 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2903    {
2904   
2905  0 if (symbol == Constants.COIL)
2906    {
2907  0 return Color.gray;
2908    }
2909  0 if (symbol == Constants.SHEET)
2910    {
2911  0 return Color.green;
2912    }
2913  0 if (symbol == Constants.HELIX)
2914    {
2915  0 return Color.red;
2916    }
2917   
2918  0 return Color.gray;
2919    }
2920   
 
2921  0 toggle public static char findSSAnnotationForGivenSeqposition(
2922    AlignmentAnnotation aa, int seqPosition)
2923    {
2924  0 char ss = '*';
2925   
2926  0 if (aa != null)
2927    {
2928  0 if (aa.getAnnotationForPosition(seqPosition) != null)
2929    {
2930  0 Annotation a = aa.getAnnotationForPosition(seqPosition);
2931  0 ss = a.secondaryStructure;
2932   
2933    // There is no representation for coil and it can be either ' ' or null.
2934  0 if (ss == ' ' || ss == '-')
2935    {
2936  0 ss = Constants.COIL;
2937    }
2938    }
2939    else
2940    {
2941  0 ss = Constants.COIL;
2942    }
2943    }
2944   
2945  0 return ss;
2946    }
2947   
 
2948  0 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
2949    AlignmentAnnotation[] annotations)
2950    {
2951   
2952  0 List<String> ssSources = new ArrayList<>();
2953  0 Set<String> addedSources = new HashSet<>(); // to keep track of added
2954    // sources
2955   
2956  0 for (AlignmentAnnotation aa : annotations)
2957    {
2958   
2959  0 String ssSource = extractSSSourceFromAnnotationDescription(aa);
2960   
2961  0 if (ssSource != null && !addedSources.contains(ssSource))
2962    {
2963  0 ssSources.add(ssSource);
2964  0 addedSources.add(ssSource);
2965    }
2966   
2967    }
2968  0 Collections.sort(ssSources);
2969   
2970  0 return ssSources;
2971   
2972    }
2973   
 
2974  0 toggle public static String extractSSSourceFromAnnotationDescription(
2975    AlignmentAnnotation aa)
2976    {
2977   
2978  0 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
2979    {
2980   
2981  0 if (label.equals(aa.label))
2982    {
2983   
2984    // For JPred
2985  0 if (aa.label.equals(Constants.SS_ANNOTATION_FROM_JPRED_LABEL))
2986    {
2987   
2988  0 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
2989   
2990    }
2991   
2992    // For input with secondary structure
2993  0 if (aa.label.equals(Constants.SS_ANNOTATION_LABEL)
2994    && aa.description.equals(Constants.SS_ANNOTATION_LABEL))
2995    {
2996   
2997  0 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
2998   
2999    }
3000   
3001    // For other sources
3002  0 if (aa.sequenceRef == null)
3003    {
3004  0 return null;
3005    }
3006  0 else if (aa.sequenceRef.getDatasetSequence() == null)
3007    {
3008  0 return null;
3009    }
3010  0 Vector<PDBEntry> pdbEntries = aa.sequenceRef.getDatasetSequence()
3011    .getAllPDBEntries();
3012   
3013  0 for (PDBEntry entry : pdbEntries)
3014    {
3015   
3016  0 String entryProvider = entry.getProvider();
3017  0 if (entryProvider == null)
3018    {
3019  0 entryProvider = "PDB";
3020    }
3021   
3022    // Trim the string from first occurrence of colon
3023  0 String entryID = entry.getId();
3024  0 int index = entryID.indexOf(':');
3025   
3026    // Check if colon exists
3027  0 if (index != -1)
3028    {
3029   
3030    // Trim the string from first occurrence of colon
3031  0 entryID = entryID.substring(0, index);
3032   
3033    }
3034   
3035  0 if (entryProvider == "PDB" && aa.description.toLowerCase()
3036    .contains("secondary structure for "
3037    + entryID.toLowerCase()))
3038    {
3039   
3040  0 return entryProvider;
3041   
3042    }
3043   
3044  0 else if (entryProvider != "PDB" && aa.description.toLowerCase()
3045    .contains(entryID.toLowerCase()))
3046    {
3047   
3048  0 return entryProvider;
3049   
3050    }
3051   
3052    }
3053   
3054    }
3055    }
3056   
3057  0 return null;
3058   
3059    }
3060   
3061    // to do set priority for labels
 
3062  241392 toggle public static AlignmentAnnotation getDisplayedAlignmentAnnotation(
3063    SequenceI seq)
3064    {
3065   
3066  241392 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3067    {
3068   
3069  482784 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3070  482784 if (aa != null)
3071    {
3072   
3073  0 for (AlignmentAnnotation annot : aa)
3074    {
3075  0 if (annot.isForDisplay())
3076    {
3077  0 return annot;
3078    }
3079    }
3080    }
3081    }
3082   
3083  241392 return null;
3084   
3085    }
3086   
3087    }