Clover icon

Coverage Report

  1. Project Clover database Mon Dec 1 2025 13:17:41 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

482
973
63
2
3,333
2,069
373
0.38
15.44
31.5
5.92

Classes

Class Line # Actions
AlignmentUtils 83 967 367
0.845079884.5%
AlignmentUtils.DnaVariant 99 6 6
0.00%
 

Contributing tests

This file is covered by 326 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.DBRefSource;
55    import jalview.datamodel.GeneLociI;
56    import jalview.datamodel.IncompleteCodonException;
57    import jalview.datamodel.Mapping;
58    import jalview.datamodel.PDBEntry;
59    import jalview.datamodel.Sequence;
60    import jalview.datamodel.SequenceFeature;
61    import jalview.datamodel.SequenceGroup;
62    import jalview.datamodel.SequenceI;
63    import jalview.datamodel.features.SequenceFeatures;
64    import jalview.io.gff.SequenceOntologyI;
65    import jalview.schemes.ResidueProperties;
66    import jalview.util.ColorUtils;
67    import jalview.util.Comparison;
68    import jalview.util.Constants;
69    import jalview.util.DBRefUtils;
70    import jalview.util.IntRangeComparator;
71    import jalview.util.MapList;
72    import jalview.util.MappingUtils;
73    import jalview.util.ShiftList;
74    import uk.ac.vamsas.objects.core.descriptors.NewickDescriptor;
75   
76    /**
77    * grab bag of useful alignment manipulation operations Expect these to be
78    * refactored elsewhere at some point.
79    *
80    * @author jimp
81    *
82    */
 
83    public class AlignmentUtils
84    {
85    private static final int CODON_LENGTH = 3;
86   
87    private static final String SEQUENCE_VARIANT = "sequence_variant:";
88   
89    /*
90    * the 'id' attribute is provided for variant features fetched from
91    * Ensembl using its REST service with JSON format
92    */
93    public static final String VARIANT_ID = "id";
94   
95    /**
96    * A data model to hold the 'normal' base value at a position, and an optional
97    * sequence variant feature
98    */
 
99    static final class DnaVariant
100    {
101    final String base;
102   
103    SequenceFeature variant;
104   
 
105  0 toggle DnaVariant(String nuc)
106    {
107  0 base = nuc;
108  0 variant = null;
109    }
110   
 
111  0 toggle DnaVariant(String nuc, SequenceFeature var)
112    {
113  0 base = nuc;
114  0 variant = var;
115    }
116   
 
117  0 toggle public String getSource()
118    {
119  0 return variant == null ? null : variant.getFeatureGroup();
120    }
121   
122    /**
123    * toString for aid in the debugger only
124    */
 
125  0 toggle @Override
126    public String toString()
127    {
128  0 return base + ":" + (variant == null ? "" : variant.getDescription());
129    }
130    }
131   
132    /**
133    * given an existing alignment, create a new alignment including all, or up to
134    * flankSize additional symbols from each sequence's dataset sequence
135    *
136    * @param core
137    * @param flankSize
138    * @return AlignmentI
139    */
 
140  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
141    {
142  27 List<SequenceI> sq = new ArrayList<>();
143  27 int maxoffset = 0;
144  27 for (SequenceI s : core.getSequences())
145    {
146  131 SequenceI newSeq = s.deriveSequence();
147  131 final int newSeqStart = newSeq.getStart() - 1;
148  131 if (newSeqStart > maxoffset
149    && newSeq.getDatasetSequence().getStart() < s.getStart())
150    {
151  131 maxoffset = newSeqStart;
152    }
153  131 sq.add(newSeq);
154    }
155  27 if (flankSize > -1)
156    {
157  25 maxoffset = Math.min(maxoffset, flankSize);
158    }
159   
160    /*
161    * now add offset left and right to create an expanded alignment
162    */
163  27 for (SequenceI s : sq)
164    {
165  131 SequenceI ds = s;
166  262 while (ds.getDatasetSequence() != null)
167    {
168  131 ds = ds.getDatasetSequence();
169    }
170  131 int s_end = s.findPosition(s.getStart() + s.getLength());
171    // find available flanking residues for sequence
172  131 int ustream_ds = s.getStart() - ds.getStart();
173  131 int dstream_ds = ds.getEnd() - s_end;
174   
175    // build new flanked sequence
176   
177    // compute gap padding to start of flanking sequence
178  131 int offset = maxoffset - ustream_ds;
179   
180    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
181  131 if (flankSize >= 0)
182    {
183  125 if (flankSize < ustream_ds)
184    {
185    // take up to flankSize residues
186  40 offset = maxoffset - flankSize;
187  40 ustream_ds = flankSize;
188    }
189  125 if (flankSize <= dstream_ds)
190    {
191  116 dstream_ds = flankSize - 1;
192    }
193    }
194    // TODO use Character.toLowerCase to avoid creating String objects?
195  131 char[] upstream = new String(ds
196    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
197    .toLowerCase(Locale.ROOT).toCharArray();
198  131 char[] downstream = new String(
199    ds.getSequence(s_end - 1, s_end + dstream_ds))
200    .toLowerCase(Locale.ROOT).toCharArray();
201  131 char[] coreseq = s.getSequence();
202  131 char[] nseq = new char[offset + upstream.length + downstream.length
203    + coreseq.length];
204  131 char c = core.getGapCharacter();
205   
206  131 int p = 0;
207  461 for (; p < offset; p++)
208    {
209  330 nseq[p] = c;
210    }
211   
212  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
213  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
214    coreseq.length);
215  131 System.arraycopy(downstream, 0, nseq,
216    p + coreseq.length + upstream.length, downstream.length);
217  131 s.setSequence(new String(nseq));
218  131 s.setStart(s.getStart() - ustream_ds);
219  131 s.setEnd(s_end + downstream.length);
220    }
221  27 AlignmentI newAl = new jalview.datamodel.Alignment(
222    sq.toArray(new SequenceI[0]));
223  27 for (SequenceI s : sq)
224    {
225  131 if (s.getAnnotation() != null)
226    {
227  1 for (AlignmentAnnotation aa : s.getAnnotation())
228    {
229  1 aa.adjustForAlignment(); // JAL-1712 fix
230  1 newAl.addAnnotation(aa);
231    }
232    }
233    }
234  27 newAl.setDataset(core.getDataset());
235  27 return newAl;
236    }
237   
238    /**
239    * Returns the index (zero-based position) of a sequence in an alignment, or
240    * -1 if not found.
241    *
242    * @param al
243    * @param seq
244    * @return
245    */
 
246  67170 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
247    {
248  67170 int result = -1;
249  67170 int pos = 0;
250  67170 for (SequenceI alSeq : al.getSequences())
251    {
252  126804637 if (alSeq == seq)
253    {
254  67099 result = pos;
255  67099 break;
256    }
257  126737541 pos++;
258    }
259  67169 return result;
260    }
261   
262    /**
263    * Returns a map of lists of sequences in the alignment, keyed by sequence
264    * name. For use in mapping between different alignment views of the same
265    * sequences.
266    *
267    * @see jalview.datamodel.AlignmentI#getSequencesByName()
268    */
 
269  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
270    AlignmentI al)
271    {
272  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
273  1 for (SequenceI seq : al.getSequences())
274    {
275  3 String name = seq.getName();
276  3 if (name != null)
277    {
278  3 List<SequenceI> seqs = theMap.get(name);
279  3 if (seqs == null)
280    {
281  2 seqs = new ArrayList<>();
282  2 theMap.put(name, seqs);
283    }
284  3 seqs.add(seq);
285    }
286    }
287  1 return theMap;
288    }
289   
290    /**
291    * Build mapping of protein to cDNA alignment. Mappings are made between
292    * sequences where the cDNA translates to the protein sequence. Any new
293    * mappings are added to the protein alignment. Returns true if any mappings
294    * either already exist or were added, else false.
295    *
296    * @param proteinAlignment
297    * @param cdnaAlignment
298    * @return
299    */
 
300  4 toggle public static boolean mapProteinAlignmentToCdna(
301    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
302    {
303  4 if (proteinAlignment == null || cdnaAlignment == null)
304    {
305  0 return false;
306    }
307   
308  4 Set<SequenceI> mappedDna = new HashSet<>();
309  4 Set<SequenceI> mappedProtein = new HashSet<>();
310   
311    /*
312    * First pass - map sequences where cross-references exist. This include
313    * 1-to-many mappings to support, for example, variant cDNA.
314    */
315  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
316    cdnaAlignment, mappedDna, mappedProtein, true);
317   
318    /*
319    * Second pass - map sequences where no cross-references exist. This only
320    * does 1-to-1 mappings and assumes corresponding sequences are in the same
321    * order in the alignments.
322    */
323  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
324    mappedDna, mappedProtein, false);
325  4 return mappingPerformed;
326    }
327   
328    /**
329    * Make mappings between compatible sequences (where the cDNA translation
330    * matches the protein).
331    *
332    * @param proteinAlignment
333    * @param cdnaAlignment
334    * @param mappedDna
335    * a set of mapped DNA sequences (to add to)
336    * @param mappedProtein
337    * a set of mapped Protein sequences (to add to)
338    * @param xrefsOnly
339    * if true, only map sequences where xrefs exist
340    * @return
341    */
 
342  8 toggle protected static boolean mapProteinToCdna(
343    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
344    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
345    boolean xrefsOnly)
346    {
347  8 boolean mappingExistsOrAdded = false;
348  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
349  8 for (SequenceI aaSeq : thisSeqs)
350    {
351  22 boolean proteinMapped = false;
352  22 AlignedCodonFrame acf = new AlignedCodonFrame();
353   
354  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
355    {
356    /*
357    * Always try to map if sequences have xref to each other; this supports
358    * variant cDNA or alternative splicing for a protein sequence.
359    *
360    * If no xrefs, try to map progressively, assuming that alignments have
361    * mappable sequences in corresponding order. These are not
362    * many-to-many, as that would risk mixing species with similar cDNA
363    * sequences.
364    */
365  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
366    {
367  39 continue;
368    }
369   
370    /*
371    * Don't map non-xrefd sequences more than once each. This heuristic
372    * allows us to pair up similar sequences in ordered alignments.
373    */
374  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
375    || mappedDna.contains(cdnaSeq)))
376    {
377  29 continue;
378    }
379  18 if (mappingExists(proteinAlignment.getCodonFrames(),
380    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
381    {
382  0 mappingExistsOrAdded = true;
383    }
384    else
385    {
386  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
387  18 if (map != null)
388    {
389  12 acf.addMap(cdnaSeq, aaSeq, map);
390  12 mappingExistsOrAdded = true;
391  12 proteinMapped = true;
392  12 mappedDna.add(cdnaSeq);
393  12 mappedProtein.add(aaSeq);
394    }
395    }
396    }
397  22 if (proteinMapped)
398    {
399  11 proteinAlignment.addCodonFrame(acf);
400    }
401    }
402  8 return mappingExistsOrAdded;
403    }
404   
405    /**
406    * Answers true if the mappings include one between the given (dataset)
407    * sequences.
408    */
 
409  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
410    SequenceI aaSeq, SequenceI cdnaSeq)
411    {
412  18 if (mappings != null)
413    {
414  18 for (AlignedCodonFrame acf : mappings)
415    {
416  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
417    {
418  0 return true;
419    }
420    }
421    }
422  18 return false;
423    }
424   
425    /**
426    * Builds a mapping (if possible) of a cDNA to a protein sequence.
427    * <ul>
428    * <li>first checks if the cdna translates exactly to the protein
429    * sequence</li>
430    * <li>else checks for translation after removing a STOP codon</li>
431    * <li>else checks for translation after removing a START codon</li>
432    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
433    * </ul>
434    * Returns null if no mapping is determined.
435    *
436    * @param proteinSeq
437    * the aligned protein sequence
438    * @param cdnaSeq
439    * the aligned cdna sequence
440    * @return
441    */
 
442  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
443    SequenceI cdnaSeq)
444    {
445    /*
446    * Here we handle either dataset sequence set (desktop) or absent (applet).
447    * Use only the char[] form of the sequence to avoid creating possibly large
448    * String objects.
449    */
450  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
451  31 char[] aaSeqChars = proteinDataset != null
452    ? proteinDataset.getSequence()
453    : proteinSeq.getSequence();
454  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
455  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
456    : cdnaSeq.getSequence();
457  31 if (aaSeqChars == null || cdnaSeqChars == null)
458    {
459  0 return null;
460    }
461   
462    /*
463    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
464    */
465  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
466  31 int cdnaLength = cdnaSeqChars.length;
467  31 int cdnaStart = cdnaSeq.getStart();
468  31 int cdnaEnd = cdnaSeq.getEnd();
469  31 final int proteinStart = proteinSeq.getStart();
470  31 final int proteinEnd = proteinSeq.getEnd();
471   
472    /*
473    * If lengths don't match, try ignoring stop codon (if present)
474    */
475  31 if (cdnaLength != mappedLength && cdnaLength > 2)
476    {
477  16 String lastCodon = String.valueOf(cdnaSeqChars,
478    cdnaLength - CODON_LENGTH, CODON_LENGTH)
479    .toUpperCase(Locale.ROOT);
480  16 for (String stop : ResidueProperties.STOP_CODONS)
481    {
482  47 if (lastCodon.equals(stop))
483    {
484  2 cdnaEnd -= CODON_LENGTH;
485  2 cdnaLength -= CODON_LENGTH;
486  2 break;
487    }
488    }
489    }
490   
491    /*
492    * If lengths still don't match, try ignoring start codon.
493    */
494  31 int startOffset = 0;
495  31 if (cdnaLength != mappedLength && cdnaLength > 2
496    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
497    .toUpperCase(Locale.ROOT)
498    .equals(ResidueProperties.START))
499    {
500  5 startOffset += CODON_LENGTH;
501  5 cdnaStart += CODON_LENGTH;
502  5 cdnaLength -= CODON_LENGTH;
503    }
504   
505  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
506    {
507    /*
508    * protein is translation of dna (+/- start/stop codons)
509    */
510  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
511    new int[]
512    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
513  15 return map;
514    }
515   
516    /*
517    * translation failed - try mapping CDS annotated regions of dna
518    */
519  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
520    }
521   
522    /**
523    * Test whether the given cdna sequence, starting at the given offset,
524    * translates to the given amino acid sequence, using the standard translation
525    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
526    *
527    * @param cdnaSeqChars
528    * @param cdnaStart
529    * @param aaSeqChars
530    * @return
531    */
 
532  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
533    char[] aaSeqChars)
534    {
535  51 if (cdnaSeqChars == null || aaSeqChars == null)
536    {
537  3 return false;
538    }
539   
540  48 int aaPos = 0;
541  48 int dnaPos = cdnaStart;
542  161 for (; dnaPos < cdnaSeqChars.length - 2
543    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
544    {
545  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
546  130 final String translated = ResidueProperties.codonTranslate(codon);
547   
548    /*
549    * allow * in protein to match untranslatable in dna
550    */
551  130 final char aaRes = aaSeqChars[aaPos];
552  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
553    && aaRes == '*')
554    {
555  4 continue;
556    }
557  126 if (translated == null || !(aaRes == translated.charAt(0)))
558    {
559    // debug
560    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
561    // + ": "
562    // + codon + "(" + translated + ") != " + aaRes));
563  17 return false;
564    }
565    }
566   
567    /*
568    * check we matched all of the protein sequence
569    */
570  31 if (aaPos != aaSeqChars.length)
571    {
572  2 return false;
573    }
574   
575    /*
576    * check we matched all of the dna except
577    * for optional trailing STOP codon
578    */
579  29 if (dnaPos == cdnaSeqChars.length)
580    {
581  17 return true;
582    }
583  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
584    {
585  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
586  11 if (ResidueProperties.STOP
587    .equals(ResidueProperties.codonTranslate(codon)))
588    {
589  9 return true;
590    }
591    }
592  3 return false;
593    }
594   
595    /**
596    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
597    * currently assumes that we are aligning cDNA to match protein.
598    *
599    * @param seq
600    * the sequence to be realigned
601    * @param al
602    * the alignment whose sequence alignment is to be 'copied'
603    * @param gap
604    * character string represent a gap in the realigned sequence
605    * @param preserveUnmappedGaps
606    * @param preserveMappedGaps
607    * @return true if the sequence was realigned, false if it could not be
608    */
 
609  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
610    String gap, boolean preserveMappedGaps,
611    boolean preserveUnmappedGaps)
612    {
613    /*
614    * Get any mappings from the source alignment to the target (dataset)
615    * sequence.
616    */
617    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
618    // all mappings. Would it help to constrain this?
619  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
620  8 if (mappings == null || mappings.isEmpty())
621    {
622  0 return false;
623    }
624   
625    /*
626    * Locate the aligned source sequence whose dataset sequence is mapped. We
627    * just take the first match here (as we can't align like more than one
628    * sequence).
629    */
630  8 SequenceI alignFrom = null;
631  8 AlignedCodonFrame mapping = null;
632  8 for (AlignedCodonFrame mp : mappings)
633    {
634  8 alignFrom = mp.findAlignedSequence(seq, al);
635  8 if (alignFrom != null)
636    {
637  4 mapping = mp;
638  4 break;
639    }
640    }
641   
642  8 if (alignFrom == null)
643    {
644  4 return false;
645    }
646  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
647    preserveMappedGaps, preserveUnmappedGaps);
648  4 return true;
649    }
650   
651    /**
652    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
653    * match residues and codons. Flags control whether existing gaps in unmapped
654    * (intron) and mapped (exon) regions are preserved or not. Gaps between
655    * intron and exon are only retained if both flags are set.
656    *
657    * @param alignTo
658    * @param alignFrom
659    * @param mapping
660    * @param myGap
661    * @param sourceGap
662    * @param preserveUnmappedGaps
663    * @param preserveMappedGaps
664    */
 
665  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
666    AlignedCodonFrame mapping, String myGap, char sourceGap,
667    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
668    {
669    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
670   
671    // aligned and dataset sequence positions, all base zero
672  19 int thisSeqPos = 0;
673  19 int sourceDsPos = 0;
674   
675  19 int basesWritten = 0;
676  19 char myGapChar = myGap.charAt(0);
677  19 int ratio = myGap.length();
678   
679  19 int fromOffset = alignFrom.getStart() - 1;
680  19 int toOffset = alignTo.getStart() - 1;
681  19 int sourceGapMappedLength = 0;
682  19 boolean inExon = false;
683  19 final int toLength = alignTo.getLength();
684  19 final int fromLength = alignFrom.getLength();
685  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
686   
687    /*
688    * Traverse the 'model' aligned sequence
689    */
690  205 for (int i = 0; i < fromLength; i++)
691    {
692  186 char sourceChar = alignFrom.getCharAt(i);
693  186 if (sourceChar == sourceGap)
694    {
695  44 sourceGapMappedLength += ratio;
696  44 continue;
697    }
698   
699    /*
700    * Found a non-gap character. Locate its mapped region if any.
701    */
702  142 sourceDsPos++;
703    // Note mapping positions are base 1, our sequence positions base 0
704  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
705    sourceDsPos + fromOffset);
706  142 if (mappedPos == null)
707    {
708    /*
709    * unmapped position; treat like a gap
710    */
711  94 sourceGapMappedLength += ratio;
712    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
713    // residue "
714    // + sourceDsPos + "(" + sourceChar + ")");
715    // return;
716  94 continue;
717    }
718   
719  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
720  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
721  48 StringBuilder trailingCopiedGap = new StringBuilder();
722   
723    /*
724    * Copy dna sequence up to and including this codon. Optionally, include
725    * gaps before the codon starts (in introns) and/or after the codon starts
726    * (in exons).
727    *
728    * Note this only works for 'linear' splicing, not reverse or interleaved.
729    * But then 'align dna as protein' doesn't make much sense otherwise.
730    */
731  48 int intronLength = 0;
732  294 while (basesWritten + toOffset < mappedCodonEnd
733    && thisSeqPos < toLength)
734    {
735  246 final char c = alignTo.getCharAt(thisSeqPos++);
736  246 if (c != myGapChar)
737    {
738  146 basesWritten++;
739  146 int sourcePosition = basesWritten + toOffset;
740  146 if (sourcePosition < mappedCodonStart)
741    {
742    /*
743    * Found an unmapped (intron) base. First add in any preceding gaps
744    * (if wanted).
745    */
746  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
747    {
748  17 thisAligned.append(trailingCopiedGap.toString());
749  17 intronLength += trailingCopiedGap.length();
750  17 trailingCopiedGap = new StringBuilder();
751    }
752  48 intronLength++;
753  48 inExon = false;
754    }
755    else
756    {
757  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
758  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
759    preserveUnmappedGaps, sourceGapMappedLength, inExon,
760    trailingCopiedGap.length(), intronLength, startOfCodon);
761  215 for (int k = 0; k < gapsToAdd; k++)
762    {
763  117 thisAligned.append(myGapChar);
764    }
765  98 sourceGapMappedLength = 0;
766  98 inExon = true;
767    }
768  146 thisAligned.append(c);
769  146 trailingCopiedGap = new StringBuilder();
770    }
771    else
772    {
773  100 if (inExon && preserveMappedGaps)
774    {
775  32 trailingCopiedGap.append(myGapChar);
776    }
777  68 else if (!inExon && preserveUnmappedGaps)
778    {
779  27 trailingCopiedGap.append(myGapChar);
780    }
781    }
782    }
783    }
784   
785    /*
786    * At end of model aligned sequence. Copy any remaining target sequence, optionally
787    * including (intron) gaps.
788    */
789  129 while (thisSeqPos < toLength)
790    {
791  110 final char c = alignTo.getCharAt(thisSeqPos++);
792  110 if (c != myGapChar || preserveUnmappedGaps)
793    {
794  102 thisAligned.append(c);
795    }
796  110 sourceGapMappedLength--;
797    }
798   
799    /*
800    * finally add gaps to pad for any trailing source gaps or
801    * unmapped characters
802    */
803  19 if (preserveUnmappedGaps)
804    {
805  24 while (sourceGapMappedLength > 0)
806    {
807  12 thisAligned.append(myGapChar);
808  12 sourceGapMappedLength--;
809    }
810    }
811   
812    /*
813    * All done aligning, set the aligned sequence.
814    */
815  19 alignTo.setSequence(new String(thisAligned));
816    }
817   
818    /**
819    * Helper method to work out how many gaps to insert when realigning.
820    *
821    * @param preserveMappedGaps
822    * @param preserveUnmappedGaps
823    * @param sourceGapMappedLength
824    * @param inExon
825    * @param trailingCopiedGap
826    * @param intronLength
827    * @param startOfCodon
828    * @return
829    */
 
830  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
831    boolean preserveUnmappedGaps, int sourceGapMappedLength,
832    boolean inExon, int trailingGapLength, int intronLength,
833    final boolean startOfCodon)
834    {
835  98 int gapsToAdd = 0;
836  98 if (startOfCodon)
837    {
838    /*
839    * Reached start of codon. Ignore trailing gaps in intron unless we are
840    * preserving gaps in both exon and intron. Ignore them anyway if the
841    * protein alignment introduces a gap at least as large as the intronic
842    * region.
843    */
844  40 if (inExon && !preserveMappedGaps)
845    {
846  4 trailingGapLength = 0;
847    }
848  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
849    {
850  19 trailingGapLength = 0;
851    }
852  40 if (inExon)
853    {
854  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
855    }
856    else
857    {
858  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
859    {
860  20 gapsToAdd = sourceGapMappedLength - intronLength;
861    }
862    else
863    {
864  6 gapsToAdd = Math.min(
865    intronLength + trailingGapLength - sourceGapMappedLength,
866    trailingGapLength);
867    }
868    }
869    }
870    else
871    {
872    /*
873    * second or third base of codon; check for any gaps in dna
874    */
875  58 if (!preserveMappedGaps)
876    {
877  32 trailingGapLength = 0;
878    }
879  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
880    }
881  98 return gapsToAdd;
882    }
883   
884    /**
885    * Realigns the given protein to match the alignment of the dna, using codon
886    * mappings to translate aligned codon positions to protein residues.
887    *
888    * @param protein
889    * the alignment whose sequences are realigned by this method
890    * @param dna
891    * the dna alignment whose alignment we are 'copying'
892    * @return the number of sequences that were realigned
893    */
 
894  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
895    {
896  5 if (protein.isNucleotide() || !dna.isNucleotide())
897    {
898  0 jalview.bin.Console
899    .errPrintln("Wrong alignment type in alignProteinAsDna");
900  0 return 0;
901    }
902  5 List<SequenceI> unmappedProtein = new ArrayList<>();
903  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
904    protein, dna, unmappedProtein);
905  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
906    }
907   
908    /**
909    * Realigns the given dna to match the alignment of the protein, using codon
910    * mappings to translate aligned peptide positions to codons.
911    *
912    * Always produces a padded CDS alignment.
913    *
914    * @param dna
915    * the alignment whose sequences are realigned by this method
916    * @param protein
917    * the protein alignment whose alignment we are 'copying'
918    * @return the number of sequences that were realigned
919    */
 
920  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
921    {
922  4 if (protein.isNucleotide() || !dna.isNucleotide())
923    {
924  0 jalview.bin.Console
925    .errPrintln("Wrong alignment type in alignProteinAsDna");
926  0 return 0;
927    }
928    // todo: implement this
929  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
930  4 int alignedCount = 0;
931  4 int width = 0; // alignment width for padding CDS
932  4 for (SequenceI dnaSeq : dna.getSequences())
933    {
934  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
935    dna.getGapCharacter()))
936    {
937  5 alignedCount++;
938    }
939  5 width = Math.max(dnaSeq.getLength(), width);
940    }
941  4 int oldwidth;
942  4 int diff;
943  4 for (SequenceI dnaSeq : dna.getSequences())
944    {
945  5 oldwidth = dnaSeq.getLength();
946  5 diff = width - oldwidth;
947  5 if (diff > 0)
948    {
949  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
950    }
951    }
952  4 return alignedCount;
953    }
954   
955    /**
956    * Helper method to align (if possible) the dna sequence to match the
957    * alignment of a mapped protein sequence. This is currently limited to
958    * handling coding sequence only.
959    *
960    * @param cdsSeq
961    * @param protein
962    * @param mappings
963    * @param gapChar
964    * @return
965    */
 
966  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
967    AlignmentI protein, List<AlignedCodonFrame> mappings,
968    char gapChar)
969    {
970  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
971  5 if (cdsDss == null)
972    {
973  0 System.err
974    .println("alignCdsSequenceAsProtein needs aligned sequence!");
975  0 return false;
976    }
977   
978  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
979    .findMappingsForSequence(cdsSeq, mappings);
980  5 for (AlignedCodonFrame mapping : dnaMappings)
981    {
982  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
983  5 if (peptide != null)
984    {
985  5 final int peptideLength = peptide.getLength();
986  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
987  5 if (map != null)
988    {
989  5 MapList mapList = map.getMap();
990  5 if (map.getTo() == peptide.getDatasetSequence())
991    {
992  5 mapList = mapList.getInverse();
993    }
994  5 final int cdsLength = cdsDss.getLength();
995  5 int mappedFromLength = MappingUtils
996    .getLength(mapList.getFromRanges());
997  5 int mappedToLength = MappingUtils
998    .getLength(mapList.getToRanges());
999  5 boolean addStopCodon = (cdsLength == mappedFromLength
1000    * CODON_LENGTH + CODON_LENGTH)
1001    || (peptide.getDatasetSequence()
1002    .getLength() == mappedFromLength - 1);
1003  5 if (cdsLength != mappedToLength && !addStopCodon)
1004    {
1005  0 jalview.bin.Console.errPrintln(String.format(
1006    "Can't align cds as protein (length mismatch %d/%d): %s",
1007    cdsLength, mappedToLength, cdsSeq.getName()));
1008    }
1009   
1010    /*
1011    * pre-fill the aligned cds sequence with gaps
1012    */
1013  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1014  5 + (addStopCodon ? CODON_LENGTH : 0)];
1015  5 Arrays.fill(alignedCds, gapChar);
1016   
1017    /*
1018    * walk over the aligned peptide sequence and insert mapped
1019    * codons for residues in the aligned cds sequence
1020    */
1021  5 int copiedBases = 0;
1022  5 int cdsStart = cdsDss.getStart();
1023  5 int proteinPos = peptide.getStart() - 1;
1024  5 int cdsCol = 0;
1025   
1026  33 for (int col = 0; col < peptideLength; col++)
1027    {
1028  28 char residue = peptide.getCharAt(col);
1029   
1030  28 if (Comparison.isGap(residue))
1031    {
1032  13 cdsCol += CODON_LENGTH;
1033    }
1034    else
1035    {
1036  15 proteinPos++;
1037  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1038  15 if (codon == null)
1039    {
1040    // e.g. incomplete start codon, X in peptide
1041  0 cdsCol += CODON_LENGTH;
1042    }
1043    else
1044    {
1045  60 for (int j = codon[0]; j <= codon[1]; j++)
1046    {
1047  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1048  45 alignedCds[cdsCol++] = mappedBase;
1049  45 copiedBases++;
1050    }
1051    }
1052    }
1053    }
1054   
1055    /*
1056    * append stop codon if not mapped from protein,
1057    * closing it up to the end of the mapped sequence
1058    */
1059  5 if (copiedBases == cdsLength - CODON_LENGTH)
1060    {
1061  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1062    {
1063  0 if (!Comparison.isGap(alignedCds[i]))
1064    {
1065  0 cdsCol = i + 1; // gap just after end of sequence
1066  0 break;
1067    }
1068    }
1069  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1070    {
1071  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1072    }
1073    }
1074  5 cdsSeq.setSequence(new String(alignedCds));
1075  5 return true;
1076    }
1077    }
1078    }
1079  0 return false;
1080    }
1081   
1082    /**
1083    * Builds a map whose key is an aligned codon position (3 alignment column
1084    * numbers base 0), and whose value is a map from protein sequence to each
1085    * protein's peptide residue for that codon. The map generates an ordering of
1086    * the codons, and allows us to read off the peptides at each position in
1087    * order to assemble 'aligned' protein sequences.
1088    *
1089    * @param protein
1090    * the protein alignment
1091    * @param dna
1092    * the coding dna alignment
1093    * @param unmappedProtein
1094    * any unmapped proteins are added to this list
1095    * @return
1096    */
 
1097  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1098    AlignmentI protein, AlignmentI dna,
1099    List<SequenceI> unmappedProtein)
1100    {
1101    /*
1102    * maintain a list of any proteins with no mappings - these will be
1103    * rendered 'as is' in the protein alignment as we can't align them
1104    */
1105  5 unmappedProtein.addAll(protein.getSequences());
1106   
1107  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1108   
1109    /*
1110    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1111    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1112    * comparator keeps the codon positions ordered.
1113    */
1114  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1115    new CodonComparator());
1116   
1117  5 for (SequenceI dnaSeq : dna.getSequences())
1118    {
1119  30 for (AlignedCodonFrame mapping : mappings)
1120    {
1121  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1122  516 if (prot != null)
1123    {
1124  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1125  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1126    alignedCodons);
1127  32 unmappedProtein.remove(prot);
1128    }
1129    }
1130    }
1131   
1132    /*
1133    * Finally add any unmapped peptide start residues (e.g. for incomplete
1134    * codons) as if at the codon position before the second residue
1135    */
1136    // TODO resolve JAL-2022 so this fudge can be removed
1137  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1138  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1139   
1140  5 return alignedCodons;
1141    }
1142   
1143    /**
1144    * Scans for any protein mapped from position 2 (meaning unmapped start
1145    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1146    * preceding position in the alignment
1147    *
1148    * @param alignedCodons
1149    * the codon-to-peptide map
1150    * @param mappedSequenceCount
1151    * the number of distinct sequences in the map
1152    */
 
1153  5 toggle protected static void addUnmappedPeptideStarts(
1154    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1155    int mappedSequenceCount)
1156    {
1157    // TODO delete this ugly hack once JAL-2022 is resolved
1158    // i.e. we can model startPhase > 0 (incomplete start codon)
1159   
1160  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1161  5 AlignedCodon lastCodon = null;
1162  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1163   
1164  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1165    .entrySet())
1166    {
1167  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1168    .entrySet())
1169    {
1170  10666 SequenceI seq = sequenceCodon.getKey();
1171  10666 if (sequencesChecked.contains(seq))
1172    {
1173  10636 continue;
1174    }
1175  30 sequencesChecked.add(seq);
1176  30 AlignedCodon codon = sequenceCodon.getValue();
1177  30 if (codon.peptideCol > 1)
1178    {
1179  0 jalview.bin.Console.errPrintln(
1180    "Problem mapping protein with >1 unmapped start positions: "
1181    + seq.getName());
1182    }
1183  30 else if (codon.peptideCol == 1)
1184    {
1185    /*
1186    * first position (peptideCol == 0) was unmapped - add it
1187    */
1188  6 if (lastCodon != null)
1189    {
1190  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1191    lastCodon.pos2, lastCodon.pos3,
1192    String.valueOf(seq.getCharAt(0)), 0);
1193  5 toAdd.put(seq, firstPeptide);
1194    }
1195    else
1196    {
1197    /*
1198    * unmapped residue at start of alignment (no prior column) -
1199    * 'insert' at nominal codon [0, 0, 0]
1200    */
1201  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1202    String.valueOf(seq.getCharAt(0)), 0);
1203  1 toAdd.put(seq, firstPeptide);
1204    }
1205    }
1206  30 if (sequencesChecked.size() == mappedSequenceCount)
1207    {
1208    // no need to check past first mapped position in all sequences
1209  5 break;
1210    }
1211    }
1212  1913 lastCodon = entry.getKey();
1213    }
1214   
1215    /*
1216    * add any new codons safely after iterating over the map
1217    */
1218  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1219    {
1220  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1221    startCodon.getKey());
1222    }
1223    }
1224   
1225    /**
1226    * Update the aligned protein sequences to match the codon alignments given in
1227    * the map.
1228    *
1229    * @param protein
1230    * @param alignedCodons
1231    * an ordered map of codon positions (columns), with sequence/peptide
1232    * values present in each column
1233    * @param unmappedProtein
1234    * @return
1235    */
 
1236  5 toggle protected static int alignProteinAs(AlignmentI protein,
1237    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1238    List<SequenceI> unmappedProtein)
1239    {
1240    /*
1241    * prefill peptide sequences with gaps
1242    */
1243  5 int alignedWidth = alignedCodons.size();
1244  5 char[] gaps = new char[alignedWidth];
1245  5 Arrays.fill(gaps, protein.getGapCharacter());
1246  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1247  5 for (SequenceI seq : protein.getSequences())
1248    {
1249  31 if (!unmappedProtein.contains(seq))
1250    {
1251  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1252    }
1253    }
1254   
1255    /*
1256    * Traverse the codons left to right (as defined by CodonComparator)
1257    * and insert peptides in each column where the sequence is mapped.
1258    * This gives a peptide 'alignment' where residues are aligned if their
1259    * corresponding codons occupy the same columns in the cdna alignment.
1260    */
1261  5 int column = 0;
1262  5 for (AlignedCodon codon : alignedCodons.keySet())
1263    {
1264  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1265    .get(codon);
1266  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1267    {
1268  10682 char residue = entry.getValue().product.charAt(0);
1269  10682 peptides.get(entry.getKey())[column] = residue;
1270    }
1271  1914 column++;
1272    }
1273   
1274    /*
1275    * and finally set the constructed sequences
1276    */
1277  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1278    {
1279  30 entry.getKey().setSequence(new String(entry.getValue()));
1280    }
1281   
1282  5 return 0;
1283    }
1284   
1285    /**
1286    * Populate the map of aligned codons by traversing the given sequence
1287    * mapping, locating the aligned positions of mapped codons, and adding those
1288    * positions and their translation products to the map.
1289    *
1290    * @param dna
1291    * the aligned sequence we are mapping from
1292    * @param protein
1293    * the sequence to be aligned to the codons
1294    * @param gapChar
1295    * the gap character in the dna sequence
1296    * @param seqMap
1297    * a mapping to a sequence translation
1298    * @param alignedCodons
1299    * the map we are building up
1300    */
 
1301  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1302    char gapChar, Mapping seqMap,
1303    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1304    {
1305  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1306   
1307    /*
1308    * add codon positions, and their peptide translations, to the alignment
1309    * map, while remembering the first codon mapped
1310    */
1311  10716 while (codons.hasNext())
1312    {
1313  10684 try
1314    {
1315  10684 AlignedCodon codon = codons.next();
1316  10684 addCodonToMap(alignedCodons, codon, protein);
1317    } catch (IncompleteCodonException e)
1318    {
1319    // possible incomplete trailing codon - ignore
1320    } catch (NoSuchElementException e)
1321    {
1322    // possibly peptide lacking STOP
1323    }
1324    }
1325    }
1326   
1327    /**
1328    * Helper method to add a codon-to-peptide entry to the aligned codons map
1329    *
1330    * @param alignedCodons
1331    * @param codon
1332    * @param protein
1333    */
 
1334  10690 toggle protected static void addCodonToMap(
1335    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1336    AlignedCodon codon, SequenceI protein)
1337    {
1338  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1339  10690 if (seqProduct == null)
1340    {
1341  1914 seqProduct = new HashMap<>();
1342  1914 alignedCodons.put(codon, seqProduct);
1343    }
1344  10690 seqProduct.put(protein, codon);
1345    }
1346   
1347    /**
1348    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1349    * between at least one pair of sequences in the two alignments. Currently,
1350    * the logic is:
1351    * <ul>
1352    * <li>One alignment must be nucleotide, and the other protein</li>
1353    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1354    * <li>Mappable means the nucleotide translation matches the protein
1355    * sequence</li>
1356    * <li>The translation may ignore start and stop codons if present in the
1357    * nucleotide</li>
1358    * </ul>
1359    *
1360    * @param al1
1361    * @param al2
1362    * @return
1363    */
 
1364  28 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1365    {
1366  28 if (al1 == null || al2 == null)
1367    {
1368  3 return false;
1369    }
1370   
1371    /*
1372    * Require one nucleotide and one protein
1373    */
1374  25 if (al1.isNucleotide() == al2.isNucleotide())
1375    {
1376  14 return false;
1377    }
1378  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1379  11 AlignmentI protein = dna == al1 ? al2 : al1;
1380  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1381  11 for (SequenceI dnaSeq : dna.getSequences())
1382    {
1383  12 for (SequenceI proteinSeq : protein.getSequences())
1384    {
1385  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1386    {
1387  2 return true;
1388    }
1389    }
1390    }
1391  9 return false;
1392    }
1393   
1394    /**
1395    * Returns true if the dna sequence is mapped, or could be mapped, to the
1396    * protein sequence.
1397    *
1398    * @param dnaSeq
1399    * @param proteinSeq
1400    * @param mappings
1401    * @return
1402    */
 
1403  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1404    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1405    {
1406  12 if (dnaSeq == null || proteinSeq == null)
1407    {
1408  0 return false;
1409    }
1410   
1411  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1412    : dnaSeq.getDatasetSequence();
1413  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1414    ? proteinSeq
1415    : proteinSeq.getDatasetSequence();
1416   
1417  12 for (AlignedCodonFrame mapping : mappings)
1418    {
1419  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1420    {
1421    /*
1422    * already mapped
1423    */
1424  0 return true;
1425    }
1426    }
1427   
1428    /*
1429    * Just try to make a mapping (it is not yet stored), test whether
1430    * successful.
1431    */
1432  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1433    }
1434   
1435    /**
1436    * Finds any reference annotations associated with the sequences in
1437    * sequenceScope, that are not already added to the alignment, and adds them
1438    * to the 'candidates' map. Also populates a lookup table of annotation
1439    * labels, keyed by calcId, for use in constructing tooltips or the like.
1440    *
1441    * @param sequenceScope
1442    * the sequences to scan for reference annotations
1443    * @param labelForCalcId
1444    * (optional) map to populate with label for calcId
1445    * @param candidates
1446    * map to populate with annotations for sequence
1447    * @param al
1448    * the alignment to check for presence of annotations
1449    */
 
1450  98 toggle public static void findAddableReferenceAnnotations(
1451    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1452    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1453    AlignmentI al)
1454    {
1455  98 if (sequenceScope == null)
1456    {
1457  1 return;
1458    }
1459   
1460    /*
1461    * For each sequence in scope, make a list of any annotations on the
1462    * underlying dataset sequence which are not already on the alignment.
1463    *
1464    * Add to a map of { alignmentSequence, <List of annotations to add> }
1465    */
1466  97 for (SequenceI seq : sequenceScope)
1467    {
1468  94 SequenceI dataset = seq.getDatasetSequence();
1469  94 if (dataset == null)
1470    {
1471  0 continue;
1472    }
1473  94 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1474  94 if (datasetAnnotations == null)
1475    {
1476  35 continue;
1477    }
1478  59 final List<AlignmentAnnotation> result = new ArrayList<>();
1479  59 for (AlignmentAnnotation dsann : datasetAnnotations)
1480    {
1481    /*
1482    * Find matching annotations on the alignment. If none is found, then
1483    * add this annotation to the list of 'addable' annotations for this
1484    * sequence.
1485    */
1486  192 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1487    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1488  192 boolean found = false;
1489  192 if (matchedAlignmentAnnotations != null)
1490    {
1491  189 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1492    {
1493  139 if (dsann.description.equals(matched.description))
1494    {
1495  57 found = true;
1496  57 break;
1497    }
1498    }
1499    }
1500  192 if (!found)
1501    {
1502  135 result.add(dsann);
1503  135 if (labelForCalcId != null)
1504    {
1505  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1506    }
1507    }
1508    }
1509    /*
1510    * Save any addable annotations for this sequence
1511    */
1512  59 if (!result.isEmpty())
1513    {
1514  55 candidates.put(seq, result);
1515    }
1516    }
1517    }
1518   
1519    /**
1520    * Adds annotations to the top of the alignment annotations, in the same order
1521    * as their related sequences. If you already have an annotation and want to
1522    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1523    *
1524    * @param annotations
1525    * the annotations to add
1526    * @param alignment
1527    * the alignment to add them to
1528    * @param selectionGroup
1529    * current selection group - may be null, if provided then any added
1530    * annotation will be trimmed to just those columns in the selection
1531    * group
1532    */
 
1533  52 toggle public static void addReferenceAnnotations(
1534    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1535    final AlignmentI alignment, final SequenceGroup selectionGroup)
1536    {
1537  52 for (SequenceI seq : annotations.keySet())
1538    {
1539  51 for (AlignmentAnnotation ann : annotations.get(seq))
1540    {
1541  129 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1542    }
1543    }
1544    }
1545   
 
1546  4 toggle public static boolean isSSAnnotationPresent(
1547    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1548    {
1549   
1550  4 for (SequenceI seq : annotations.keySet())
1551    {
1552  3 if (isSecondaryStructurePresent(
1553    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1554    {
1555  2 return true;
1556    }
1557    }
1558  2 return false;
1559    }
1560   
1561    /**
1562    * Make a copy of a reference annotation {@code ann} and add it to an
1563    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1564    * the extent of {@code selectionGroup}
1565    *
1566    * @param alignment
1567    * @param seq
1568    * @param ann
1569    * @param selectionGroup
1570    * current selection group - may be null, if provided then any added
1571    * annotation will be trimmed to just those columns in the selection
1572    * group
1573    * @return annotation added to {@code seq and {@code alignment}
1574    */
 
1575  133 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1576    final AlignmentI alignment, final SequenceI seq,
1577    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1578    {
1579  133 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1580  133 int startRes = 0;
1581  133 int endRes = ann.annotations.length;
1582  133 if (selectionGroup != null)
1583    {
1584  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1585    seq.findPosition(selectionGroup.getStartRes())));
1586  2 endRes = -1 + Math.min(seq.getEnd(),
1587    seq.findPosition(selectionGroup.getEndRes()));
1588   
1589    }
1590  133 copyAnn.restrict(startRes, endRes + 0);
1591   
1592    /*
1593    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1594    * original annotation is already on the sequence.
1595    */
1596  133 if (!seq.hasAnnotation(ann))
1597    {
1598  133 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1599  133 if (cm != null)
1600    {
1601  33 seq.addContactListFor(copyAnn, cm);
1602    }
1603  133 seq.addAlignmentAnnotation(copyAnn);
1604    }
1605    // adjust for gaps
1606  133 copyAnn.adjustForAlignment();
1607    // add to the alignment and set visible
1608  133 alignment.addAnnotation(copyAnn);
1609  133 copyAnn.visible = true;
1610   
1611  133 return copyAnn;
1612    }
1613   
1614    /**
1615    * Set visibility of alignment annotations of specified types (labels), for
1616    * specified sequences. This supports controls like "Show all secondary
1617    * structure", "Hide all Temp factor", etc.
1618    *
1619    * @al the alignment to scan for annotations
1620    * @param types
1621    * the types (labels) of annotations to be updated
1622    * @param forSequences
1623    * if not null, only annotations linked to one of these sequences are
1624    * in scope for update; if null, acts on all sequence annotations
1625    * @param anyType
1626    * if this flag is true, 'types' is ignored (label not checked)
1627    * @param doShow
1628    * if true, set visibility on, else set off
1629    */
 
1630  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1631    Collection<String> types, List<SequenceI> forSequences,
1632    boolean anyType, boolean doShow)
1633    {
1634  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1635  5 if (anns != null)
1636    {
1637  5 for (AlignmentAnnotation aa : anns)
1638    {
1639  30 if (anyType || types.contains(aa.label))
1640    {
1641  21 if ((aa.sequenceRef != null) && (forSequences == null
1642    || forSequences.contains(aa.sequenceRef)))
1643    {
1644  11 aa.visible = doShow;
1645    }
1646    }
1647    }
1648    }
1649    }
1650   
1651    /**
1652    * Shows or hides auto calculated annotations for a sequence group.
1653    *
1654    * @param al
1655    * The alignment object with the annotations.
1656    * @param type
1657    * The type of annotation to show or hide.
1658    * @param selectedGroup
1659    * The sequence group for which the annotations should be shown or
1660    * hidden.
1661    * @param anyType
1662    * If true, all types of annotations will be shown/hidden.
1663    * @param doShow
1664    * If true, the annotations will be shown; if false, annotations will
1665    * be hidden.
1666    */
 
1667  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1668    AlignmentI al, String type, SequenceGroup selectedGroup,
1669    boolean anyType, boolean doShow)
1670    {
1671    // Get all alignment annotations
1672  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1673   
1674  0 if (anns != null)
1675    {
1676  0 for (AlignmentAnnotation aa : anns)
1677    {
1678    // Check if anyType is true or if the annotation's label contains the
1679    // specified type (currently for secondary structure consensus)
1680  0 if ((anyType && aa.label
1681    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1682    || aa.label.startsWith(type))
1683    {
1684    // If the annotation's group reference is not null and matches the
1685    // selected group, update its visibility.
1686  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1687    {
1688  0 aa.visible = doShow;
1689    }
1690    }
1691    }
1692    }
1693    }
1694   
 
1695  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1696    AlignmentI al, int graphType)
1697    {
1698  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1699  0 if (anns != null)
1700    {
1701  0 for (AlignmentAnnotation aa : anns)
1702    {
1703  0 if (aa.sequenceRef != null && aa.graph == graphType)
1704  0 return aa;
1705    }
1706    }
1707  0 return null;
1708    }
1709   
1710    /**
1711    * Returns true if either sequence has a cross-reference to the other
1712    *
1713    * @param seq1
1714    * @param seq2
1715    * @return
1716    */
 
1717  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1718    {
1719    // Note: moved here from class CrossRef as the latter class has dependencies
1720    // not availability to the applet's classpath
1721  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1722    }
1723   
1724    /**
1725    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1726    * that sequence name is structured as Source|AccessionId.
1727    *
1728    * @param seq1
1729    * @param seq2
1730    * @return
1731    */
 
1732  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1733    {
1734  108 if (seq1 == null || seq2 == null)
1735    {
1736  8 return false;
1737    }
1738  100 String name = seq2.getName();
1739  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1740  100 if (xrefs != null)
1741    {
1742  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1743    {
1744  24 DBRefEntry xref = xrefs.get(ix);
1745  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1746    // case-insensitive test, consistent with DBRefEntry.equalRef()
1747  24 if (xrefName.equalsIgnoreCase(name))
1748    {
1749  12 return true;
1750    }
1751    }
1752    }
1753  88 return false;
1754    }
1755   
1756    /**
1757    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1758    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1759    * added to the original alignment's dataset, which is shared by the new
1760    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1761    * added to the alignment dataset.
1762    *
1763    * @param dna
1764    * aligned nucleotide (dna or cds) sequences
1765    * @param dataset
1766    * the alignment dataset the sequences belong to
1767    * @param products
1768    * (optional) to restrict results to CDS that map to specified
1769    * protein products
1770    * @return an alignment whose sequences are the cds-only parts of the dna
1771    * sequences (or null if no mappings are found)
1772    */
 
1773  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1774    AlignmentI dataset, SequenceI[] products)
1775    {
1776  6 if (dataset == null || dataset.getDataset() != null)
1777    {
1778  0 throw new IllegalArgumentException(
1779    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1780    }
1781  6 List<SequenceI> foundSeqs = new ArrayList<>();
1782  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1783  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1784  6 HashSet<SequenceI> productSeqs = null;
1785  6 if (products != null)
1786    {
1787  3 productSeqs = new HashSet<>();
1788  3 for (SequenceI seq : products)
1789    {
1790  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1791    : seq.getDatasetSequence());
1792    }
1793    }
1794   
1795    /*
1796    * Construct CDS sequences from mappings on the alignment dataset.
1797    * The logic is:
1798    * - find the protein product(s) mapped to from each dna sequence
1799    * - if the mapping covers the whole dna sequence (give or take start/stop
1800    * codon), take the dna as the CDS sequence
1801    * - else search dataset mappings for a suitable dna sequence, i.e. one
1802    * whose whole sequence is mapped to the protein
1803    * - if no sequence found, construct one from the dna sequence and mapping
1804    * (and add it to dataset so it is found if this is repeated)
1805    */
1806  6 for (SequenceI dnaSeq : dna)
1807    {
1808  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1809    : dnaSeq.getDatasetSequence();
1810   
1811  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1812    .findMappingsForSequence(dnaSeq, mappings);
1813  52 for (AlignedCodonFrame mapping : seqMappings)
1814    {
1815  42 List<Mapping> mappingsFromSequence = mapping
1816    .getMappingsFromSequence(dnaSeq);
1817   
1818  42 for (Mapping aMapping : mappingsFromSequence)
1819    {
1820  44 MapList mapList = aMapping.getMap();
1821  44 if (mapList.getFromRatio() == 1)
1822    {
1823    /*
1824    * not a dna-to-protein mapping (likely dna-to-cds)
1825    */
1826  11 continue;
1827    }
1828   
1829    /*
1830    * skip if mapping is not to one of the target set of proteins
1831    */
1832  33 SequenceI proteinProduct = aMapping.getTo();
1833  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1834    {
1835  2 continue;
1836    }
1837   
1838    /*
1839    * try to locate the CDS from the dataset mappings;
1840    * guard against duplicate results (for the case that protein has
1841    * dbrefs to both dna and cds sequences)
1842    */
1843  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1844    seqMappings, aMapping);
1845  31 if (cdsSeq != null)
1846    {
1847  11 if (!foundSeqs.contains(cdsSeq))
1848    {
1849  11 foundSeqs.add(cdsSeq);
1850  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1851  11 cdsSeqs.add(derivedSequence);
1852  11 if (!dataset.getSequences().contains(cdsSeq))
1853    {
1854  0 dataset.addSequence(cdsSeq);
1855    }
1856    }
1857  11 continue;
1858    }
1859   
1860    /*
1861    * didn't find mapped CDS sequence - construct it and add
1862    * its dataset sequence to the dataset
1863    */
1864  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1865    dataset).deriveSequence();
1866    // cdsSeq has a name constructed as CDS|<dbref>
1867    // <dbref> will be either the accession for the coding sequence,
1868    // marked in the /via/ dbref to the protein product accession
1869    // or it will be the original nucleotide accession.
1870  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1871   
1872  20 cdsSeqs.add(cdsSeq);
1873   
1874    /*
1875    * build the mapping from CDS to protein
1876    */
1877  20 List<int[]> cdsRange = Collections
1878    .singletonList(new int[]
1879    { cdsSeq.getStart(),
1880    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1881  20 MapList cdsToProteinMap = new MapList(cdsRange,
1882    mapList.getToRanges(), mapList.getFromRatio(),
1883    mapList.getToRatio());
1884   
1885  20 if (!dataset.getSequences().contains(cdsSeqDss))
1886    {
1887    /*
1888    * if this sequence is a newly created one, add it to the dataset
1889    * and made a CDS to protein mapping (if sequence already exists,
1890    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1891    */
1892  20 dataset.addSequence(cdsSeqDss);
1893  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1894  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1895    cdsToProteinMap);
1896   
1897    /*
1898    * guard against duplicating the mapping if repeating this action
1899    */
1900  20 if (!mappings.contains(cdsToProteinMapping))
1901    {
1902  20 mappings.add(cdsToProteinMapping);
1903    }
1904    }
1905   
1906  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1907    proteinProduct, aMapping);
1908    /*
1909    * add another mapping from original 'from' range to CDS
1910    */
1911  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1912  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1913    cdsRange, 1, 1);
1914  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1915    dnaToCdsMap);
1916  20 if (!mappings.contains(dnaToCdsMapping))
1917    {
1918  20 mappings.add(dnaToCdsMapping);
1919    }
1920   
1921    /*
1922    * transfer dna chromosomal loci (if known) to the CDS
1923    * sequence (via the mapping)
1924    */
1925  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1926  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1927   
1928    /*
1929    * add DBRef with mapping from protein to CDS
1930    * (this enables Get Cross-References from protein alignment)
1931    * This is tricky because we can't have two DBRefs with the
1932    * same source and accession, so need a different accession for
1933    * the CDS from the dna sequence
1934    */
1935   
1936    // specific use case:
1937    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1938    // ENSG02, ENSG03, with transcripts and products similarly named.
1939    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1940   
1941    // JBPNote: ?? can't actually create an example that demonstrates we
1942    // need to
1943    // synthesize an xref.
1944   
1945  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1946  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1947    {
1948  13 DBRefEntry primRef = primrefs.get(ip);
1949    /*
1950    * create a cross-reference from CDS to the source sequence's
1951    * primary reference and vice versa
1952    */
1953  13 String source = primRef.getSource();
1954  13 String version = primRef.getVersion();
1955  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1956    source + ":" + version, primRef.getAccessionId());
1957  13 cdsCrossRef
1958    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1959  13 cdsSeqDss.addDBRef(cdsCrossRef);
1960   
1961  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1962    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1963    // problem here is that the cross-reference is synthesized -
1964    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1965    // 'CDS|emblcdsacc'
1966    // assuming cds version same as dna ?!?
1967   
1968  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1969    cdsSeq.getName());
1970    //
1971  13 proteinToCdsRef.setMap(
1972    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1973  13 proteinProduct.addDBRef(proteinToCdsRef);
1974    }
1975    /*
1976    * transfer any features on dna that overlap the CDS
1977    */
1978  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1979    SequenceOntologyI.CDS);
1980    }
1981    }
1982    }
1983   
1984  6 AlignmentI cds = new Alignment(
1985    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1986  6 cds.setDataset(dataset);
1987   
1988  6 return cds;
1989    }
1990   
1991    /**
1992    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1993    * toSeq, mediated by the given mapping between the sequences
1994    *
1995    * @param fromSeq
1996    * @param targetToFrom
1997    * Map
1998    * @param targetSeq
1999    */
 
2000  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
2001    MapList targetToFrom, SequenceI targetSeq)
2002    {
2003  23 if (targetSeq.getGeneLoci() != null)
2004    {
2005    // already have - don't override
2006  1 return;
2007    }
2008  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
2009  22 if (fromLoci == null)
2010    {
2011  10 return;
2012    }
2013   
2014  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2015   
2016  12 if (newMap != null)
2017    {
2018  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2019    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2020    }
2021    }
2022   
2023    /**
2024    * A helper method that finds a CDS sequence in the alignment dataset that is
2025    * mapped to the given protein sequence, and either is, or has a mapping from,
2026    * the given dna sequence.
2027    *
2028    * @param mappings
2029    * set of all mappings on the dataset
2030    * @param dnaSeq
2031    * a dna (or cds) sequence we are searching from
2032    * @param seqMappings
2033    * the set of mappings involving dnaSeq
2034    * @param aMapping
2035    * a transcript-to-peptide mapping
2036    * @return
2037    */
 
2038  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2039    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2040    Mapping aMapping)
2041    {
2042    /*
2043    * TODO a better dna-cds-protein mapping data representation to allow easy
2044    * navigation; until then this clunky looping around lists of mappings
2045    */
2046  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2047    : dnaSeq.getDatasetSequence();
2048  38 SequenceI proteinProduct = aMapping.getTo();
2049   
2050    /*
2051    * is this mapping from the whole dna sequence (i.e. CDS)?
2052    * allowing for possible stop codon on dna but not peptide
2053    */
2054  38 int mappedFromLength = MappingUtils
2055    .getLength(aMapping.getMap().getFromRanges());
2056  38 int dnaLength = seqDss.getLength();
2057  38 if (mappedFromLength == dnaLength
2058    || mappedFromLength == dnaLength - CODON_LENGTH)
2059    {
2060    /*
2061    * if sequence has CDS features, this is a transcript with no UTR
2062    * - do not take this as the CDS sequence! (JAL-2789)
2063    */
2064  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2065    .isEmpty())
2066    {
2067  1 return seqDss;
2068    }
2069    }
2070   
2071    /*
2072    * looks like we found the dna-to-protein mapping; search for the
2073    * corresponding cds-to-protein mapping
2074    */
2075  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2076    .findMappingsForSequence(proteinProduct, mappings);
2077  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2078    {
2079  52 for (SequenceToSequenceMapping map : acf.getMappings())
2080    {
2081  276 Mapping mapping = map.getMapping();
2082  276 if (mapping != aMapping
2083    && mapping.getMap().getFromRatio() == CODON_LENGTH
2084    && proteinProduct == mapping.getTo()
2085    && seqDss != map.getFromSeq())
2086    {
2087  15 mappedFromLength = MappingUtils
2088    .getLength(mapping.getMap().getFromRanges());
2089  15 if (mappedFromLength == map.getFromSeq().getLength())
2090    {
2091    /*
2092    * found a 3:1 mapping to the protein product which covers
2093    * the whole dna sequence i.e. is from CDS; finally check the CDS
2094    * is mapped from the given dna start sequence
2095    */
2096  15 SequenceI cdsSeq = map.getFromSeq();
2097    // todo this test is weak if seqMappings contains multiple mappings;
2098    // we get away with it if transcript:cds relationship is 1:1
2099  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2100    .findMappingsForSequence(cdsSeq, seqMappings);
2101  15 if (!dnaToCdsMaps.isEmpty())
2102    {
2103  13 return cdsSeq;
2104    }
2105    }
2106    }
2107    }
2108    }
2109  24 return null;
2110    }
2111   
2112    /**
2113    * Helper method that makes a CDS sequence as defined by the mappings from the
2114    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2115    * forward or reverse strand).
2116    *
2117    * @param seq
2118    * @param mapping
2119    * @param dataset
2120    * - existing dataset. We check for sequences that look like the CDS
2121    * we are about to construct, if one exists already, then we will
2122    * just return that one.
2123    * @return CDS sequence (as a dataset sequence)
2124    */
 
2125  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2126    AlignmentI dataset)
2127    {
2128    /*
2129    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2130    * if set (e.g. EMBL protein_id), else sequence name appended
2131    */
2132  20 String mapFromId = mapping.getMappedFromId();
2133  20 final String seqId = "CDS|"
2134  20 + (mapFromId != null ? mapFromId : seq.getName());
2135   
2136  20 SequenceI newSeq = null;
2137   
2138    /*
2139    * construct CDS sequence by splicing mapped from ranges
2140    */
2141  20 char[] seqChars = seq.getSequence();
2142  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2143  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2144  20 char[] newSeqChars = new char[cdsWidth];
2145   
2146  20 int newPos = 0;
2147  20 for (int[] range : fromRanges)
2148    {
2149  32 if (range[0] <= range[1])
2150    {
2151    // forward strand mapping - just copy the range
2152  32 int length = range[1] - range[0] + 1;
2153  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2154    length);
2155  32 newPos += length;
2156    }
2157    else
2158    {
2159    // reverse strand mapping - copy and complement one by one
2160  0 for (int i = range[0]; i >= range[1]; i--)
2161    {
2162  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2163    }
2164    }
2165   
2166  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2167    }
2168   
2169  20 if (dataset != null)
2170    {
2171  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2172  20 if (matches != null)
2173    {
2174  20 boolean matched = false;
2175  20 for (SequenceI mtch : matches)
2176    {
2177  3 if (mtch.getStart() != newSeq.getStart())
2178    {
2179  0 continue;
2180    }
2181  3 if (mtch.getEnd() != newSeq.getEnd())
2182    {
2183  0 continue;
2184    }
2185  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2186    {
2187  3 continue;
2188    }
2189  0 if (!matched)
2190    {
2191  0 matched = true;
2192  0 newSeq = mtch;
2193    }
2194    else
2195    {
2196  0 Console.error(
2197    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2198    + mtch.toString());
2199    }
2200    }
2201    }
2202    }
2203    // newSeq.setDescription(mapFromId);
2204   
2205  20 return newSeq;
2206    }
2207   
2208    /**
2209    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2210    * the given mapping.
2211    *
2212    * @param cdsSeq
2213    * @param contig
2214    * @param proteinProduct
2215    * @param mapping
2216    * @return list of DBRefEntrys added
2217    */
 
2218  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2219    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2220    {
2221   
2222    // gather direct refs from contig congruent with mapping
2223  20 List<DBRefEntry> direct = new ArrayList<>();
2224  20 HashSet<String> directSources = new HashSet<>();
2225   
2226  20 List<DBRefEntry> refs = contig.getDBRefs();
2227  20 if (refs != null)
2228    {
2229  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2230    {
2231  279 DBRefEntry dbr = refs.get(ib);
2232  279 MapList map;
2233  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2234    {
2235    // check if map is the CDS mapping
2236  24 if (mapping.getMap().equals(map))
2237    {
2238  24 direct.add(dbr);
2239  24 directSources.add(dbr.getSource());
2240    }
2241    }
2242    }
2243    }
2244  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2245    proteinProduct.getDBRefs(),
2246    directSources.toArray(new String[0]));
2247  20 List<DBRefEntry> propagated = new ArrayList<>();
2248   
2249    // and generate appropriate mappings
2250  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2251    {
2252  24 DBRefEntry cdsref = direct.get(ic);
2253  24 Mapping m = cdsref.getMap();
2254    // clone maplist and mapping
2255  24 MapList cdsposmap = new MapList(
2256    Arrays.asList(new int[][]
2257    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2258    m.getMap().getToRanges(), 3, 1);
2259  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2260   
2261    // create dbref
2262  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2263    cdsref.getVersion(), cdsref.getAccessionId(),
2264    new Mapping(cdsmap.getTo(), cdsposmap));
2265   
2266    // and see if we can map to the protein product for this mapping.
2267    // onSource is the filtered set of accessions on protein that we are
2268    // tranferring, so we assume accession is the same.
2269  24 if (cdsmap.getTo() == null && onSource != null)
2270    {
2271  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2272    cdsref.getAccessionId());
2273  2 if (sourceRefs != null)
2274    {
2275  2 for (DBRefEntry srcref : sourceRefs)
2276    {
2277  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2278    {
2279    // we have found a complementary dbref on the protein product, so
2280    // update mapping's getTo
2281  2 newref.getMap().setTo(proteinProduct);
2282    }
2283    }
2284    }
2285    }
2286  24 cdsSeq.addDBRef(newref);
2287  24 propagated.add(newref);
2288    }
2289  20 return propagated;
2290    }
2291   
2292    /**
2293    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2294    * feature start/end ranges, optionally omitting specified feature types.
2295    * Returns the number of features copied.
2296    *
2297    * @param fromSeq
2298    * @param toSeq
2299    * @param mapping
2300    * the mapping from 'fromSeq' to 'toSeq'
2301    * @param select
2302    * if not null, only features of this type are copied (including
2303    * subtypes in the Sequence Ontology)
2304    * @param omitting
2305    */
 
2306  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2307    MapList mapping, String select, String... omitting)
2308    {
2309  23 SequenceI copyTo = toSeq;
2310  43 while (copyTo.getDatasetSequence() != null)
2311    {
2312  20 copyTo = copyTo.getDatasetSequence();
2313    }
2314  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2315    {
2316  0 return 0; // shared dataset sequence
2317    }
2318   
2319    /*
2320    * get features, optionally restricted by an ontology term
2321    */
2322  23 List<SequenceFeature> sfs = select == null
2323    ? fromSeq.getFeatures().getPositionalFeatures()
2324    : fromSeq.getFeatures().getFeaturesByOntology(select);
2325   
2326  23 int count = 0;
2327  23 for (SequenceFeature sf : sfs)
2328    {
2329  9610 String type = sf.getType();
2330  9610 boolean omit = false;
2331  9610 for (String toOmit : omitting)
2332    {
2333  9603 if (type.equals(toOmit))
2334    {
2335  134 omit = true;
2336    }
2337    }
2338  9610 if (omit)
2339    {
2340  134 continue;
2341    }
2342   
2343    /*
2344    * locate the mapped range - null if either start or end is
2345    * not mapped (no partial overlaps are calculated)
2346    */
2347  9476 int start = sf.getBegin();
2348  9476 int end = sf.getEnd();
2349  9476 int[] mappedTo = mapping.locateInTo(start, end);
2350    /*
2351    * if whole exon range doesn't map, try interpreting it
2352    * as 5' or 3' exon overlapping the CDS range
2353    */
2354  9476 if (mappedTo == null)
2355    {
2356  4447 mappedTo = mapping.locateInTo(end, end);
2357  4447 if (mappedTo != null)
2358    {
2359    /*
2360    * end of exon is in CDS range - 5' overlap
2361    * to a range from the start of the peptide
2362    */
2363  0 mappedTo[0] = 1;
2364    }
2365    }
2366  9476 if (mappedTo == null)
2367    {
2368  4447 mappedTo = mapping.locateInTo(start, start);
2369  4447 if (mappedTo != null)
2370    {
2371    /*
2372    * start of exon is in CDS range - 3' overlap
2373    * to a range up to the end of the peptide
2374    */
2375  0 mappedTo[1] = toSeq.getLength();
2376    }
2377    }
2378  9476 if (mappedTo != null)
2379    {
2380  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2381  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2382  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2383    sf.getFeatureGroup(), sf.getScore());
2384  5029 copyTo.addSequenceFeature(copy);
2385  5029 count++;
2386    }
2387    }
2388  23 return count;
2389    }
2390   
2391    /**
2392    * Returns a mapping from dna to protein by inspecting sequence features of
2393    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2394    * length is 3 times the peptide length (optionally after dropping a trailing
2395    * stop codon). This method does not check whether the CDS nucleotide sequence
2396    * translates to the peptide sequence.
2397    *
2398    * @param dnaSeq
2399    * @param proteinSeq
2400    * @return
2401    */
 
2402  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2403    SequenceI proteinSeq)
2404    {
2405  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2406  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2407   
2408    /*
2409    * if not a whole number of codons, truncate mapping
2410    */
2411  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2412  22 if (codonRemainder > 0)
2413    {
2414  2 mappedDnaLength -= codonRemainder;
2415  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2416    }
2417   
2418  22 int proteinLength = proteinSeq.getLength();
2419  22 int proteinStart = proteinSeq.getStart();
2420  22 int proteinEnd = proteinSeq.getEnd();
2421   
2422    /*
2423    * incomplete start codon may mean X at start of peptide
2424    * we ignore both for mapping purposes
2425    */
2426  22 if (proteinSeq.getCharAt(0) == 'X')
2427    {
2428    // todo JAL-2022 support startPhase > 0
2429  1 proteinStart++;
2430  1 proteinLength--;
2431    }
2432  22 List<int[]> proteinRange = new ArrayList<>();
2433   
2434    /*
2435    * dna length should map to protein (or protein plus stop codon)
2436    */
2437  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2438  22 if (codesForResidues == (proteinLength + 1))
2439    {
2440    // assuming extra codon is for STOP and not in peptide
2441    // todo: check trailing codon is indeed a STOP codon
2442  2 codesForResidues--;
2443  2 mappedDnaLength -= CODON_LENGTH;
2444  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2445    }
2446   
2447  22 if (codesForResidues == proteinLength)
2448    {
2449  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2450  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2451    }
2452  18 return null;
2453    }
2454   
2455    /**
2456    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2457    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2458    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2459    * position order, so this method is only valid for linear CDS in the same
2460    * sense as the protein product.
2461    *
2462    * @param dnaSeq
2463    * @return
2464    */
 
2465  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2466    {
2467  24 List<int[]> result = new ArrayList<>();
2468   
2469  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2470    .getFeaturesByOntology(SequenceOntologyI.CDS);
2471  24 if (sfs.isEmpty())
2472    {
2473  16 return result;
2474    }
2475  8 SequenceFeatures.sortFeatures(sfs, true);
2476   
2477  8 for (SequenceFeature sf : sfs)
2478    {
2479  16 int phase = 0;
2480  16 try
2481    {
2482  16 String s = sf.getPhase();
2483  16 if (s != null)
2484    {
2485  2 phase = Integer.parseInt(s);
2486    }
2487    } catch (NumberFormatException e)
2488    {
2489    // leave as zero
2490    }
2491    /*
2492    * phase > 0 on first codon means 5' incomplete - skip to the start
2493    * of the next codon; example ENST00000496384
2494    */
2495  16 int begin = sf.getBegin();
2496  16 int end = sf.getEnd();
2497  16 if (result.isEmpty() && phase > 0)
2498    {
2499  2 begin += phase;
2500  2 if (begin > end)
2501    {
2502    // shouldn't happen!
2503  0 System.err
2504    .println("Error: start phase extends beyond start CDS in "
2505    + dnaSeq.getName());
2506    }
2507    }
2508  16 result.add(new int[] { begin, end });
2509    }
2510   
2511    /*
2512    * Finally sort ranges by start position. This avoids a dependency on
2513    * keeping features in order on the sequence (if they are in order anyway,
2514    * the sort will have almost no work to do). The implicit assumption is CDS
2515    * ranges are assembled in order. Other cases should not use this method,
2516    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2517    */
2518  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2519  8 return result;
2520    }
2521   
2522    /**
2523    * Makes an alignment with a copy of the given sequences, adding in any
2524    * non-redundant sequences which are mapped to by the cross-referenced
2525    * sequences.
2526    *
2527    * @param seqs
2528    * @param xrefs
2529    * @param dataset
2530    * the alignment dataset shared by the new copy
2531    * @return
2532    */
 
2533  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2534    SequenceI[] xrefs, AlignmentI dataset)
2535    {
2536  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2537  0 copy.setDataset(dataset);
2538  0 boolean isProtein = !copy.isNucleotide();
2539  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2540  0 if (xrefs != null)
2541    {
2542    // BH 2019.01.25 recoded to remove iterators
2543   
2544  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2545    {
2546  0 SequenceI xref = xrefs[ix];
2547  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2548  0 if (dbrefs != null)
2549    {
2550  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2551    {
2552  0 DBRefEntry dbref = dbrefs.get(ir);
2553  0 Mapping map = dbref.getMap();
2554  0 SequenceI mto;
2555  0 if (map == null || (mto = map.getTo()) == null
2556    || mto.isProtein() != isProtein)
2557    {
2558  0 continue;
2559    }
2560  0 SequenceI mappedTo = mto;
2561  0 SequenceI match = matcher.findIdMatch(mappedTo);
2562  0 if (match == null)
2563    {
2564  0 matcher.add(mappedTo);
2565  0 copy.addSequence(mappedTo);
2566    }
2567    }
2568    }
2569    }
2570    }
2571  0 return copy;
2572    }
2573   
2574    /**
2575    * Try to align sequences in 'unaligned' to match the alignment of their
2576    * mapped regions in 'aligned'. For example, could use this to align CDS
2577    * sequences which are mapped to their parent cDNA sequences.
2578    *
2579    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2580    * dna-to-protein or protein-to-dna use alternative methods.
2581    *
2582    * @param unaligned
2583    * sequences to be aligned
2584    * @param aligned
2585    * holds aligned sequences and their mappings
2586    * @return
2587    */
 
2588  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2589    {
2590    /*
2591    * easy case - aligning a copy of aligned sequences
2592    */
2593  4 if (alignAsSameSequences(unaligned, aligned))
2594    {
2595  0 return unaligned.getHeight();
2596    }
2597   
2598    /*
2599    * fancy case - aligning via mappings between sequences
2600    */
2601  4 List<SequenceI> unmapped = new ArrayList<>();
2602  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2603    unaligned, aligned, unmapped);
2604  4 int width = columnMap.size();
2605  4 char gap = unaligned.getGapCharacter();
2606  4 int realignedCount = 0;
2607    // TODO: verify this loop scales sensibly for very wide/high alignments
2608   
2609  4 for (SequenceI seq : unaligned.getSequences())
2610    {
2611  26 if (!unmapped.contains(seq))
2612    {
2613  26 char[] newSeq = new char[width];
2614  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2615    // Integer iteration below
2616  26 int newCol = 0;
2617  26 int lastCol = 0;
2618   
2619    /*
2620    * traverse the map to find columns populated
2621    * by our sequence
2622    */
2623  26 for (Integer column : columnMap.keySet())
2624    {
2625  58976 Character c = columnMap.get(column).get(seq);
2626  58976 if (c != null)
2627    {
2628    /*
2629    * sequence has a character at this position
2630    *
2631    */
2632  31986 newSeq[newCol] = c;
2633  31986 lastCol = newCol;
2634    }
2635  58976 newCol++;
2636    }
2637   
2638    /*
2639    * trim trailing gaps
2640    */
2641  26 if (lastCol < width)
2642    {
2643  26 char[] tmp = new char[lastCol + 1];
2644  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2645  26 newSeq = tmp;
2646    }
2647    // TODO: optimise SequenceI to avoid char[]->String->char[]
2648  26 seq.setSequence(String.valueOf(newSeq));
2649  26 realignedCount++;
2650    }
2651    }
2652  4 return realignedCount;
2653    }
2654   
2655    /**
2656    * If unaligned and aligned sequences share the same dataset sequences, then
2657    * simply copies the aligned sequences to the unaligned sequences and returns
2658    * true; else returns false
2659    *
2660    * @param unaligned
2661    * - sequences to be aligned based on aligned
2662    * @param aligned
2663    * - 'guide' alignment containing sequences derived from same dataset
2664    * as unaligned
2665    * @return
2666    */
 
2667  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2668    AlignmentI aligned)
2669    {
2670  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2671    {
2672  0 return false; // should only pass alignments with datasets here
2673    }
2674   
2675    // map from dataset sequence to alignment sequence(s)
2676  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2677  8 for (SequenceI seq : aligned.getSequences())
2678    {
2679  59 SequenceI ds = seq.getDatasetSequence();
2680  59 if (alignedDatasets.get(ds) == null)
2681    {
2682  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2683    }
2684  59 alignedDatasets.get(ds).add(seq);
2685    }
2686   
2687    /*
2688    * first pass - check whether all sequences to be aligned share a
2689    * dataset sequence with an aligned sequence; also note the leftmost
2690    * ungapped column from which to copy
2691    */
2692  8 int leftmost = Integer.MAX_VALUE;
2693  8 for (SequenceI seq : unaligned.getSequences())
2694    {
2695  14 final SequenceI ds = seq.getDatasetSequence();
2696  14 if (!alignedDatasets.containsKey(ds))
2697    {
2698  5 return false;
2699    }
2700  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2701  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2702  9 leftmost = Math.min(leftmost, startCol);
2703    }
2704   
2705    /*
2706    * second pass - copy aligned sequences;
2707    * heuristic rule: pair off sequences in order for the case where
2708    * more than one shares the same dataset sequence
2709    */
2710  3 final char gapCharacter = aligned.getGapCharacter();
2711  3 for (SequenceI seq : unaligned.getSequences())
2712    {
2713  7 List<SequenceI> alignedSequences = alignedDatasets
2714    .get(seq.getDatasetSequence());
2715  7 if (alignedSequences.isEmpty())
2716    {
2717    /*
2718    * defensive check - shouldn't happen! (JAL-3536)
2719    */
2720  0 continue;
2721    }
2722  7 SequenceI alignedSeq = alignedSequences.get(0);
2723   
2724    /*
2725    * gap fill for leading (5') UTR if any
2726    */
2727    // TODO this copies intron columns - wrong!
2728  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2729  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2730  7 char[] seqchars = new char[endCol - leftmost + 1];
2731  7 Arrays.fill(seqchars, gapCharacter);
2732  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2733  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2734    toCopy.length);
2735  7 seq.setSequence(String.valueOf(seqchars));
2736  7 if (alignedSequences.size() > 0)
2737    {
2738    // pop off aligned sequences (except the last one)
2739  7 alignedSequences.remove(0);
2740    }
2741    }
2742   
2743    /*
2744    * finally remove gapped columns (e.g. introns)
2745    */
2746  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2747    unaligned.getWidth() - 1, unaligned);
2748   
2749  3 return true;
2750    }
2751   
2752    /**
2753    * Returns a map whose key is alignment column number (base 1), and whose
2754    * values are a map of sequence characters in that column.
2755    *
2756    * @param unaligned
2757    * @param aligned
2758    * @param unmapped
2759    * @return
2760    */
 
2761  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2762    AlignmentI unaligned, AlignmentI aligned,
2763    List<SequenceI> unmapped)
2764    {
2765    /*
2766    * Map will hold, for each aligned column position, a map of
2767    * {unalignedSequence, characterPerSequence} at that position.
2768    * TreeMap keeps the entries in ascending column order.
2769    */
2770  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2771   
2772    /*
2773    * record any sequences that have no mapping so can't be realigned
2774    */
2775  4 unmapped.addAll(unaligned.getSequences());
2776   
2777  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2778   
2779  4 for (SequenceI seq : unaligned.getSequences())
2780    {
2781  26 for (AlignedCodonFrame mapping : mappings)
2782    {
2783  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2784  510 if (fromSeq != null)
2785    {
2786  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2787  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2788    {
2789  26 unmapped.remove(seq);
2790    }
2791    }
2792    }
2793    }
2794  4 return map;
2795    }
2796   
2797    /**
2798    * Helper method that adds to a map the mapped column positions of a sequence.
2799    * <br>
2800    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2801    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2802    * sequence.
2803    *
2804    * @param seq
2805    * the sequence whose column positions we are recording
2806    * @param fromSeq
2807    * a sequence that is mapped to the first sequence
2808    * @param seqMap
2809    * the mapping from 'fromSeq' to 'seq'
2810    * @param map
2811    * a map to add the column positions (in fromSeq) of the mapped
2812    * positions of seq
2813    * @return
2814    */
 
2815  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2816    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2817    {
2818  28 if (seqMap == null)
2819    {
2820  0 return false;
2821    }
2822   
2823    /*
2824    * invert mapping if it is from unaligned to aligned sequence
2825    */
2826  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2827    {
2828  0 seqMap = new Mapping(seq.getDatasetSequence(),
2829    seqMap.getMap().getInverse());
2830    }
2831   
2832  28 int toStart = seq.getStart();
2833   
2834    /*
2835    * traverse [start, end, start, end...] ranges in fromSeq
2836    */
2837  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2838    {
2839  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2840    {
2841  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2842   
2843    /*
2844    * find the range mapped to (sequence positions base 1)
2845    */
2846  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2847    fromRange[i + 1]);
2848  31 if (range == null)
2849    {
2850  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2851    + " from " + fromSeq.getName());
2852  0 return false;
2853    }
2854  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2855  31 int mappedCharPos = range[0];
2856   
2857    /*
2858    * walk over the 'from' aligned sequence in forward or reverse
2859    * direction; when a non-gap is found, record the column position
2860    * of the next character of the mapped-to sequence; stop when all
2861    * the characters of the range have been counted
2862    */
2863  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2864    && fromCol >= 0)
2865    {
2866  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2867    {
2868    /*
2869    * mapped from sequence has a character in this column
2870    * record the column position for the mapped to character
2871    */
2872  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2873  31998 if (seqsMap == null)
2874    {
2875  5398 seqsMap = new HashMap<>();
2876  5398 map.put(fromCol, seqsMap);
2877    }
2878  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2879  31998 mappedCharPos++;
2880    }
2881  2794243 fromCol += (forward ? 1 : -1);
2882    }
2883    }
2884    }
2885  28 return true;
2886    }
2887   
2888    // strictly temporary hack until proper criteria for aligning protein to cds
2889    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2890  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2891    {
2892  4 for (SequenceI seq : alignment.getSequences())
2893    {
2894  88 String name = seq.getName();
2895  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2896    {
2897  0 return false;
2898    }
2899    }
2900  4 return true;
2901    }
2902   
2903    /**
2904    * This method filters and returns the secondary structure annotations
2905    * in an array of annotations
2906    *
2907    * @param annotations
2908    * @return array of secondary structure annotations
2909    */
 
2910  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnots(
2911    AlignmentAnnotation[] annotations)
2912    {
2913  0 List<AlignmentAnnotation> ssAnnotations = new ArrayList<>();
2914  0 if (annotations == null || annotations.length == 0)
2915    {
2916  0 return null;
2917    }
2918   
2919  0 synchronized (annotations)
2920    {
2921  0 for (AlignmentAnnotation aa : annotations)
2922    {
2923  0 if (aa == null)
2924    {
2925  0 continue;
2926    }
2927   
2928  0 if (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2929    .containsKey(aa.label))
2930    {
2931  0 ssAnnotations.add(aa);
2932    }
2933    }
2934    }
2935  0 return ssAnnotations;
2936   
2937    }
2938   
 
2939  19 toggle public static boolean isSecondaryStructurePresent(
2940    AlignmentAnnotation[] annotations)
2941    {
2942  19 boolean ssPresent = false;
2943  19 if (annotations == null || annotations.length == 0)
2944    {
2945  8 return false;
2946    }
2947   
2948  11 synchronized (annotations)
2949    {
2950  11 for (AlignmentAnnotation aa : annotations)
2951    {
2952  12 if (aa == null)
2953    {
2954  0 continue;
2955    }
2956    // TODO: @RENIA decide what exactly we should be testing for - not
2957    // strings/labels
2958  12 if (// aa.isValidStruc() || aa.hasIcons ||
2959  12 (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2960    .containsKey(aa.label)))
2961    {
2962  9 ssPresent = true;
2963  9 break;
2964    }
2965    }
2966    }
2967  11 return ssPresent;
2968   
2969    }
2970   
 
2971  4 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2972    {
2973   
2974  4 if (symbol == Constants.COIL)
2975    {
2976  1 return Color.gray;
2977    }
2978  3 if (symbol == Constants.SHEET)
2979    {
2980  1 return Color.green;
2981    }
2982  2 if (symbol == Constants.HELIX)
2983    {
2984  1 return Color.red;
2985    }
2986   
2987  1 return Color.white;
2988    }
2989   
 
2990  245770 toggle public static char findSSAnnotationForGivenSeqposition(
2991    AlignmentAnnotation aa, int seqPosition)
2992    {
2993  245776 char ss = '*';
2994   
2995  245783 if (aa != null)
2996    {
2997  245788 if (aa.getAnnotationForPosition(seqPosition) != null)
2998    {
2999  112419 Annotation a = aa.getAnnotationForPosition(seqPosition);
3000  112412 ss = a.secondaryStructure;
3001   
3002    // There is no representation for coil and it can be either ' ' or null.
3003  112416 if (ss == ' ' || ss == '-')
3004    {
3005  17163 ss = Constants.COIL;
3006    }
3007    }
3008    else
3009    {
3010  133374 ss = Constants.COIL;
3011    }
3012    }
3013   
3014  245738 return ss;
3015    }
3016   
 
3017  5139 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
3018    AlignmentAnnotation[] annotations)
3019    {
3020   
3021  5139 List<String> ssSources = new ArrayList<>();
3022  5139 Set<String> addedSources = new HashSet<>(); // to keep track of added
3023    // sources
3024   
3025  5139 if (annotations == null)
3026    {
3027  12 return ssSources;
3028    }
3029   
3030  5127 for (AlignmentAnnotation aa : annotations)
3031    {
3032   
3033  27000 String ssSource = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(aa);
3034   
3035  27001 if (ssSource != null && !addedSources.contains(ssSource))
3036    {
3037  521 ssSources.add(ssSource);
3038  521 addedSources.add(ssSource);
3039    }
3040   
3041    }
3042  5126 Collections.sort(ssSources);
3043   
3044  5127 return ssSources;
3045   
3046    }
3047   
3048   
3049    /**
3050    * Returns secondary structure annotations from the given sequence for a specified source.
3051    * @param seq the sequence
3052    * @param ssSource the source label ("PDB", "JPred")
3053    * @return list of matching annotations
3054    */
 
3055  13613404 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3056    SequenceI seq, String ssSource)
3057    {
3058   
3059  13641179 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3060   
3061    // Iterate through all known secondary structure labels
3062  13667975 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3063    {
3064   
3065  26576488 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3066  26210534 if (aa != null)
3067    {
3068    // If source is "ALL", include all annotations for this label
3069  394921 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3070    {
3071  161698 ssAnnots.addAll(Arrays.asList(aa));
3072  161700 continue;
3073    }
3074   
3075  233223 for (AlignmentAnnotation annot : aa)
3076    {
3077  280482 String ssSourceForAnnot = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(
3078    annot);
3079    // Add annotations from the matching source
3080  280458 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot)
3081    && annot.isForDisplay())
3082    {
3083  157290 ssAnnots.add(annot);
3084    }
3085    }
3086    }
3087    }
3088  13574478 if (ssAnnots.size() > 0)
3089    {
3090  278955 return ssAnnots;
3091    }
3092   
3093  13301949 return null;
3094   
3095    }
3096   
 
3097  48 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3098    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3099    {
3100   
3101  48 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3102  48 if (alignAnnotList == null || alignAnnotList.length == 0)
3103    {
3104  0 return ssAlignmentAnnotationForSequences;
3105    }
3106   
3107  48 for (AlignmentAnnotation aa : alignAnnotList)
3108    {
3109  251 if (aa.sequenceRef == null)
3110    {
3111  32 continue;
3112    }
3113   
3114  219 if (isSecondaryStructureFrom(selectedSSSource, aa))
3115    {
3116  78 ssAlignmentAnnotationForSequences
3117    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3118    k -> new ArrayList<>())
3119    .add(aa);
3120    }
3121    }
3122   
3123  48 return ssAlignmentAnnotationForSequences;
3124   
3125    }
3126   
3127    /**
3128    * param alignAnnotation
3129    * @param selectedSSSource
3130    * @return any alignment annotation that matches the given source
3131    */
 
3132  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnotionFor(List<AlignmentAnnotation> alignAnnotation, String selectedSSSource)
3133    {
3134  0 ArrayList<AlignmentAnnotation> annForSource = new ArrayList();
3135  0 for (AlignmentAnnotation alan:alignAnnotation)
3136    {
3137  0 if (isSecondaryStructureFrom(selectedSSSource, alan)) {
3138  0 annForSource.add(alan);
3139    }
3140    }
3141  0 return annForSource;
3142    }
3143   
3144    /**
3145    *
3146    * @param selectedSSSource
3147    * @param aa
3148    * @return true if aa is from a provider or all providers as specified by
3149    * selectedSSSource
3150    */
 
3151  219 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3152    AlignmentAnnotation aa)
3153    {
3154   
3155  219 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3156    {
3157   
3158  376 if (label.equals(aa.label))
3159    {
3160   
3161  170 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3162    {
3163  38 return true;
3164    }
3165  132 String ssSource = AlignmentAnnotationUtils
3166    .extractSSSourceFromAnnotationDescription(aa);
3167  132 if (ssSource != null && ssSource.equals(selectedSSSource))
3168    {
3169  40 return true;
3170    }
3171    }
3172    }
3173  141 return false;
3174    }
3175   
3176    // Method to get the key for a given provider value
 
3177  0 toggle public static String getSecondaryStructureProviderKey(
3178    String providerValue)
3179    {
3180  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS
3181    .entrySet())
3182    {
3183  0 if (entry.getValue().equals(providerValue))
3184    {
3185  0 return entry.getKey(); // Return the key (abbreviation) for the matching
3186    // provider value
3187    }
3188    }
3189  0 return null; // Return null if no match is found
3190    }
3191   
 
3192  0 toggle public static String reduceLabelLength(String label)
3193    {
3194    // Split the input by " | "
3195  0 StringBuilder reducedLabel = new StringBuilder();
3196  0 for (String part:label.split(" \\| ")) {
3197  0 String nextLabel=part;
3198  0 for (Entry<String,String> provider:Constants.STRUCTURE_PROVIDERS.entrySet()) {
3199  0 if (part.equals(provider.getValue())) {
3200  0 nextLabel = provider.getKey();
3201    }
3202    }
3203  0 if (reducedLabel.length()>0)
3204    {
3205  0 reducedLabel.append(" | ");
3206    }
3207  0 reducedLabel.append(nextLabel);
3208    }
3209  0 return reducedLabel.toString();
3210    }
3211   
 
3212  532 toggle public static HashMap<String, Color> assignColorsForSecondaryStructureProviders(
3213    List<String> labels)
3214    {
3215    // Loop through each unique label and add it to the map with a colour.
3216    // for (String label : uniqueLabels)
3217   
3218  532 HashMap<String, Color> secondaryStructureProviderColorMap = new HashMap<String, Color>();
3219  532 for (String label : labels)
3220    {
3221    // Generate or retrieve a colour for the label.
3222  491 String name = label.toUpperCase(Locale.ROOT).trim();
3223  491 secondaryStructureProviderColorMap.put(name,
3224    ColorUtils.getColourFromNameAndScheme(name, "NONE"));
3225    }
3226   
3227  532 return secondaryStructureProviderColorMap;
3228    }
3229   
 
3230  6 toggle public static int computeMaxShifts(SequenceI[] seqs, ShiftList inserts)
3231    {
3232   
3233  6 int oldwidth = 0;
3234  6 int p = 0, lastP = 0, lastIns = 0;
3235  6 for (SequenceI seq : seqs)
3236    {
3237  140 char[] sqs = seq.getSequence();
3238    // System.out.println(seq.getSequenceAsString());
3239  140 if (oldwidth < sqs.length)
3240    {
3241  13 oldwidth = sqs.length;
3242    }
3243  140 p = 0;
3244  140 lastP = 0;
3245  140 lastIns = 0;
3246  140 int modelOffset = 0; // shift between index and insertions encountered
3247  140 do
3248    {
3249  53877 if (sqs[p] >= 'a' && sqs[p] <= 'z')
3250    {
3251  2015 if (lastIns == 0)
3252    {
3253  589 lastP = (modelOffset + p) * 2;
3254    }
3255  2015 lastIns++;
3256    }
3257    else
3258    {
3259  51862 if (lastIns > 0)
3260    {
3261  586 inserts.extendShift(lastP, lastIns);
3262  586 modelOffset -= lastIns;
3263    }
3264  51862 lastIns = 0;
3265    }
3266  53877 } while (++p < sqs.length);
3267  140 if (lastIns > 0)
3268    {
3269  3 inserts.extendShift(lastP, lastIns);
3270    }
3271    }
3272  6 return oldwidth;
3273    }
3274   
3275    /**
3276    * quick and dirty lowercase - to - insert expansion
3277    *
3278    * @param seqs
3279    * - these will be direcly modified
3280    * @return message indicating how many gaps were inserted.
3281    */
 
3282  4 toggle public static String a3mToMSA(SequenceI[] seqs)
3283    {
3284  4 String response = "";
3285  4 ShiftList inserts = new ShiftList();
3286  4 int oldwidth = computeMaxShifts(seqs, inserts);
3287   
3288  4 int newwidth = insertShifts(seqs,inserts);
3289  4 if (oldwidth != newwidth)
3290    {
3291  4 response = "Added " + (newwidth - oldwidth) + " inserts.";
3292    }
3293  4 return response;
3294    }
3295   
 
3296  5 toggle public static int insertShifts(SequenceI[] seqs, ShiftList inserts)
3297    {
3298  5 int newwidth=0;
3299  5 for (SequenceI seq : seqs)
3300    {
3301  80 boolean changed=false;
3302  80 int posShift = 0, totalShift = 0;
3303  80 for (int[] shift : inserts.getShifts())
3304    {
3305    // locate this shift in the totalShift'ed sequence
3306  5912 posShift = (shift[0] / 2);
3307  5912 int len = shift[1];
3308   
3309    // account for any insertions in this sequence
3310  5912 int p = posShift + totalShift;
3311  7021 while (len > 0 && p < seq.getLength()
3312    && (seq.getCharAt(p) >= 'a' && seq.getCharAt(p) <= 'z'))
3313    {
3314  1109 p++;
3315  1109 len--;
3316    }
3317    // and mark any remaining after the insert
3318  5912 if (len > 0)
3319    {
3320  5670 changed=true;
3321  5670 seq.doInsert(p, len, '-');
3322    }
3323  5912 totalShift += shift[1]; // update shifted positions in this sequence
3324    }
3325  80 if (changed)
3326    {
3327  78 seq.sequenceChanged();
3328    }
3329  80 newwidth = (seq.getLength() > newwidth) ? seq.getLength() : newwidth;
3330    }
3331  5 return newwidth;
3332    }
3333    }