Clover icon

Coverage Report

  1. Project Clover database Wed Nov 5 2025 13:15:40 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

456
918
60
2
3,229
1,973
353
0.38
15.3
30
5.88

Classes

Class Line # Actions
AlignmentUtils 81 912 347
0.844366284.4%
AlignmentUtils.DnaVariant 97 6 6
0.00%
 

Contributing tests

This file is covered by 323 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.DBRefSource;
55    import jalview.datamodel.GeneLociI;
56    import jalview.datamodel.IncompleteCodonException;
57    import jalview.datamodel.Mapping;
58    import jalview.datamodel.PDBEntry;
59    import jalview.datamodel.Sequence;
60    import jalview.datamodel.SequenceFeature;
61    import jalview.datamodel.SequenceGroup;
62    import jalview.datamodel.SequenceI;
63    import jalview.datamodel.features.SequenceFeatures;
64    import jalview.io.gff.SequenceOntologyI;
65    import jalview.schemes.ResidueProperties;
66    import jalview.util.ColorUtils;
67    import jalview.util.Comparison;
68    import jalview.util.Constants;
69    import jalview.util.DBRefUtils;
70    import jalview.util.IntRangeComparator;
71    import jalview.util.MapList;
72    import jalview.util.MappingUtils;
73   
74    /**
75    * grab bag of useful alignment manipulation operations Expect these to be
76    * refactored elsewhere at some point.
77    *
78    * @author jimp
79    *
80    */
 
81    public class AlignmentUtils
82    {
83    private static final int CODON_LENGTH = 3;
84   
85    private static final String SEQUENCE_VARIANT = "sequence_variant:";
86   
87    /*
88    * the 'id' attribute is provided for variant features fetched from
89    * Ensembl using its REST service with JSON format
90    */
91    public static final String VARIANT_ID = "id";
92   
93    /**
94    * A data model to hold the 'normal' base value at a position, and an optional
95    * sequence variant feature
96    */
 
97    static final class DnaVariant
98    {
99    final String base;
100   
101    SequenceFeature variant;
102   
 
103  0 toggle DnaVariant(String nuc)
104    {
105  0 base = nuc;
106  0 variant = null;
107    }
108   
 
109  0 toggle DnaVariant(String nuc, SequenceFeature var)
110    {
111  0 base = nuc;
112  0 variant = var;
113    }
114   
 
115  0 toggle public String getSource()
116    {
117  0 return variant == null ? null : variant.getFeatureGroup();
118    }
119   
120    /**
121    * toString for aid in the debugger only
122    */
 
123  0 toggle @Override
124    public String toString()
125    {
126  0 return base + ":" + (variant == null ? "" : variant.getDescription());
127    }
128    }
129   
130    /**
131    * given an existing alignment, create a new alignment including all, or up to
132    * flankSize additional symbols from each sequence's dataset sequence
133    *
134    * @param core
135    * @param flankSize
136    * @return AlignmentI
137    */
 
138  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
139    {
140  27 List<SequenceI> sq = new ArrayList<>();
141  27 int maxoffset = 0;
142  27 for (SequenceI s : core.getSequences())
143    {
144  131 SequenceI newSeq = s.deriveSequence();
145  131 final int newSeqStart = newSeq.getStart() - 1;
146  131 if (newSeqStart > maxoffset
147    && newSeq.getDatasetSequence().getStart() < s.getStart())
148    {
149  131 maxoffset = newSeqStart;
150    }
151  131 sq.add(newSeq);
152    }
153  27 if (flankSize > -1)
154    {
155  25 maxoffset = Math.min(maxoffset, flankSize);
156    }
157   
158    /*
159    * now add offset left and right to create an expanded alignment
160    */
161  27 for (SequenceI s : sq)
162    {
163  131 SequenceI ds = s;
164  262 while (ds.getDatasetSequence() != null)
165    {
166  131 ds = ds.getDatasetSequence();
167    }
168  131 int s_end = s.findPosition(s.getStart() + s.getLength());
169    // find available flanking residues for sequence
170  131 int ustream_ds = s.getStart() - ds.getStart();
171  131 int dstream_ds = ds.getEnd() - s_end;
172   
173    // build new flanked sequence
174   
175    // compute gap padding to start of flanking sequence
176  131 int offset = maxoffset - ustream_ds;
177   
178    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
179  131 if (flankSize >= 0)
180    {
181  125 if (flankSize < ustream_ds)
182    {
183    // take up to flankSize residues
184  40 offset = maxoffset - flankSize;
185  40 ustream_ds = flankSize;
186    }
187  125 if (flankSize <= dstream_ds)
188    {
189  116 dstream_ds = flankSize - 1;
190    }
191    }
192    // TODO use Character.toLowerCase to avoid creating String objects?
193  131 char[] upstream = new String(ds
194    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
195    .toLowerCase(Locale.ROOT).toCharArray();
196  131 char[] downstream = new String(
197    ds.getSequence(s_end - 1, s_end + dstream_ds))
198    .toLowerCase(Locale.ROOT).toCharArray();
199  131 char[] coreseq = s.getSequence();
200  131 char[] nseq = new char[offset + upstream.length + downstream.length
201    + coreseq.length];
202  131 char c = core.getGapCharacter();
203   
204  131 int p = 0;
205  461 for (; p < offset; p++)
206    {
207  330 nseq[p] = c;
208    }
209   
210  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
211  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
212    coreseq.length);
213  131 System.arraycopy(downstream, 0, nseq,
214    p + coreseq.length + upstream.length, downstream.length);
215  131 s.setSequence(new String(nseq));
216  131 s.setStart(s.getStart() - ustream_ds);
217  131 s.setEnd(s_end + downstream.length);
218    }
219  27 AlignmentI newAl = new jalview.datamodel.Alignment(
220    sq.toArray(new SequenceI[0]));
221  27 for (SequenceI s : sq)
222    {
223  131 if (s.getAnnotation() != null)
224    {
225  1 for (AlignmentAnnotation aa : s.getAnnotation())
226    {
227  1 aa.adjustForAlignment(); // JAL-1712 fix
228  1 newAl.addAnnotation(aa);
229    }
230    }
231    }
232  27 newAl.setDataset(core.getDataset());
233  27 return newAl;
234    }
235   
236    /**
237    * Returns the index (zero-based position) of a sequence in an alignment, or
238    * -1 if not found.
239    *
240    * @param al
241    * @param seq
242    * @return
243    */
 
244  64581 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
245    {
246  64581 int result = -1;
247  64580 int pos = 0;
248  64580 for (SequenceI alSeq : al.getSequences())
249    {
250  126839984 if (alSeq == seq)
251    {
252  64520 result = pos;
253  64520 break;
254    }
255  126775464 pos++;
256    }
257  64581 return result;
258    }
259   
260    /**
261    * Returns a map of lists of sequences in the alignment, keyed by sequence
262    * name. For use in mapping between different alignment views of the same
263    * sequences.
264    *
265    * @see jalview.datamodel.AlignmentI#getSequencesByName()
266    */
 
267  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
268    AlignmentI al)
269    {
270  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
271  1 for (SequenceI seq : al.getSequences())
272    {
273  3 String name = seq.getName();
274  3 if (name != null)
275    {
276  3 List<SequenceI> seqs = theMap.get(name);
277  3 if (seqs == null)
278    {
279  2 seqs = new ArrayList<>();
280  2 theMap.put(name, seqs);
281    }
282  3 seqs.add(seq);
283    }
284    }
285  1 return theMap;
286    }
287   
288    /**
289    * Build mapping of protein to cDNA alignment. Mappings are made between
290    * sequences where the cDNA translates to the protein sequence. Any new
291    * mappings are added to the protein alignment. Returns true if any mappings
292    * either already exist or were added, else false.
293    *
294    * @param proteinAlignment
295    * @param cdnaAlignment
296    * @return
297    */
 
298  4 toggle public static boolean mapProteinAlignmentToCdna(
299    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
300    {
301  4 if (proteinAlignment == null || cdnaAlignment == null)
302    {
303  0 return false;
304    }
305   
306  4 Set<SequenceI> mappedDna = new HashSet<>();
307  4 Set<SequenceI> mappedProtein = new HashSet<>();
308   
309    /*
310    * First pass - map sequences where cross-references exist. This include
311    * 1-to-many mappings to support, for example, variant cDNA.
312    */
313  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
314    cdnaAlignment, mappedDna, mappedProtein, true);
315   
316    /*
317    * Second pass - map sequences where no cross-references exist. This only
318    * does 1-to-1 mappings and assumes corresponding sequences are in the same
319    * order in the alignments.
320    */
321  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
322    mappedDna, mappedProtein, false);
323  4 return mappingPerformed;
324    }
325   
326    /**
327    * Make mappings between compatible sequences (where the cDNA translation
328    * matches the protein).
329    *
330    * @param proteinAlignment
331    * @param cdnaAlignment
332    * @param mappedDna
333    * a set of mapped DNA sequences (to add to)
334    * @param mappedProtein
335    * a set of mapped Protein sequences (to add to)
336    * @param xrefsOnly
337    * if true, only map sequences where xrefs exist
338    * @return
339    */
 
340  8 toggle protected static boolean mapProteinToCdna(
341    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
342    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
343    boolean xrefsOnly)
344    {
345  8 boolean mappingExistsOrAdded = false;
346  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
347  8 for (SequenceI aaSeq : thisSeqs)
348    {
349  22 boolean proteinMapped = false;
350  22 AlignedCodonFrame acf = new AlignedCodonFrame();
351   
352  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
353    {
354    /*
355    * Always try to map if sequences have xref to each other; this supports
356    * variant cDNA or alternative splicing for a protein sequence.
357    *
358    * If no xrefs, try to map progressively, assuming that alignments have
359    * mappable sequences in corresponding order. These are not
360    * many-to-many, as that would risk mixing species with similar cDNA
361    * sequences.
362    */
363  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
364    {
365  39 continue;
366    }
367   
368    /*
369    * Don't map non-xrefd sequences more than once each. This heuristic
370    * allows us to pair up similar sequences in ordered alignments.
371    */
372  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
373    || mappedDna.contains(cdnaSeq)))
374    {
375  29 continue;
376    }
377  18 if (mappingExists(proteinAlignment.getCodonFrames(),
378    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
379    {
380  0 mappingExistsOrAdded = true;
381    }
382    else
383    {
384  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
385  18 if (map != null)
386    {
387  12 acf.addMap(cdnaSeq, aaSeq, map);
388  12 mappingExistsOrAdded = true;
389  12 proteinMapped = true;
390  12 mappedDna.add(cdnaSeq);
391  12 mappedProtein.add(aaSeq);
392    }
393    }
394    }
395  22 if (proteinMapped)
396    {
397  11 proteinAlignment.addCodonFrame(acf);
398    }
399    }
400  8 return mappingExistsOrAdded;
401    }
402   
403    /**
404    * Answers true if the mappings include one between the given (dataset)
405    * sequences.
406    */
 
407  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
408    SequenceI aaSeq, SequenceI cdnaSeq)
409    {
410  18 if (mappings != null)
411    {
412  18 for (AlignedCodonFrame acf : mappings)
413    {
414  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
415    {
416  0 return true;
417    }
418    }
419    }
420  18 return false;
421    }
422   
423    /**
424    * Builds a mapping (if possible) of a cDNA to a protein sequence.
425    * <ul>
426    * <li>first checks if the cdna translates exactly to the protein
427    * sequence</li>
428    * <li>else checks for translation after removing a STOP codon</li>
429    * <li>else checks for translation after removing a START codon</li>
430    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
431    * </ul>
432    * Returns null if no mapping is determined.
433    *
434    * @param proteinSeq
435    * the aligned protein sequence
436    * @param cdnaSeq
437    * the aligned cdna sequence
438    * @return
439    */
 
440  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
441    SequenceI cdnaSeq)
442    {
443    /*
444    * Here we handle either dataset sequence set (desktop) or absent (applet).
445    * Use only the char[] form of the sequence to avoid creating possibly large
446    * String objects.
447    */
448  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
449  31 char[] aaSeqChars = proteinDataset != null
450    ? proteinDataset.getSequence()
451    : proteinSeq.getSequence();
452  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
453  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
454    : cdnaSeq.getSequence();
455  31 if (aaSeqChars == null || cdnaSeqChars == null)
456    {
457  0 return null;
458    }
459   
460    /*
461    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
462    */
463  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
464  31 int cdnaLength = cdnaSeqChars.length;
465  31 int cdnaStart = cdnaSeq.getStart();
466  31 int cdnaEnd = cdnaSeq.getEnd();
467  31 final int proteinStart = proteinSeq.getStart();
468  31 final int proteinEnd = proteinSeq.getEnd();
469   
470    /*
471    * If lengths don't match, try ignoring stop codon (if present)
472    */
473  31 if (cdnaLength != mappedLength && cdnaLength > 2)
474    {
475  16 String lastCodon = String.valueOf(cdnaSeqChars,
476    cdnaLength - CODON_LENGTH, CODON_LENGTH)
477    .toUpperCase(Locale.ROOT);
478  16 for (String stop : ResidueProperties.STOP_CODONS)
479    {
480  47 if (lastCodon.equals(stop))
481    {
482  2 cdnaEnd -= CODON_LENGTH;
483  2 cdnaLength -= CODON_LENGTH;
484  2 break;
485    }
486    }
487    }
488   
489    /*
490    * If lengths still don't match, try ignoring start codon.
491    */
492  31 int startOffset = 0;
493  31 if (cdnaLength != mappedLength && cdnaLength > 2
494    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
495    .toUpperCase(Locale.ROOT)
496    .equals(ResidueProperties.START))
497    {
498  5 startOffset += CODON_LENGTH;
499  5 cdnaStart += CODON_LENGTH;
500  5 cdnaLength -= CODON_LENGTH;
501    }
502   
503  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
504    {
505    /*
506    * protein is translation of dna (+/- start/stop codons)
507    */
508  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
509    new int[]
510    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
511  15 return map;
512    }
513   
514    /*
515    * translation failed - try mapping CDS annotated regions of dna
516    */
517  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
518    }
519   
520    /**
521    * Test whether the given cdna sequence, starting at the given offset,
522    * translates to the given amino acid sequence, using the standard translation
523    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
524    *
525    * @param cdnaSeqChars
526    * @param cdnaStart
527    * @param aaSeqChars
528    * @return
529    */
 
530  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
531    char[] aaSeqChars)
532    {
533  51 if (cdnaSeqChars == null || aaSeqChars == null)
534    {
535  3 return false;
536    }
537   
538  48 int aaPos = 0;
539  48 int dnaPos = cdnaStart;
540  161 for (; dnaPos < cdnaSeqChars.length - 2
541    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
542    {
543  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
544  130 final String translated = ResidueProperties.codonTranslate(codon);
545   
546    /*
547    * allow * in protein to match untranslatable in dna
548    */
549  130 final char aaRes = aaSeqChars[aaPos];
550  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
551    && aaRes == '*')
552    {
553  4 continue;
554    }
555  126 if (translated == null || !(aaRes == translated.charAt(0)))
556    {
557    // debug
558    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
559    // + ": "
560    // + codon + "(" + translated + ") != " + aaRes));
561  17 return false;
562    }
563    }
564   
565    /*
566    * check we matched all of the protein sequence
567    */
568  31 if (aaPos != aaSeqChars.length)
569    {
570  2 return false;
571    }
572   
573    /*
574    * check we matched all of the dna except
575    * for optional trailing STOP codon
576    */
577  29 if (dnaPos == cdnaSeqChars.length)
578    {
579  17 return true;
580    }
581  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
582    {
583  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
584  11 if (ResidueProperties.STOP
585    .equals(ResidueProperties.codonTranslate(codon)))
586    {
587  9 return true;
588    }
589    }
590  3 return false;
591    }
592   
593    /**
594    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
595    * currently assumes that we are aligning cDNA to match protein.
596    *
597    * @param seq
598    * the sequence to be realigned
599    * @param al
600    * the alignment whose sequence alignment is to be 'copied'
601    * @param gap
602    * character string represent a gap in the realigned sequence
603    * @param preserveUnmappedGaps
604    * @param preserveMappedGaps
605    * @return true if the sequence was realigned, false if it could not be
606    */
 
607  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
608    String gap, boolean preserveMappedGaps,
609    boolean preserveUnmappedGaps)
610    {
611    /*
612    * Get any mappings from the source alignment to the target (dataset)
613    * sequence.
614    */
615    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
616    // all mappings. Would it help to constrain this?
617  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
618  8 if (mappings == null || mappings.isEmpty())
619    {
620  0 return false;
621    }
622   
623    /*
624    * Locate the aligned source sequence whose dataset sequence is mapped. We
625    * just take the first match here (as we can't align like more than one
626    * sequence).
627    */
628  8 SequenceI alignFrom = null;
629  8 AlignedCodonFrame mapping = null;
630  8 for (AlignedCodonFrame mp : mappings)
631    {
632  8 alignFrom = mp.findAlignedSequence(seq, al);
633  8 if (alignFrom != null)
634    {
635  4 mapping = mp;
636  4 break;
637    }
638    }
639   
640  8 if (alignFrom == null)
641    {
642  4 return false;
643    }
644  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
645    preserveMappedGaps, preserveUnmappedGaps);
646  4 return true;
647    }
648   
649    /**
650    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
651    * match residues and codons. Flags control whether existing gaps in unmapped
652    * (intron) and mapped (exon) regions are preserved or not. Gaps between
653    * intron and exon are only retained if both flags are set.
654    *
655    * @param alignTo
656    * @param alignFrom
657    * @param mapping
658    * @param myGap
659    * @param sourceGap
660    * @param preserveUnmappedGaps
661    * @param preserveMappedGaps
662    */
 
663  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
664    AlignedCodonFrame mapping, String myGap, char sourceGap,
665    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
666    {
667    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
668   
669    // aligned and dataset sequence positions, all base zero
670  19 int thisSeqPos = 0;
671  19 int sourceDsPos = 0;
672   
673  19 int basesWritten = 0;
674  19 char myGapChar = myGap.charAt(0);
675  19 int ratio = myGap.length();
676   
677  19 int fromOffset = alignFrom.getStart() - 1;
678  19 int toOffset = alignTo.getStart() - 1;
679  19 int sourceGapMappedLength = 0;
680  19 boolean inExon = false;
681  19 final int toLength = alignTo.getLength();
682  19 final int fromLength = alignFrom.getLength();
683  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
684   
685    /*
686    * Traverse the 'model' aligned sequence
687    */
688  205 for (int i = 0; i < fromLength; i++)
689    {
690  186 char sourceChar = alignFrom.getCharAt(i);
691  186 if (sourceChar == sourceGap)
692    {
693  44 sourceGapMappedLength += ratio;
694  44 continue;
695    }
696   
697    /*
698    * Found a non-gap character. Locate its mapped region if any.
699    */
700  142 sourceDsPos++;
701    // Note mapping positions are base 1, our sequence positions base 0
702  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
703    sourceDsPos + fromOffset);
704  142 if (mappedPos == null)
705    {
706    /*
707    * unmapped position; treat like a gap
708    */
709  94 sourceGapMappedLength += ratio;
710    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
711    // residue "
712    // + sourceDsPos + "(" + sourceChar + ")");
713    // return;
714  94 continue;
715    }
716   
717  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
718  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
719  48 StringBuilder trailingCopiedGap = new StringBuilder();
720   
721    /*
722    * Copy dna sequence up to and including this codon. Optionally, include
723    * gaps before the codon starts (in introns) and/or after the codon starts
724    * (in exons).
725    *
726    * Note this only works for 'linear' splicing, not reverse or interleaved.
727    * But then 'align dna as protein' doesn't make much sense otherwise.
728    */
729  48 int intronLength = 0;
730  294 while (basesWritten + toOffset < mappedCodonEnd
731    && thisSeqPos < toLength)
732    {
733  246 final char c = alignTo.getCharAt(thisSeqPos++);
734  246 if (c != myGapChar)
735    {
736  146 basesWritten++;
737  146 int sourcePosition = basesWritten + toOffset;
738  146 if (sourcePosition < mappedCodonStart)
739    {
740    /*
741    * Found an unmapped (intron) base. First add in any preceding gaps
742    * (if wanted).
743    */
744  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
745    {
746  17 thisAligned.append(trailingCopiedGap.toString());
747  17 intronLength += trailingCopiedGap.length();
748  17 trailingCopiedGap = new StringBuilder();
749    }
750  48 intronLength++;
751  48 inExon = false;
752    }
753    else
754    {
755  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
756  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
757    preserveUnmappedGaps, sourceGapMappedLength, inExon,
758    trailingCopiedGap.length(), intronLength, startOfCodon);
759  215 for (int k = 0; k < gapsToAdd; k++)
760    {
761  117 thisAligned.append(myGapChar);
762    }
763  98 sourceGapMappedLength = 0;
764  98 inExon = true;
765    }
766  146 thisAligned.append(c);
767  146 trailingCopiedGap = new StringBuilder();
768    }
769    else
770    {
771  100 if (inExon && preserveMappedGaps)
772    {
773  32 trailingCopiedGap.append(myGapChar);
774    }
775  68 else if (!inExon && preserveUnmappedGaps)
776    {
777  27 trailingCopiedGap.append(myGapChar);
778    }
779    }
780    }
781    }
782   
783    /*
784    * At end of model aligned sequence. Copy any remaining target sequence, optionally
785    * including (intron) gaps.
786    */
787  129 while (thisSeqPos < toLength)
788    {
789  110 final char c = alignTo.getCharAt(thisSeqPos++);
790  110 if (c != myGapChar || preserveUnmappedGaps)
791    {
792  102 thisAligned.append(c);
793    }
794  110 sourceGapMappedLength--;
795    }
796   
797    /*
798    * finally add gaps to pad for any trailing source gaps or
799    * unmapped characters
800    */
801  19 if (preserveUnmappedGaps)
802    {
803  24 while (sourceGapMappedLength > 0)
804    {
805  12 thisAligned.append(myGapChar);
806  12 sourceGapMappedLength--;
807    }
808    }
809   
810    /*
811    * All done aligning, set the aligned sequence.
812    */
813  19 alignTo.setSequence(new String(thisAligned));
814    }
815   
816    /**
817    * Helper method to work out how many gaps to insert when realigning.
818    *
819    * @param preserveMappedGaps
820    * @param preserveUnmappedGaps
821    * @param sourceGapMappedLength
822    * @param inExon
823    * @param trailingCopiedGap
824    * @param intronLength
825    * @param startOfCodon
826    * @return
827    */
 
828  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
829    boolean preserveUnmappedGaps, int sourceGapMappedLength,
830    boolean inExon, int trailingGapLength, int intronLength,
831    final boolean startOfCodon)
832    {
833  98 int gapsToAdd = 0;
834  98 if (startOfCodon)
835    {
836    /*
837    * Reached start of codon. Ignore trailing gaps in intron unless we are
838    * preserving gaps in both exon and intron. Ignore them anyway if the
839    * protein alignment introduces a gap at least as large as the intronic
840    * region.
841    */
842  40 if (inExon && !preserveMappedGaps)
843    {
844  4 trailingGapLength = 0;
845    }
846  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
847    {
848  19 trailingGapLength = 0;
849    }
850  40 if (inExon)
851    {
852  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
853    }
854    else
855    {
856  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
857    {
858  20 gapsToAdd = sourceGapMappedLength - intronLength;
859    }
860    else
861    {
862  6 gapsToAdd = Math.min(
863    intronLength + trailingGapLength - sourceGapMappedLength,
864    trailingGapLength);
865    }
866    }
867    }
868    else
869    {
870    /*
871    * second or third base of codon; check for any gaps in dna
872    */
873  58 if (!preserveMappedGaps)
874    {
875  32 trailingGapLength = 0;
876    }
877  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
878    }
879  98 return gapsToAdd;
880    }
881   
882    /**
883    * Realigns the given protein to match the alignment of the dna, using codon
884    * mappings to translate aligned codon positions to protein residues.
885    *
886    * @param protein
887    * the alignment whose sequences are realigned by this method
888    * @param dna
889    * the dna alignment whose alignment we are 'copying'
890    * @return the number of sequences that were realigned
891    */
 
892  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
893    {
894  5 if (protein.isNucleotide() || !dna.isNucleotide())
895    {
896  0 jalview.bin.Console
897    .errPrintln("Wrong alignment type in alignProteinAsDna");
898  0 return 0;
899    }
900  5 List<SequenceI> unmappedProtein = new ArrayList<>();
901  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
902    protein, dna, unmappedProtein);
903  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
904    }
905   
906    /**
907    * Realigns the given dna to match the alignment of the protein, using codon
908    * mappings to translate aligned peptide positions to codons.
909    *
910    * Always produces a padded CDS alignment.
911    *
912    * @param dna
913    * the alignment whose sequences are realigned by this method
914    * @param protein
915    * the protein alignment whose alignment we are 'copying'
916    * @return the number of sequences that were realigned
917    */
 
918  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
919    {
920  4 if (protein.isNucleotide() || !dna.isNucleotide())
921    {
922  0 jalview.bin.Console
923    .errPrintln("Wrong alignment type in alignProteinAsDna");
924  0 return 0;
925    }
926    // todo: implement this
927  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
928  4 int alignedCount = 0;
929  4 int width = 0; // alignment width for padding CDS
930  4 for (SequenceI dnaSeq : dna.getSequences())
931    {
932  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
933    dna.getGapCharacter()))
934    {
935  5 alignedCount++;
936    }
937  5 width = Math.max(dnaSeq.getLength(), width);
938    }
939  4 int oldwidth;
940  4 int diff;
941  4 for (SequenceI dnaSeq : dna.getSequences())
942    {
943  5 oldwidth = dnaSeq.getLength();
944  5 diff = width - oldwidth;
945  5 if (diff > 0)
946    {
947  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
948    }
949    }
950  4 return alignedCount;
951    }
952   
953    /**
954    * Helper method to align (if possible) the dna sequence to match the
955    * alignment of a mapped protein sequence. This is currently limited to
956    * handling coding sequence only.
957    *
958    * @param cdsSeq
959    * @param protein
960    * @param mappings
961    * @param gapChar
962    * @return
963    */
 
964  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
965    AlignmentI protein, List<AlignedCodonFrame> mappings,
966    char gapChar)
967    {
968  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
969  5 if (cdsDss == null)
970    {
971  0 System.err
972    .println("alignCdsSequenceAsProtein needs aligned sequence!");
973  0 return false;
974    }
975   
976  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
977    .findMappingsForSequence(cdsSeq, mappings);
978  5 for (AlignedCodonFrame mapping : dnaMappings)
979    {
980  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
981  5 if (peptide != null)
982    {
983  5 final int peptideLength = peptide.getLength();
984  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
985  5 if (map != null)
986    {
987  5 MapList mapList = map.getMap();
988  5 if (map.getTo() == peptide.getDatasetSequence())
989    {
990  5 mapList = mapList.getInverse();
991    }
992  5 final int cdsLength = cdsDss.getLength();
993  5 int mappedFromLength = MappingUtils
994    .getLength(mapList.getFromRanges());
995  5 int mappedToLength = MappingUtils
996    .getLength(mapList.getToRanges());
997  5 boolean addStopCodon = (cdsLength == mappedFromLength
998    * CODON_LENGTH + CODON_LENGTH)
999    || (peptide.getDatasetSequence()
1000    .getLength() == mappedFromLength - 1);
1001  5 if (cdsLength != mappedToLength && !addStopCodon)
1002    {
1003  0 jalview.bin.Console.errPrintln(String.format(
1004    "Can't align cds as protein (length mismatch %d/%d): %s",
1005    cdsLength, mappedToLength, cdsSeq.getName()));
1006    }
1007   
1008    /*
1009    * pre-fill the aligned cds sequence with gaps
1010    */
1011  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1012  5 + (addStopCodon ? CODON_LENGTH : 0)];
1013  5 Arrays.fill(alignedCds, gapChar);
1014   
1015    /*
1016    * walk over the aligned peptide sequence and insert mapped
1017    * codons for residues in the aligned cds sequence
1018    */
1019  5 int copiedBases = 0;
1020  5 int cdsStart = cdsDss.getStart();
1021  5 int proteinPos = peptide.getStart() - 1;
1022  5 int cdsCol = 0;
1023   
1024  33 for (int col = 0; col < peptideLength; col++)
1025    {
1026  28 char residue = peptide.getCharAt(col);
1027   
1028  28 if (Comparison.isGap(residue))
1029    {
1030  13 cdsCol += CODON_LENGTH;
1031    }
1032    else
1033    {
1034  15 proteinPos++;
1035  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1036  15 if (codon == null)
1037    {
1038    // e.g. incomplete start codon, X in peptide
1039  0 cdsCol += CODON_LENGTH;
1040    }
1041    else
1042    {
1043  60 for (int j = codon[0]; j <= codon[1]; j++)
1044    {
1045  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1046  45 alignedCds[cdsCol++] = mappedBase;
1047  45 copiedBases++;
1048    }
1049    }
1050    }
1051    }
1052   
1053    /*
1054    * append stop codon if not mapped from protein,
1055    * closing it up to the end of the mapped sequence
1056    */
1057  5 if (copiedBases == cdsLength - CODON_LENGTH)
1058    {
1059  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1060    {
1061  0 if (!Comparison.isGap(alignedCds[i]))
1062    {
1063  0 cdsCol = i + 1; // gap just after end of sequence
1064  0 break;
1065    }
1066    }
1067  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1068    {
1069  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1070    }
1071    }
1072  5 cdsSeq.setSequence(new String(alignedCds));
1073  5 return true;
1074    }
1075    }
1076    }
1077  0 return false;
1078    }
1079   
1080    /**
1081    * Builds a map whose key is an aligned codon position (3 alignment column
1082    * numbers base 0), and whose value is a map from protein sequence to each
1083    * protein's peptide residue for that codon. The map generates an ordering of
1084    * the codons, and allows us to read off the peptides at each position in
1085    * order to assemble 'aligned' protein sequences.
1086    *
1087    * @param protein
1088    * the protein alignment
1089    * @param dna
1090    * the coding dna alignment
1091    * @param unmappedProtein
1092    * any unmapped proteins are added to this list
1093    * @return
1094    */
 
1095  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1096    AlignmentI protein, AlignmentI dna,
1097    List<SequenceI> unmappedProtein)
1098    {
1099    /*
1100    * maintain a list of any proteins with no mappings - these will be
1101    * rendered 'as is' in the protein alignment as we can't align them
1102    */
1103  5 unmappedProtein.addAll(protein.getSequences());
1104   
1105  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1106   
1107    /*
1108    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1109    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1110    * comparator keeps the codon positions ordered.
1111    */
1112  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1113    new CodonComparator());
1114   
1115  5 for (SequenceI dnaSeq : dna.getSequences())
1116    {
1117  30 for (AlignedCodonFrame mapping : mappings)
1118    {
1119  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1120  516 if (prot != null)
1121    {
1122  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1123  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1124    alignedCodons);
1125  32 unmappedProtein.remove(prot);
1126    }
1127    }
1128    }
1129   
1130    /*
1131    * Finally add any unmapped peptide start residues (e.g. for incomplete
1132    * codons) as if at the codon position before the second residue
1133    */
1134    // TODO resolve JAL-2022 so this fudge can be removed
1135  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1136  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1137   
1138  5 return alignedCodons;
1139    }
1140   
1141    /**
1142    * Scans for any protein mapped from position 2 (meaning unmapped start
1143    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1144    * preceding position in the alignment
1145    *
1146    * @param alignedCodons
1147    * the codon-to-peptide map
1148    * @param mappedSequenceCount
1149    * the number of distinct sequences in the map
1150    */
 
1151  5 toggle protected static void addUnmappedPeptideStarts(
1152    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1153    int mappedSequenceCount)
1154    {
1155    // TODO delete this ugly hack once JAL-2022 is resolved
1156    // i.e. we can model startPhase > 0 (incomplete start codon)
1157   
1158  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1159  5 AlignedCodon lastCodon = null;
1160  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1161   
1162  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1163    .entrySet())
1164    {
1165  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1166    .entrySet())
1167    {
1168  10667 SequenceI seq = sequenceCodon.getKey();
1169  10667 if (sequencesChecked.contains(seq))
1170    {
1171  10637 continue;
1172    }
1173  30 sequencesChecked.add(seq);
1174  30 AlignedCodon codon = sequenceCodon.getValue();
1175  30 if (codon.peptideCol > 1)
1176    {
1177  0 jalview.bin.Console.errPrintln(
1178    "Problem mapping protein with >1 unmapped start positions: "
1179    + seq.getName());
1180    }
1181  30 else if (codon.peptideCol == 1)
1182    {
1183    /*
1184    * first position (peptideCol == 0) was unmapped - add it
1185    */
1186  6 if (lastCodon != null)
1187    {
1188  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1189    lastCodon.pos2, lastCodon.pos3,
1190    String.valueOf(seq.getCharAt(0)), 0);
1191  5 toAdd.put(seq, firstPeptide);
1192    }
1193    else
1194    {
1195    /*
1196    * unmapped residue at start of alignment (no prior column) -
1197    * 'insert' at nominal codon [0, 0, 0]
1198    */
1199  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1200    String.valueOf(seq.getCharAt(0)), 0);
1201  1 toAdd.put(seq, firstPeptide);
1202    }
1203    }
1204  30 if (sequencesChecked.size() == mappedSequenceCount)
1205    {
1206    // no need to check past first mapped position in all sequences
1207  5 break;
1208    }
1209    }
1210  1913 lastCodon = entry.getKey();
1211    }
1212   
1213    /*
1214    * add any new codons safely after iterating over the map
1215    */
1216  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1217    {
1218  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1219    startCodon.getKey());
1220    }
1221    }
1222   
1223    /**
1224    * Update the aligned protein sequences to match the codon alignments given in
1225    * the map.
1226    *
1227    * @param protein
1228    * @param alignedCodons
1229    * an ordered map of codon positions (columns), with sequence/peptide
1230    * values present in each column
1231    * @param unmappedProtein
1232    * @return
1233    */
 
1234  5 toggle protected static int alignProteinAs(AlignmentI protein,
1235    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1236    List<SequenceI> unmappedProtein)
1237    {
1238    /*
1239    * prefill peptide sequences with gaps
1240    */
1241  5 int alignedWidth = alignedCodons.size();
1242  5 char[] gaps = new char[alignedWidth];
1243  5 Arrays.fill(gaps, protein.getGapCharacter());
1244  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1245  5 for (SequenceI seq : protein.getSequences())
1246    {
1247  31 if (!unmappedProtein.contains(seq))
1248    {
1249  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1250    }
1251    }
1252   
1253    /*
1254    * Traverse the codons left to right (as defined by CodonComparator)
1255    * and insert peptides in each column where the sequence is mapped.
1256    * This gives a peptide 'alignment' where residues are aligned if their
1257    * corresponding codons occupy the same columns in the cdna alignment.
1258    */
1259  5 int column = 0;
1260  5 for (AlignedCodon codon : alignedCodons.keySet())
1261    {
1262  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1263    .get(codon);
1264  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1265    {
1266  10682 char residue = entry.getValue().product.charAt(0);
1267  10682 peptides.get(entry.getKey())[column] = residue;
1268    }
1269  1914 column++;
1270    }
1271   
1272    /*
1273    * and finally set the constructed sequences
1274    */
1275  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1276    {
1277  30 entry.getKey().setSequence(new String(entry.getValue()));
1278    }
1279   
1280  5 return 0;
1281    }
1282   
1283    /**
1284    * Populate the map of aligned codons by traversing the given sequence
1285    * mapping, locating the aligned positions of mapped codons, and adding those
1286    * positions and their translation products to the map.
1287    *
1288    * @param dna
1289    * the aligned sequence we are mapping from
1290    * @param protein
1291    * the sequence to be aligned to the codons
1292    * @param gapChar
1293    * the gap character in the dna sequence
1294    * @param seqMap
1295    * a mapping to a sequence translation
1296    * @param alignedCodons
1297    * the map we are building up
1298    */
 
1299  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1300    char gapChar, Mapping seqMap,
1301    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1302    {
1303  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1304   
1305    /*
1306    * add codon positions, and their peptide translations, to the alignment
1307    * map, while remembering the first codon mapped
1308    */
1309  10716 while (codons.hasNext())
1310    {
1311  10684 try
1312    {
1313  10684 AlignedCodon codon = codons.next();
1314  10684 addCodonToMap(alignedCodons, codon, protein);
1315    } catch (IncompleteCodonException e)
1316    {
1317    // possible incomplete trailing codon - ignore
1318    } catch (NoSuchElementException e)
1319    {
1320    // possibly peptide lacking STOP
1321    }
1322    }
1323    }
1324   
1325    /**
1326    * Helper method to add a codon-to-peptide entry to the aligned codons map
1327    *
1328    * @param alignedCodons
1329    * @param codon
1330    * @param protein
1331    */
 
1332  10690 toggle protected static void addCodonToMap(
1333    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1334    AlignedCodon codon, SequenceI protein)
1335    {
1336  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1337  10690 if (seqProduct == null)
1338    {
1339  1914 seqProduct = new HashMap<>();
1340  1914 alignedCodons.put(codon, seqProduct);
1341    }
1342  10690 seqProduct.put(protein, codon);
1343    }
1344   
1345    /**
1346    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1347    * between at least one pair of sequences in the two alignments. Currently,
1348    * the logic is:
1349    * <ul>
1350    * <li>One alignment must be nucleotide, and the other protein</li>
1351    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1352    * <li>Mappable means the nucleotide translation matches the protein
1353    * sequence</li>
1354    * <li>The translation may ignore start and stop codons if present in the
1355    * nucleotide</li>
1356    * </ul>
1357    *
1358    * @param al1
1359    * @param al2
1360    * @return
1361    */
 
1362  28 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1363    {
1364  28 if (al1 == null || al2 == null)
1365    {
1366  3 return false;
1367    }
1368   
1369    /*
1370    * Require one nucleotide and one protein
1371    */
1372  25 if (al1.isNucleotide() == al2.isNucleotide())
1373    {
1374  14 return false;
1375    }
1376  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1377  11 AlignmentI protein = dna == al1 ? al2 : al1;
1378  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1379  11 for (SequenceI dnaSeq : dna.getSequences())
1380    {
1381  12 for (SequenceI proteinSeq : protein.getSequences())
1382    {
1383  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1384    {
1385  2 return true;
1386    }
1387    }
1388    }
1389  9 return false;
1390    }
1391   
1392    /**
1393    * Returns true if the dna sequence is mapped, or could be mapped, to the
1394    * protein sequence.
1395    *
1396    * @param dnaSeq
1397    * @param proteinSeq
1398    * @param mappings
1399    * @return
1400    */
 
1401  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1402    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1403    {
1404  12 if (dnaSeq == null || proteinSeq == null)
1405    {
1406  0 return false;
1407    }
1408   
1409  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1410    : dnaSeq.getDatasetSequence();
1411  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1412    ? proteinSeq
1413    : proteinSeq.getDatasetSequence();
1414   
1415  12 for (AlignedCodonFrame mapping : mappings)
1416    {
1417  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1418    {
1419    /*
1420    * already mapped
1421    */
1422  0 return true;
1423    }
1424    }
1425   
1426    /*
1427    * Just try to make a mapping (it is not yet stored), test whether
1428    * successful.
1429    */
1430  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1431    }
1432   
1433    /**
1434    * Finds any reference annotations associated with the sequences in
1435    * sequenceScope, that are not already added to the alignment, and adds them
1436    * to the 'candidates' map. Also populates a lookup table of annotation
1437    * labels, keyed by calcId, for use in constructing tooltips or the like.
1438    *
1439    * @param sequenceScope
1440    * the sequences to scan for reference annotations
1441    * @param labelForCalcId
1442    * (optional) map to populate with label for calcId
1443    * @param candidates
1444    * map to populate with annotations for sequence
1445    * @param al
1446    * the alignment to check for presence of annotations
1447    */
 
1448  98 toggle public static void findAddableReferenceAnnotations(
1449    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1450    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1451    AlignmentI al)
1452    {
1453  98 if (sequenceScope == null)
1454    {
1455  1 return;
1456    }
1457   
1458    /*
1459    * For each sequence in scope, make a list of any annotations on the
1460    * underlying dataset sequence which are not already on the alignment.
1461    *
1462    * Add to a map of { alignmentSequence, <List of annotations to add> }
1463    */
1464  97 for (SequenceI seq : sequenceScope)
1465    {
1466  94 SequenceI dataset = seq.getDatasetSequence();
1467  94 if (dataset == null)
1468    {
1469  0 continue;
1470    }
1471  94 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1472  94 if (datasetAnnotations == null)
1473    {
1474  35 continue;
1475    }
1476  59 final List<AlignmentAnnotation> result = new ArrayList<>();
1477  59 for (AlignmentAnnotation dsann : datasetAnnotations)
1478    {
1479    /*
1480    * Find matching annotations on the alignment. If none is found, then
1481    * add this annotation to the list of 'addable' annotations for this
1482    * sequence.
1483    */
1484  186 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1485    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1486  186 boolean found = false;
1487  186 if (matchedAlignmentAnnotations != null)
1488    {
1489  183 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1490    {
1491  138 if (dsann.description.equals(matched.description))
1492    {
1493  56 found = true;
1494  56 break;
1495    }
1496    }
1497    }
1498  186 if (!found)
1499    {
1500  130 result.add(dsann);
1501  130 if (labelForCalcId != null)
1502    {
1503  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1504    }
1505    }
1506    }
1507    /*
1508    * Save any addable annotations for this sequence
1509    */
1510  59 if (!result.isEmpty())
1511    {
1512  55 candidates.put(seq, result);
1513    }
1514    }
1515    }
1516   
1517    /**
1518    * Adds annotations to the top of the alignment annotations, in the same order
1519    * as their related sequences. If you already have an annotation and want to
1520    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1521    *
1522    * @param annotations
1523    * the annotations to add
1524    * @param alignment
1525    * the alignment to add them to
1526    * @param selectionGroup
1527    * current selection group - may be null, if provided then any added
1528    * annotation will be trimmed to just those columns in the selection
1529    * group
1530    */
 
1531  52 toggle public static void addReferenceAnnotations(
1532    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1533    final AlignmentI alignment, final SequenceGroup selectionGroup)
1534    {
1535  52 for (SequenceI seq : annotations.keySet())
1536    {
1537  51 for (AlignmentAnnotation ann : annotations.get(seq))
1538    {
1539  124 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1540    }
1541    }
1542    }
1543   
 
1544  4 toggle public static boolean isSSAnnotationPresent(
1545    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1546    {
1547   
1548  4 for (SequenceI seq : annotations.keySet())
1549    {
1550  3 if (isSecondaryStructurePresent(
1551    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1552    {
1553  2 return true;
1554    }
1555    }
1556  2 return false;
1557    }
1558   
1559    /**
1560    * Make a copy of a reference annotation {@code ann} and add it to an
1561    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1562    * the extent of {@code selectionGroup}
1563    *
1564    * @param alignment
1565    * @param seq
1566    * @param ann
1567    * @param selectionGroup
1568    * current selection group - may be null, if provided then any added
1569    * annotation will be trimmed to just those columns in the selection
1570    * group
1571    * @return annotation added to {@code seq and {@code alignment}
1572    */
 
1573  128 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1574    final AlignmentI alignment, final SequenceI seq,
1575    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1576    {
1577  128 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1578  128 int startRes = 0;
1579  128 int endRes = ann.annotations.length;
1580  128 if (selectionGroup != null)
1581    {
1582  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1583    seq.findPosition(selectionGroup.getStartRes())));
1584  2 endRes = -1 + Math.min(seq.getEnd(),
1585    seq.findPosition(selectionGroup.getEndRes()));
1586   
1587    }
1588  128 copyAnn.restrict(startRes, endRes + 0);
1589   
1590    /*
1591    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1592    * original annotation is already on the sequence.
1593    */
1594  128 if (!seq.hasAnnotation(ann))
1595    {
1596  128 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1597  128 if (cm != null)
1598    {
1599  28 seq.addContactListFor(copyAnn, cm);
1600    }
1601  128 seq.addAlignmentAnnotation(copyAnn);
1602    }
1603    // adjust for gaps
1604  128 copyAnn.adjustForAlignment();
1605    // add to the alignment and set visible
1606  128 alignment.addAnnotation(copyAnn);
1607  128 copyAnn.visible = true;
1608   
1609  128 return copyAnn;
1610    }
1611   
1612    /**
1613    * Set visibility of alignment annotations of specified types (labels), for
1614    * specified sequences. This supports controls like "Show all secondary
1615    * structure", "Hide all Temp factor", etc.
1616    *
1617    * @al the alignment to scan for annotations
1618    * @param types
1619    * the types (labels) of annotations to be updated
1620    * @param forSequences
1621    * if not null, only annotations linked to one of these sequences are
1622    * in scope for update; if null, acts on all sequence annotations
1623    * @param anyType
1624    * if this flag is true, 'types' is ignored (label not checked)
1625    * @param doShow
1626    * if true, set visibility on, else set off
1627    */
 
1628  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1629    Collection<String> types, List<SequenceI> forSequences,
1630    boolean anyType, boolean doShow)
1631    {
1632  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1633  5 if (anns != null)
1634    {
1635  5 for (AlignmentAnnotation aa : anns)
1636    {
1637  30 if (anyType || types.contains(aa.label))
1638    {
1639  21 if ((aa.sequenceRef != null) && (forSequences == null
1640    || forSequences.contains(aa.sequenceRef)))
1641    {
1642  11 aa.visible = doShow;
1643    }
1644    }
1645    }
1646    }
1647    }
1648   
1649    /**
1650    * Shows or hides auto calculated annotations for a sequence group.
1651    *
1652    * @param al
1653    * The alignment object with the annotations.
1654    * @param type
1655    * The type of annotation to show or hide.
1656    * @param selectedGroup
1657    * The sequence group for which the annotations should be shown or
1658    * hidden.
1659    * @param anyType
1660    * If true, all types of annotations will be shown/hidden.
1661    * @param doShow
1662    * If true, the annotations will be shown; if false, annotations will
1663    * be hidden.
1664    */
 
1665  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1666    AlignmentI al, String type, SequenceGroup selectedGroup,
1667    boolean anyType, boolean doShow)
1668    {
1669    // Get all alignment annotations
1670  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1671   
1672  0 if (anns != null)
1673    {
1674  0 for (AlignmentAnnotation aa : anns)
1675    {
1676    // Check if anyType is true or if the annotation's label contains the
1677    // specified type (currently for secondary structure consensus)
1678  0 if ((anyType && aa.label
1679    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1680    || aa.label.startsWith(type))
1681    {
1682    // If the annotation's group reference is not null and matches the
1683    // selected group, update its visibility.
1684  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1685    {
1686  0 aa.visible = doShow;
1687    }
1688    }
1689    }
1690    }
1691    }
1692   
 
1693  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1694    AlignmentI al, int graphType)
1695    {
1696  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1697  0 if (anns != null)
1698    {
1699  0 for (AlignmentAnnotation aa : anns)
1700    {
1701  0 if (aa.sequenceRef != null && aa.graph == graphType)
1702  0 return aa;
1703    }
1704    }
1705  0 return null;
1706    }
1707   
1708    /**
1709    * Returns true if either sequence has a cross-reference to the other
1710    *
1711    * @param seq1
1712    * @param seq2
1713    * @return
1714    */
 
1715  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1716    {
1717    // Note: moved here from class CrossRef as the latter class has dependencies
1718    // not availability to the applet's classpath
1719  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1720    }
1721   
1722    /**
1723    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1724    * that sequence name is structured as Source|AccessionId.
1725    *
1726    * @param seq1
1727    * @param seq2
1728    * @return
1729    */
 
1730  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1731    {
1732  108 if (seq1 == null || seq2 == null)
1733    {
1734  8 return false;
1735    }
1736  100 String name = seq2.getName();
1737  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1738  100 if (xrefs != null)
1739    {
1740  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1741    {
1742  24 DBRefEntry xref = xrefs.get(ix);
1743  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1744    // case-insensitive test, consistent with DBRefEntry.equalRef()
1745  24 if (xrefName.equalsIgnoreCase(name))
1746    {
1747  12 return true;
1748    }
1749    }
1750    }
1751  88 return false;
1752    }
1753   
1754    /**
1755    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1756    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1757    * added to the original alignment's dataset, which is shared by the new
1758    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1759    * added to the alignment dataset.
1760    *
1761    * @param dna
1762    * aligned nucleotide (dna or cds) sequences
1763    * @param dataset
1764    * the alignment dataset the sequences belong to
1765    * @param products
1766    * (optional) to restrict results to CDS that map to specified
1767    * protein products
1768    * @return an alignment whose sequences are the cds-only parts of the dna
1769    * sequences (or null if no mappings are found)
1770    */
 
1771  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1772    AlignmentI dataset, SequenceI[] products)
1773    {
1774  6 if (dataset == null || dataset.getDataset() != null)
1775    {
1776  0 throw new IllegalArgumentException(
1777    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1778    }
1779  6 List<SequenceI> foundSeqs = new ArrayList<>();
1780  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1781  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1782  6 HashSet<SequenceI> productSeqs = null;
1783  6 if (products != null)
1784    {
1785  3 productSeqs = new HashSet<>();
1786  3 for (SequenceI seq : products)
1787    {
1788  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1789    : seq.getDatasetSequence());
1790    }
1791    }
1792   
1793    /*
1794    * Construct CDS sequences from mappings on the alignment dataset.
1795    * The logic is:
1796    * - find the protein product(s) mapped to from each dna sequence
1797    * - if the mapping covers the whole dna sequence (give or take start/stop
1798    * codon), take the dna as the CDS sequence
1799    * - else search dataset mappings for a suitable dna sequence, i.e. one
1800    * whose whole sequence is mapped to the protein
1801    * - if no sequence found, construct one from the dna sequence and mapping
1802    * (and add it to dataset so it is found if this is repeated)
1803    */
1804  6 for (SequenceI dnaSeq : dna)
1805    {
1806  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1807    : dnaSeq.getDatasetSequence();
1808   
1809  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1810    .findMappingsForSequence(dnaSeq, mappings);
1811  52 for (AlignedCodonFrame mapping : seqMappings)
1812    {
1813  42 List<Mapping> mappingsFromSequence = mapping
1814    .getMappingsFromSequence(dnaSeq);
1815   
1816  42 for (Mapping aMapping : mappingsFromSequence)
1817    {
1818  44 MapList mapList = aMapping.getMap();
1819  44 if (mapList.getFromRatio() == 1)
1820    {
1821    /*
1822    * not a dna-to-protein mapping (likely dna-to-cds)
1823    */
1824  11 continue;
1825    }
1826   
1827    /*
1828    * skip if mapping is not to one of the target set of proteins
1829    */
1830  33 SequenceI proteinProduct = aMapping.getTo();
1831  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1832    {
1833  2 continue;
1834    }
1835   
1836    /*
1837    * try to locate the CDS from the dataset mappings;
1838    * guard against duplicate results (for the case that protein has
1839    * dbrefs to both dna and cds sequences)
1840    */
1841  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1842    seqMappings, aMapping);
1843  31 if (cdsSeq != null)
1844    {
1845  11 if (!foundSeqs.contains(cdsSeq))
1846    {
1847  11 foundSeqs.add(cdsSeq);
1848  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1849  11 cdsSeqs.add(derivedSequence);
1850  11 if (!dataset.getSequences().contains(cdsSeq))
1851    {
1852  0 dataset.addSequence(cdsSeq);
1853    }
1854    }
1855  11 continue;
1856    }
1857   
1858    /*
1859    * didn't find mapped CDS sequence - construct it and add
1860    * its dataset sequence to the dataset
1861    */
1862  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1863    dataset).deriveSequence();
1864    // cdsSeq has a name constructed as CDS|<dbref>
1865    // <dbref> will be either the accession for the coding sequence,
1866    // marked in the /via/ dbref to the protein product accession
1867    // or it will be the original nucleotide accession.
1868  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1869   
1870  20 cdsSeqs.add(cdsSeq);
1871   
1872    /*
1873    * build the mapping from CDS to protein
1874    */
1875  20 List<int[]> cdsRange = Collections
1876    .singletonList(new int[]
1877    { cdsSeq.getStart(),
1878    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1879  20 MapList cdsToProteinMap = new MapList(cdsRange,
1880    mapList.getToRanges(), mapList.getFromRatio(),
1881    mapList.getToRatio());
1882   
1883  20 if (!dataset.getSequences().contains(cdsSeqDss))
1884    {
1885    /*
1886    * if this sequence is a newly created one, add it to the dataset
1887    * and made a CDS to protein mapping (if sequence already exists,
1888    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1889    */
1890  20 dataset.addSequence(cdsSeqDss);
1891  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1892  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1893    cdsToProteinMap);
1894   
1895    /*
1896    * guard against duplicating the mapping if repeating this action
1897    */
1898  20 if (!mappings.contains(cdsToProteinMapping))
1899    {
1900  20 mappings.add(cdsToProteinMapping);
1901    }
1902    }
1903   
1904  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1905    proteinProduct, aMapping);
1906    /*
1907    * add another mapping from original 'from' range to CDS
1908    */
1909  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1910  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1911    cdsRange, 1, 1);
1912  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1913    dnaToCdsMap);
1914  20 if (!mappings.contains(dnaToCdsMapping))
1915    {
1916  20 mappings.add(dnaToCdsMapping);
1917    }
1918   
1919    /*
1920    * transfer dna chromosomal loci (if known) to the CDS
1921    * sequence (via the mapping)
1922    */
1923  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1924  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1925   
1926    /*
1927    * add DBRef with mapping from protein to CDS
1928    * (this enables Get Cross-References from protein alignment)
1929    * This is tricky because we can't have two DBRefs with the
1930    * same source and accession, so need a different accession for
1931    * the CDS from the dna sequence
1932    */
1933   
1934    // specific use case:
1935    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1936    // ENSG02, ENSG03, with transcripts and products similarly named.
1937    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1938   
1939    // JBPNote: ?? can't actually create an example that demonstrates we
1940    // need to
1941    // synthesize an xref.
1942   
1943  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1944  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1945    {
1946  13 DBRefEntry primRef = primrefs.get(ip);
1947    /*
1948    * create a cross-reference from CDS to the source sequence's
1949    * primary reference and vice versa
1950    */
1951  13 String source = primRef.getSource();
1952  13 String version = primRef.getVersion();
1953  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1954    source + ":" + version, primRef.getAccessionId());
1955  13 cdsCrossRef
1956    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1957  13 cdsSeqDss.addDBRef(cdsCrossRef);
1958   
1959  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1960    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1961    // problem here is that the cross-reference is synthesized -
1962    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1963    // 'CDS|emblcdsacc'
1964    // assuming cds version same as dna ?!?
1965   
1966  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1967    cdsSeq.getName());
1968    //
1969  13 proteinToCdsRef.setMap(
1970    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1971  13 proteinProduct.addDBRef(proteinToCdsRef);
1972    }
1973    /*
1974    * transfer any features on dna that overlap the CDS
1975    */
1976  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1977    SequenceOntologyI.CDS);
1978    }
1979    }
1980    }
1981   
1982  6 AlignmentI cds = new Alignment(
1983    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1984  6 cds.setDataset(dataset);
1985   
1986  6 return cds;
1987    }
1988   
1989    /**
1990    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1991    * toSeq, mediated by the given mapping between the sequences
1992    *
1993    * @param fromSeq
1994    * @param targetToFrom
1995    * Map
1996    * @param targetSeq
1997    */
 
1998  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
1999    MapList targetToFrom, SequenceI targetSeq)
2000    {
2001  23 if (targetSeq.getGeneLoci() != null)
2002    {
2003    // already have - don't override
2004  1 return;
2005    }
2006  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
2007  22 if (fromLoci == null)
2008    {
2009  10 return;
2010    }
2011   
2012  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2013   
2014  12 if (newMap != null)
2015    {
2016  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2017    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2018    }
2019    }
2020   
2021    /**
2022    * A helper method that finds a CDS sequence in the alignment dataset that is
2023    * mapped to the given protein sequence, and either is, or has a mapping from,
2024    * the given dna sequence.
2025    *
2026    * @param mappings
2027    * set of all mappings on the dataset
2028    * @param dnaSeq
2029    * a dna (or cds) sequence we are searching from
2030    * @param seqMappings
2031    * the set of mappings involving dnaSeq
2032    * @param aMapping
2033    * a transcript-to-peptide mapping
2034    * @return
2035    */
 
2036  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2037    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2038    Mapping aMapping)
2039    {
2040    /*
2041    * TODO a better dna-cds-protein mapping data representation to allow easy
2042    * navigation; until then this clunky looping around lists of mappings
2043    */
2044  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2045    : dnaSeq.getDatasetSequence();
2046  38 SequenceI proteinProduct = aMapping.getTo();
2047   
2048    /*
2049    * is this mapping from the whole dna sequence (i.e. CDS)?
2050    * allowing for possible stop codon on dna but not peptide
2051    */
2052  38 int mappedFromLength = MappingUtils
2053    .getLength(aMapping.getMap().getFromRanges());
2054  38 int dnaLength = seqDss.getLength();
2055  38 if (mappedFromLength == dnaLength
2056    || mappedFromLength == dnaLength - CODON_LENGTH)
2057    {
2058    /*
2059    * if sequence has CDS features, this is a transcript with no UTR
2060    * - do not take this as the CDS sequence! (JAL-2789)
2061    */
2062  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2063    .isEmpty())
2064    {
2065  1 return seqDss;
2066    }
2067    }
2068   
2069    /*
2070    * looks like we found the dna-to-protein mapping; search for the
2071    * corresponding cds-to-protein mapping
2072    */
2073  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2074    .findMappingsForSequence(proteinProduct, mappings);
2075  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2076    {
2077  52 for (SequenceToSequenceMapping map : acf.getMappings())
2078    {
2079  276 Mapping mapping = map.getMapping();
2080  276 if (mapping != aMapping
2081    && mapping.getMap().getFromRatio() == CODON_LENGTH
2082    && proteinProduct == mapping.getTo()
2083    && seqDss != map.getFromSeq())
2084    {
2085  15 mappedFromLength = MappingUtils
2086    .getLength(mapping.getMap().getFromRanges());
2087  15 if (mappedFromLength == map.getFromSeq().getLength())
2088    {
2089    /*
2090    * found a 3:1 mapping to the protein product which covers
2091    * the whole dna sequence i.e. is from CDS; finally check the CDS
2092    * is mapped from the given dna start sequence
2093    */
2094  15 SequenceI cdsSeq = map.getFromSeq();
2095    // todo this test is weak if seqMappings contains multiple mappings;
2096    // we get away with it if transcript:cds relationship is 1:1
2097  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2098    .findMappingsForSequence(cdsSeq, seqMappings);
2099  15 if (!dnaToCdsMaps.isEmpty())
2100    {
2101  13 return cdsSeq;
2102    }
2103    }
2104    }
2105    }
2106    }
2107  24 return null;
2108    }
2109   
2110    /**
2111    * Helper method that makes a CDS sequence as defined by the mappings from the
2112    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2113    * forward or reverse strand).
2114    *
2115    * @param seq
2116    * @param mapping
2117    * @param dataset
2118    * - existing dataset. We check for sequences that look like the CDS
2119    * we are about to construct, if one exists already, then we will
2120    * just return that one.
2121    * @return CDS sequence (as a dataset sequence)
2122    */
 
2123  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2124    AlignmentI dataset)
2125    {
2126    /*
2127    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2128    * if set (e.g. EMBL protein_id), else sequence name appended
2129    */
2130  20 String mapFromId = mapping.getMappedFromId();
2131  20 final String seqId = "CDS|"
2132  20 + (mapFromId != null ? mapFromId : seq.getName());
2133   
2134  20 SequenceI newSeq = null;
2135   
2136    /*
2137    * construct CDS sequence by splicing mapped from ranges
2138    */
2139  20 char[] seqChars = seq.getSequence();
2140  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2141  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2142  20 char[] newSeqChars = new char[cdsWidth];
2143   
2144  20 int newPos = 0;
2145  20 for (int[] range : fromRanges)
2146    {
2147  32 if (range[0] <= range[1])
2148    {
2149    // forward strand mapping - just copy the range
2150  32 int length = range[1] - range[0] + 1;
2151  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2152    length);
2153  32 newPos += length;
2154    }
2155    else
2156    {
2157    // reverse strand mapping - copy and complement one by one
2158  0 for (int i = range[0]; i >= range[1]; i--)
2159    {
2160  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2161    }
2162    }
2163   
2164  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2165    }
2166   
2167  20 if (dataset != null)
2168    {
2169  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2170  20 if (matches != null)
2171    {
2172  20 boolean matched = false;
2173  20 for (SequenceI mtch : matches)
2174    {
2175  3 if (mtch.getStart() != newSeq.getStart())
2176    {
2177  0 continue;
2178    }
2179  3 if (mtch.getEnd() != newSeq.getEnd())
2180    {
2181  0 continue;
2182    }
2183  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2184    {
2185  3 continue;
2186    }
2187  0 if (!matched)
2188    {
2189  0 matched = true;
2190  0 newSeq = mtch;
2191    }
2192    else
2193    {
2194  0 Console.error(
2195    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2196    + mtch.toString());
2197    }
2198    }
2199    }
2200    }
2201    // newSeq.setDescription(mapFromId);
2202   
2203  20 return newSeq;
2204    }
2205   
2206    /**
2207    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2208    * the given mapping.
2209    *
2210    * @param cdsSeq
2211    * @param contig
2212    * @param proteinProduct
2213    * @param mapping
2214    * @return list of DBRefEntrys added
2215    */
 
2216  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2217    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2218    {
2219   
2220    // gather direct refs from contig congruent with mapping
2221  20 List<DBRefEntry> direct = new ArrayList<>();
2222  20 HashSet<String> directSources = new HashSet<>();
2223   
2224  20 List<DBRefEntry> refs = contig.getDBRefs();
2225  20 if (refs != null)
2226    {
2227  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2228    {
2229  279 DBRefEntry dbr = refs.get(ib);
2230  279 MapList map;
2231  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2232    {
2233    // check if map is the CDS mapping
2234  24 if (mapping.getMap().equals(map))
2235    {
2236  24 direct.add(dbr);
2237  24 directSources.add(dbr.getSource());
2238    }
2239    }
2240    }
2241    }
2242  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2243    proteinProduct.getDBRefs(),
2244    directSources.toArray(new String[0]));
2245  20 List<DBRefEntry> propagated = new ArrayList<>();
2246   
2247    // and generate appropriate mappings
2248  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2249    {
2250  24 DBRefEntry cdsref = direct.get(ic);
2251  24 Mapping m = cdsref.getMap();
2252    // clone maplist and mapping
2253  24 MapList cdsposmap = new MapList(
2254    Arrays.asList(new int[][]
2255    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2256    m.getMap().getToRanges(), 3, 1);
2257  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2258   
2259    // create dbref
2260  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2261    cdsref.getVersion(), cdsref.getAccessionId(),
2262    new Mapping(cdsmap.getTo(), cdsposmap));
2263   
2264    // and see if we can map to the protein product for this mapping.
2265    // onSource is the filtered set of accessions on protein that we are
2266    // tranferring, so we assume accession is the same.
2267  24 if (cdsmap.getTo() == null && onSource != null)
2268    {
2269  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2270    cdsref.getAccessionId());
2271  2 if (sourceRefs != null)
2272    {
2273  2 for (DBRefEntry srcref : sourceRefs)
2274    {
2275  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2276    {
2277    // we have found a complementary dbref on the protein product, so
2278    // update mapping's getTo
2279  2 newref.getMap().setTo(proteinProduct);
2280    }
2281    }
2282    }
2283    }
2284  24 cdsSeq.addDBRef(newref);
2285  24 propagated.add(newref);
2286    }
2287  20 return propagated;
2288    }
2289   
2290    /**
2291    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2292    * feature start/end ranges, optionally omitting specified feature types.
2293    * Returns the number of features copied.
2294    *
2295    * @param fromSeq
2296    * @param toSeq
2297    * @param mapping
2298    * the mapping from 'fromSeq' to 'toSeq'
2299    * @param select
2300    * if not null, only features of this type are copied (including
2301    * subtypes in the Sequence Ontology)
2302    * @param omitting
2303    */
 
2304  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2305    MapList mapping, String select, String... omitting)
2306    {
2307  23 SequenceI copyTo = toSeq;
2308  43 while (copyTo.getDatasetSequence() != null)
2309    {
2310  20 copyTo = copyTo.getDatasetSequence();
2311    }
2312  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2313    {
2314  0 return 0; // shared dataset sequence
2315    }
2316   
2317    /*
2318    * get features, optionally restricted by an ontology term
2319    */
2320  23 List<SequenceFeature> sfs = select == null
2321    ? fromSeq.getFeatures().getPositionalFeatures()
2322    : fromSeq.getFeatures().getFeaturesByOntology(select);
2323   
2324  23 int count = 0;
2325  23 for (SequenceFeature sf : sfs)
2326    {
2327  9610 String type = sf.getType();
2328  9610 boolean omit = false;
2329  9610 for (String toOmit : omitting)
2330    {
2331  9603 if (type.equals(toOmit))
2332    {
2333  134 omit = true;
2334    }
2335    }
2336  9610 if (omit)
2337    {
2338  134 continue;
2339    }
2340   
2341    /*
2342    * locate the mapped range - null if either start or end is
2343    * not mapped (no partial overlaps are calculated)
2344    */
2345  9476 int start = sf.getBegin();
2346  9476 int end = sf.getEnd();
2347  9476 int[] mappedTo = mapping.locateInTo(start, end);
2348    /*
2349    * if whole exon range doesn't map, try interpreting it
2350    * as 5' or 3' exon overlapping the CDS range
2351    */
2352  9476 if (mappedTo == null)
2353    {
2354  4447 mappedTo = mapping.locateInTo(end, end);
2355  4447 if (mappedTo != null)
2356    {
2357    /*
2358    * end of exon is in CDS range - 5' overlap
2359    * to a range from the start of the peptide
2360    */
2361  0 mappedTo[0] = 1;
2362    }
2363    }
2364  9476 if (mappedTo == null)
2365    {
2366  4447 mappedTo = mapping.locateInTo(start, start);
2367  4447 if (mappedTo != null)
2368    {
2369    /*
2370    * start of exon is in CDS range - 3' overlap
2371    * to a range up to the end of the peptide
2372    */
2373  0 mappedTo[1] = toSeq.getLength();
2374    }
2375    }
2376  9476 if (mappedTo != null)
2377    {
2378  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2379  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2380  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2381    sf.getFeatureGroup(), sf.getScore());
2382  5029 copyTo.addSequenceFeature(copy);
2383  5029 count++;
2384    }
2385    }
2386  23 return count;
2387    }
2388   
2389    /**
2390    * Returns a mapping from dna to protein by inspecting sequence features of
2391    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2392    * length is 3 times the peptide length (optionally after dropping a trailing
2393    * stop codon). This method does not check whether the CDS nucleotide sequence
2394    * translates to the peptide sequence.
2395    *
2396    * @param dnaSeq
2397    * @param proteinSeq
2398    * @return
2399    */
 
2400  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2401    SequenceI proteinSeq)
2402    {
2403  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2404  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2405   
2406    /*
2407    * if not a whole number of codons, truncate mapping
2408    */
2409  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2410  22 if (codonRemainder > 0)
2411    {
2412  2 mappedDnaLength -= codonRemainder;
2413  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2414    }
2415   
2416  22 int proteinLength = proteinSeq.getLength();
2417  22 int proteinStart = proteinSeq.getStart();
2418  22 int proteinEnd = proteinSeq.getEnd();
2419   
2420    /*
2421    * incomplete start codon may mean X at start of peptide
2422    * we ignore both for mapping purposes
2423    */
2424  22 if (proteinSeq.getCharAt(0) == 'X')
2425    {
2426    // todo JAL-2022 support startPhase > 0
2427  1 proteinStart++;
2428  1 proteinLength--;
2429    }
2430  22 List<int[]> proteinRange = new ArrayList<>();
2431   
2432    /*
2433    * dna length should map to protein (or protein plus stop codon)
2434    */
2435  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2436  22 if (codesForResidues == (proteinLength + 1))
2437    {
2438    // assuming extra codon is for STOP and not in peptide
2439    // todo: check trailing codon is indeed a STOP codon
2440  2 codesForResidues--;
2441  2 mappedDnaLength -= CODON_LENGTH;
2442  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2443    }
2444   
2445  22 if (codesForResidues == proteinLength)
2446    {
2447  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2448  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2449    }
2450  18 return null;
2451    }
2452   
2453    /**
2454    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2455    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2456    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2457    * position order, so this method is only valid for linear CDS in the same
2458    * sense as the protein product.
2459    *
2460    * @param dnaSeq
2461    * @return
2462    */
 
2463  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2464    {
2465  24 List<int[]> result = new ArrayList<>();
2466   
2467  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2468    .getFeaturesByOntology(SequenceOntologyI.CDS);
2469  24 if (sfs.isEmpty())
2470    {
2471  16 return result;
2472    }
2473  8 SequenceFeatures.sortFeatures(sfs, true);
2474   
2475  8 for (SequenceFeature sf : sfs)
2476    {
2477  16 int phase = 0;
2478  16 try
2479    {
2480  16 String s = sf.getPhase();
2481  16 if (s != null)
2482    {
2483  2 phase = Integer.parseInt(s);
2484    }
2485    } catch (NumberFormatException e)
2486    {
2487    // leave as zero
2488    }
2489    /*
2490    * phase > 0 on first codon means 5' incomplete - skip to the start
2491    * of the next codon; example ENST00000496384
2492    */
2493  16 int begin = sf.getBegin();
2494  16 int end = sf.getEnd();
2495  16 if (result.isEmpty() && phase > 0)
2496    {
2497  2 begin += phase;
2498  2 if (begin > end)
2499    {
2500    // shouldn't happen!
2501  0 System.err
2502    .println("Error: start phase extends beyond start CDS in "
2503    + dnaSeq.getName());
2504    }
2505    }
2506  16 result.add(new int[] { begin, end });
2507    }
2508   
2509    /*
2510    * Finally sort ranges by start position. This avoids a dependency on
2511    * keeping features in order on the sequence (if they are in order anyway,
2512    * the sort will have almost no work to do). The implicit assumption is CDS
2513    * ranges are assembled in order. Other cases should not use this method,
2514    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2515    */
2516  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2517  8 return result;
2518    }
2519   
2520    /**
2521    * Makes an alignment with a copy of the given sequences, adding in any
2522    * non-redundant sequences which are mapped to by the cross-referenced
2523    * sequences.
2524    *
2525    * @param seqs
2526    * @param xrefs
2527    * @param dataset
2528    * the alignment dataset shared by the new copy
2529    * @return
2530    */
 
2531  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2532    SequenceI[] xrefs, AlignmentI dataset)
2533    {
2534  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2535  0 copy.setDataset(dataset);
2536  0 boolean isProtein = !copy.isNucleotide();
2537  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2538  0 if (xrefs != null)
2539    {
2540    // BH 2019.01.25 recoded to remove iterators
2541   
2542  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2543    {
2544  0 SequenceI xref = xrefs[ix];
2545  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2546  0 if (dbrefs != null)
2547    {
2548  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2549    {
2550  0 DBRefEntry dbref = dbrefs.get(ir);
2551  0 Mapping map = dbref.getMap();
2552  0 SequenceI mto;
2553  0 if (map == null || (mto = map.getTo()) == null
2554    || mto.isProtein() != isProtein)
2555    {
2556  0 continue;
2557    }
2558  0 SequenceI mappedTo = mto;
2559  0 SequenceI match = matcher.findIdMatch(mappedTo);
2560  0 if (match == null)
2561    {
2562  0 matcher.add(mappedTo);
2563  0 copy.addSequence(mappedTo);
2564    }
2565    }
2566    }
2567    }
2568    }
2569  0 return copy;
2570    }
2571   
2572    /**
2573    * Try to align sequences in 'unaligned' to match the alignment of their
2574    * mapped regions in 'aligned'. For example, could use this to align CDS
2575    * sequences which are mapped to their parent cDNA sequences.
2576    *
2577    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2578    * dna-to-protein or protein-to-dna use alternative methods.
2579    *
2580    * @param unaligned
2581    * sequences to be aligned
2582    * @param aligned
2583    * holds aligned sequences and their mappings
2584    * @return
2585    */
 
2586  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2587    {
2588    /*
2589    * easy case - aligning a copy of aligned sequences
2590    */
2591  4 if (alignAsSameSequences(unaligned, aligned))
2592    {
2593  0 return unaligned.getHeight();
2594    }
2595   
2596    /*
2597    * fancy case - aligning via mappings between sequences
2598    */
2599  4 List<SequenceI> unmapped = new ArrayList<>();
2600  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2601    unaligned, aligned, unmapped);
2602  4 int width = columnMap.size();
2603  4 char gap = unaligned.getGapCharacter();
2604  4 int realignedCount = 0;
2605    // TODO: verify this loop scales sensibly for very wide/high alignments
2606   
2607  4 for (SequenceI seq : unaligned.getSequences())
2608    {
2609  26 if (!unmapped.contains(seq))
2610    {
2611  26 char[] newSeq = new char[width];
2612  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2613    // Integer iteration below
2614  26 int newCol = 0;
2615  26 int lastCol = 0;
2616   
2617    /*
2618    * traverse the map to find columns populated
2619    * by our sequence
2620    */
2621  26 for (Integer column : columnMap.keySet())
2622    {
2623  58976 Character c = columnMap.get(column).get(seq);
2624  58976 if (c != null)
2625    {
2626    /*
2627    * sequence has a character at this position
2628    *
2629    */
2630  31986 newSeq[newCol] = c;
2631  31986 lastCol = newCol;
2632    }
2633  58976 newCol++;
2634    }
2635   
2636    /*
2637    * trim trailing gaps
2638    */
2639  26 if (lastCol < width)
2640    {
2641  26 char[] tmp = new char[lastCol + 1];
2642  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2643  26 newSeq = tmp;
2644    }
2645    // TODO: optimise SequenceI to avoid char[]->String->char[]
2646  26 seq.setSequence(String.valueOf(newSeq));
2647  26 realignedCount++;
2648    }
2649    }
2650  4 return realignedCount;
2651    }
2652   
2653    /**
2654    * If unaligned and aligned sequences share the same dataset sequences, then
2655    * simply copies the aligned sequences to the unaligned sequences and returns
2656    * true; else returns false
2657    *
2658    * @param unaligned
2659    * - sequences to be aligned based on aligned
2660    * @param aligned
2661    * - 'guide' alignment containing sequences derived from same dataset
2662    * as unaligned
2663    * @return
2664    */
 
2665  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2666    AlignmentI aligned)
2667    {
2668  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2669    {
2670  0 return false; // should only pass alignments with datasets here
2671    }
2672   
2673    // map from dataset sequence to alignment sequence(s)
2674  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2675  8 for (SequenceI seq : aligned.getSequences())
2676    {
2677  59 SequenceI ds = seq.getDatasetSequence();
2678  59 if (alignedDatasets.get(ds) == null)
2679    {
2680  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2681    }
2682  59 alignedDatasets.get(ds).add(seq);
2683    }
2684   
2685    /*
2686    * first pass - check whether all sequences to be aligned share a
2687    * dataset sequence with an aligned sequence; also note the leftmost
2688    * ungapped column from which to copy
2689    */
2690  8 int leftmost = Integer.MAX_VALUE;
2691  8 for (SequenceI seq : unaligned.getSequences())
2692    {
2693  14 final SequenceI ds = seq.getDatasetSequence();
2694  14 if (!alignedDatasets.containsKey(ds))
2695    {
2696  5 return false;
2697    }
2698  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2699  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2700  9 leftmost = Math.min(leftmost, startCol);
2701    }
2702   
2703    /*
2704    * second pass - copy aligned sequences;
2705    * heuristic rule: pair off sequences in order for the case where
2706    * more than one shares the same dataset sequence
2707    */
2708  3 final char gapCharacter = aligned.getGapCharacter();
2709  3 for (SequenceI seq : unaligned.getSequences())
2710    {
2711  7 List<SequenceI> alignedSequences = alignedDatasets
2712    .get(seq.getDatasetSequence());
2713  7 if (alignedSequences.isEmpty())
2714    {
2715    /*
2716    * defensive check - shouldn't happen! (JAL-3536)
2717    */
2718  0 continue;
2719    }
2720  7 SequenceI alignedSeq = alignedSequences.get(0);
2721   
2722    /*
2723    * gap fill for leading (5') UTR if any
2724    */
2725    // TODO this copies intron columns - wrong!
2726  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2727  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2728  7 char[] seqchars = new char[endCol - leftmost + 1];
2729  7 Arrays.fill(seqchars, gapCharacter);
2730  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2731  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2732    toCopy.length);
2733  7 seq.setSequence(String.valueOf(seqchars));
2734  7 if (alignedSequences.size() > 0)
2735    {
2736    // pop off aligned sequences (except the last one)
2737  7 alignedSequences.remove(0);
2738    }
2739    }
2740   
2741    /*
2742    * finally remove gapped columns (e.g. introns)
2743    */
2744  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2745    unaligned.getWidth() - 1, unaligned);
2746   
2747  3 return true;
2748    }
2749   
2750    /**
2751    * Returns a map whose key is alignment column number (base 1), and whose
2752    * values are a map of sequence characters in that column.
2753    *
2754    * @param unaligned
2755    * @param aligned
2756    * @param unmapped
2757    * @return
2758    */
 
2759  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2760    AlignmentI unaligned, AlignmentI aligned,
2761    List<SequenceI> unmapped)
2762    {
2763    /*
2764    * Map will hold, for each aligned column position, a map of
2765    * {unalignedSequence, characterPerSequence} at that position.
2766    * TreeMap keeps the entries in ascending column order.
2767    */
2768  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2769   
2770    /*
2771    * record any sequences that have no mapping so can't be realigned
2772    */
2773  4 unmapped.addAll(unaligned.getSequences());
2774   
2775  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2776   
2777  4 for (SequenceI seq : unaligned.getSequences())
2778    {
2779  26 for (AlignedCodonFrame mapping : mappings)
2780    {
2781  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2782  510 if (fromSeq != null)
2783    {
2784  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2785  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2786    {
2787  26 unmapped.remove(seq);
2788    }
2789    }
2790    }
2791    }
2792  4 return map;
2793    }
2794   
2795    /**
2796    * Helper method that adds to a map the mapped column positions of a sequence.
2797    * <br>
2798    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2799    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2800    * sequence.
2801    *
2802    * @param seq
2803    * the sequence whose column positions we are recording
2804    * @param fromSeq
2805    * a sequence that is mapped to the first sequence
2806    * @param seqMap
2807    * the mapping from 'fromSeq' to 'seq'
2808    * @param map
2809    * a map to add the column positions (in fromSeq) of the mapped
2810    * positions of seq
2811    * @return
2812    */
 
2813  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2814    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2815    {
2816  28 if (seqMap == null)
2817    {
2818  0 return false;
2819    }
2820   
2821    /*
2822    * invert mapping if it is from unaligned to aligned sequence
2823    */
2824  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2825    {
2826  0 seqMap = new Mapping(seq.getDatasetSequence(),
2827    seqMap.getMap().getInverse());
2828    }
2829   
2830  28 int toStart = seq.getStart();
2831   
2832    /*
2833    * traverse [start, end, start, end...] ranges in fromSeq
2834    */
2835  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2836    {
2837  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2838    {
2839  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2840   
2841    /*
2842    * find the range mapped to (sequence positions base 1)
2843    */
2844  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2845    fromRange[i + 1]);
2846  31 if (range == null)
2847    {
2848  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2849    + " from " + fromSeq.getName());
2850  0 return false;
2851    }
2852  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2853  31 int mappedCharPos = range[0];
2854   
2855    /*
2856    * walk over the 'from' aligned sequence in forward or reverse
2857    * direction; when a non-gap is found, record the column position
2858    * of the next character of the mapped-to sequence; stop when all
2859    * the characters of the range have been counted
2860    */
2861  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2862    && fromCol >= 0)
2863    {
2864  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2865    {
2866    /*
2867    * mapped from sequence has a character in this column
2868    * record the column position for the mapped to character
2869    */
2870  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2871  31998 if (seqsMap == null)
2872    {
2873  5398 seqsMap = new HashMap<>();
2874  5398 map.put(fromCol, seqsMap);
2875    }
2876  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2877  31998 mappedCharPos++;
2878    }
2879  2794243 fromCol += (forward ? 1 : -1);
2880    }
2881    }
2882    }
2883  28 return true;
2884    }
2885   
2886    // strictly temporary hack until proper criteria for aligning protein to cds
2887    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2888  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2889    {
2890  4 for (SequenceI seq : alignment.getSequences())
2891    {
2892  88 String name = seq.getName();
2893  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2894    {
2895  0 return false;
2896    }
2897    }
2898  4 return true;
2899    }
2900   
2901    /**
2902    * This method filters and returns the secondary structure annotations
2903    * in an array of annotations
2904    *
2905    * @param annotations
2906    * @return array of secondary structure annotations
2907    */
 
2908  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnots(
2909    AlignmentAnnotation[] annotations)
2910    {
2911  0 List<AlignmentAnnotation> ssAnnotations = new ArrayList<>();
2912  0 if (annotations == null || annotations.length == 0)
2913    {
2914  0 return null;
2915    }
2916   
2917  0 synchronized (annotations)
2918    {
2919  0 for (AlignmentAnnotation aa : annotations)
2920    {
2921  0 if (aa == null)
2922    {
2923  0 continue;
2924    }
2925   
2926  0 if (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2927    .containsKey(aa.label))
2928    {
2929  0 ssAnnotations.add(aa);
2930    }
2931    }
2932    }
2933  0 return ssAnnotations;
2934   
2935    }
2936   
 
2937  19 toggle public static boolean isSecondaryStructurePresent(
2938    AlignmentAnnotation[] annotations)
2939    {
2940  19 boolean ssPresent = false;
2941  19 if (annotations == null || annotations.length == 0)
2942    {
2943  8 return false;
2944    }
2945   
2946  11 synchronized (annotations)
2947    {
2948  11 for (AlignmentAnnotation aa : annotations)
2949    {
2950  12 if (aa == null)
2951    {
2952  0 continue;
2953    }
2954    // TODO: @RENIA decide what exactly we should be testing for - not
2955    // strings/labels
2956  12 if (// aa.isValidStruc() || aa.hasIcons ||
2957  12 (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2958    .containsKey(aa.label)))
2959    {
2960  9 ssPresent = true;
2961  9 break;
2962    }
2963    }
2964    }
2965  11 return ssPresent;
2966   
2967    }
2968   
 
2969  4 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2970    {
2971   
2972  4 if (symbol == Constants.COIL)
2973    {
2974  1 return Color.gray;
2975    }
2976  3 if (symbol == Constants.SHEET)
2977    {
2978  1 return Color.green;
2979    }
2980  2 if (symbol == Constants.HELIX)
2981    {
2982  1 return Color.red;
2983    }
2984   
2985  1 return Color.white;
2986    }
2987   
 
2988  170802 toggle public static char findSSAnnotationForGivenSeqposition(
2989    AlignmentAnnotation aa, int seqPosition)
2990    {
2991  170806 char ss = '*';
2992   
2993  170810 if (aa != null)
2994    {
2995  170824 if (aa.getAnnotationForPosition(seqPosition) != null)
2996    {
2997  80343 Annotation a = aa.getAnnotationForPosition(seqPosition);
2998  80341 ss = a.secondaryStructure;
2999   
3000    // There is no representation for coil and it can be either ' ' or null.
3001  80345 if (ss == ' ' || ss == '-')
3002    {
3003  16565 ss = Constants.COIL;
3004    }
3005    }
3006    else
3007    {
3008  90484 ss = Constants.COIL;
3009    }
3010    }
3011   
3012  170815 return ss;
3013    }
3014   
 
3015  2194 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
3016    AlignmentAnnotation[] annotations)
3017    {
3018   
3019  2194 List<String> ssSources = new ArrayList<>();
3020  2194 Set<String> addedSources = new HashSet<>(); // to keep track of added
3021    // sources
3022   
3023  2194 if (annotations == null)
3024    {
3025  12 return ssSources;
3026    }
3027   
3028  2182 for (AlignmentAnnotation aa : annotations)
3029    {
3030   
3031  11844 String ssSource = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(aa);
3032   
3033  11844 if (ssSource != null && !addedSources.contains(ssSource))
3034    {
3035  204 ssSources.add(ssSource);
3036  204 addedSources.add(ssSource);
3037    }
3038   
3039    }
3040  2182 Collections.sort(ssSources);
3041   
3042  2182 return ssSources;
3043   
3044    }
3045   
3046   
3047    /**
3048    * Returns secondary structure annotations from the given sequence for a specified source.
3049    * @param seq the sequence
3050    * @param ssSource the source label ("PDB", "JPred")
3051    * @return list of matching annotations
3052    */
 
3053  11761800 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3054    SequenceI seq, String ssSource)
3055    {
3056   
3057  11772428 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3058   
3059    // Iterate through all known secondary structure labels
3060  11791776 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3061    {
3062   
3063  23560641 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3064  23477568 if (aa != null)
3065    {
3066    // If source is "ALL", include all annotations for this label
3067  280934 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3068    {
3069  110042 ssAnnots.addAll(Arrays.asList(aa));
3070  110042 continue;
3071    }
3072   
3073  170892 for (AlignmentAnnotation annot : aa)
3074    {
3075  202171 String ssSourceForAnnot = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(
3076    annot);
3077    // Add annotations from the matching source
3078  202155 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot)
3079    && annot.isForDisplay())
3080    {
3081  91565 ssAnnots.add(annot);
3082    }
3083    }
3084    }
3085    }
3086  11499247 if (ssAnnots.size() > 0)
3087    {
3088  176308 return ssAnnots;
3089    }
3090   
3091  11638468 return null;
3092   
3093    }
3094   
 
3095  48 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3096    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3097    {
3098   
3099  48 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3100  48 if (alignAnnotList == null || alignAnnotList.length == 0)
3101    {
3102  0 return ssAlignmentAnnotationForSequences;
3103    }
3104   
3105  48 for (AlignmentAnnotation aa : alignAnnotList)
3106    {
3107  251 if (aa.sequenceRef == null)
3108    {
3109  32 continue;
3110    }
3111   
3112  219 if (isSecondaryStructureFrom(selectedSSSource, aa))
3113    {
3114  78 ssAlignmentAnnotationForSequences
3115    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3116    k -> new ArrayList<>())
3117    .add(aa);
3118    }
3119    }
3120   
3121  48 return ssAlignmentAnnotationForSequences;
3122   
3123    }
3124   
3125    /**
3126    * param alignAnnotation
3127    * @param selectedSSSource
3128    * @return any alignment annotation that matches the given source
3129    */
 
3130  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnotionFor(List<AlignmentAnnotation> alignAnnotation, String selectedSSSource)
3131    {
3132  0 ArrayList<AlignmentAnnotation> annForSource = new ArrayList();
3133  0 for (AlignmentAnnotation alan:alignAnnotation)
3134    {
3135  0 if (isSecondaryStructureFrom(selectedSSSource, alan)) {
3136  0 annForSource.add(alan);
3137    }
3138    }
3139  0 return annForSource;
3140    }
3141   
3142    /**
3143    *
3144    * @param selectedSSSource
3145    * @param aa
3146    * @return true if aa is from a provider or all providers as specified by
3147    * selectedSSSource
3148    */
 
3149  219 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3150    AlignmentAnnotation aa)
3151    {
3152   
3153  219 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3154    {
3155   
3156  376 if (label.equals(aa.label))
3157    {
3158   
3159  170 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3160    {
3161  38 return true;
3162    }
3163  132 String ssSource = AlignmentAnnotationUtils
3164    .extractSSSourceFromAnnotationDescription(aa);
3165  132 if (ssSource != null && ssSource.equals(selectedSSSource))
3166    {
3167  40 return true;
3168    }
3169    }
3170    }
3171  141 return false;
3172    }
3173   
3174    // Method to get the key for a given provider value
 
3175  0 toggle public static String getSecondaryStructureProviderKey(
3176    String providerValue)
3177    {
3178  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS
3179    .entrySet())
3180    {
3181  0 if (entry.getValue().equals(providerValue))
3182    {
3183  0 return entry.getKey(); // Return the key (abbreviation) for the matching
3184    // provider value
3185    }
3186    }
3187  0 return null; // Return null if no match is found
3188    }
3189   
 
3190  0 toggle public static String reduceLabelLength(String label)
3191    {
3192    // Split the input by " | "
3193  0 String[] parts = label.split(" \\| ");
3194   
3195    // Map the full names to their abbreviations
3196  0 String reducedLabel = Arrays.stream(parts)
3197    .map(fullName -> Constants.STRUCTURE_PROVIDERS.entrySet()
3198    .stream()
3199    .filter(entry -> entry.getValue().equals(fullName))
3200    .map(Map.Entry::getKey).findFirst().orElse(fullName)) // Use
3201    // fullName
3202    // if
3203    // no
3204    // abbreviation
3205    // is
3206    // found
3207    .collect(Collectors.joining(" | "));
3208   
3209  0 return reducedLabel; // Return the reduced label
3210    }
3211   
 
3212  532 toggle public static HashMap<String, Color> assignColorsForSecondaryStructureProviders(
3213    List<String> labels)
3214    {
3215    // Loop through each unique label and add it to the map with a colour.
3216    // for (String label : uniqueLabels)
3217   
3218  532 HashMap<String, Color> secondaryStructureProviderColorMap = new HashMap<String, Color>();
3219  532 for (String label : labels)
3220    {
3221    // Generate or retrieve a colour for the label.
3222  491 String name = label.toUpperCase(Locale.ROOT).trim();
3223  491 secondaryStructureProviderColorMap.put(name,
3224    ColorUtils.getColourFromNameAndScheme(name, "NONE"));
3225    }
3226   
3227  532 return secondaryStructureProviderColorMap;
3228    }
3229    }