Clover icon

Coverage Report

  1. Project Clover database Thu Dec 4 2025 14:43:25 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

490
987
64
2
3,358
2,089
378
0.38
15.42
32
5.91

Classes

Class Line # Actions
AlignmentUtils 84 981 372
0.833660883.4%
AlignmentUtils.DnaVariant 100 6 6
0.00%
 

Contributing tests

This file is covered by 352 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.DBRefSource;
55    import jalview.datamodel.GeneLociI;
56    import jalview.datamodel.HiddenColumns;
57    import jalview.datamodel.IncompleteCodonException;
58    import jalview.datamodel.Mapping;
59    import jalview.datamodel.PDBEntry;
60    import jalview.datamodel.Sequence;
61    import jalview.datamodel.SequenceFeature;
62    import jalview.datamodel.SequenceGroup;
63    import jalview.datamodel.SequenceI;
64    import jalview.datamodel.features.SequenceFeatures;
65    import jalview.io.gff.SequenceOntologyI;
66    import jalview.schemes.ResidueProperties;
67    import jalview.util.ColorUtils;
68    import jalview.util.Comparison;
69    import jalview.util.Constants;
70    import jalview.util.DBRefUtils;
71    import jalview.util.IntRangeComparator;
72    import jalview.util.MapList;
73    import jalview.util.MappingUtils;
74    import jalview.util.ShiftList;
75    import uk.ac.vamsas.objects.core.descriptors.NewickDescriptor;
76   
77    /**
78    * grab bag of useful alignment manipulation operations Expect these to be
79    * refactored elsewhere at some point.
80    *
81    * @author jimp
82    *
83    */
 
84    public class AlignmentUtils
85    {
86    private static final int CODON_LENGTH = 3;
87   
88    private static final String SEQUENCE_VARIANT = "sequence_variant:";
89   
90    /*
91    * the 'id' attribute is provided for variant features fetched from
92    * Ensembl using its REST service with JSON format
93    */
94    public static final String VARIANT_ID = "id";
95   
96    /**
97    * A data model to hold the 'normal' base value at a position, and an optional
98    * sequence variant feature
99    */
 
100    static final class DnaVariant
101    {
102    final String base;
103   
104    SequenceFeature variant;
105   
 
106  0 toggle DnaVariant(String nuc)
107    {
108  0 base = nuc;
109  0 variant = null;
110    }
111   
 
112  0 toggle DnaVariant(String nuc, SequenceFeature var)
113    {
114  0 base = nuc;
115  0 variant = var;
116    }
117   
 
118  0 toggle public String getSource()
119    {
120  0 return variant == null ? null : variant.getFeatureGroup();
121    }
122   
123    /**
124    * toString for aid in the debugger only
125    */
 
126  0 toggle @Override
127    public String toString()
128    {
129  0 return base + ":" + (variant == null ? "" : variant.getDescription());
130    }
131    }
132   
133    /**
134    * given an existing alignment, create a new alignment including all, or up to
135    * flankSize additional symbols from each sequence's dataset sequence
136    *
137    * @param core
138    * @param flankSize
139    * @return AlignmentI
140    */
 
141  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
142    {
143  27 List<SequenceI> sq = new ArrayList<>();
144  27 int maxoffset = 0;
145  27 for (SequenceI s : core.getSequences())
146    {
147  131 SequenceI newSeq = s.deriveSequence();
148  131 final int newSeqStart = newSeq.getStart() - 1;
149  131 if (newSeqStart > maxoffset
150    && newSeq.getDatasetSequence().getStart() < s.getStart())
151    {
152  131 maxoffset = newSeqStart;
153    }
154  131 sq.add(newSeq);
155    }
156  27 if (flankSize > -1)
157    {
158  25 maxoffset = Math.min(maxoffset, flankSize);
159    }
160   
161    /*
162    * now add offset left and right to create an expanded alignment
163    */
164  27 for (SequenceI s : sq)
165    {
166  131 SequenceI ds = s;
167  262 while (ds.getDatasetSequence() != null)
168    {
169  131 ds = ds.getDatasetSequence();
170    }
171  131 int s_end = s.findPosition(s.getStart() + s.getLength());
172    // find available flanking residues for sequence
173  131 int ustream_ds = s.getStart() - ds.getStart();
174  131 int dstream_ds = ds.getEnd() - s_end;
175   
176    // build new flanked sequence
177   
178    // compute gap padding to start of flanking sequence
179  131 int offset = maxoffset - ustream_ds;
180   
181    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
182  131 if (flankSize >= 0)
183    {
184  125 if (flankSize < ustream_ds)
185    {
186    // take up to flankSize residues
187  40 offset = maxoffset - flankSize;
188  40 ustream_ds = flankSize;
189    }
190  125 if (flankSize <= dstream_ds)
191    {
192  116 dstream_ds = flankSize - 1;
193    }
194    }
195    // TODO use Character.toLowerCase to avoid creating String objects?
196  131 char[] upstream = new String(ds
197    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
198    .toLowerCase(Locale.ROOT).toCharArray();
199  131 char[] downstream = new String(
200    ds.getSequence(s_end - 1, s_end + dstream_ds))
201    .toLowerCase(Locale.ROOT).toCharArray();
202  131 char[] coreseq = s.getSequence();
203  131 char[] nseq = new char[offset + upstream.length + downstream.length
204    + coreseq.length];
205  131 char c = core.getGapCharacter();
206   
207  131 int p = 0;
208  461 for (; p < offset; p++)
209    {
210  330 nseq[p] = c;
211    }
212   
213  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
214  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
215    coreseq.length);
216  131 System.arraycopy(downstream, 0, nseq,
217    p + coreseq.length + upstream.length, downstream.length);
218  131 s.setSequence(new String(nseq));
219  131 s.setStart(s.getStart() - ustream_ds);
220  131 s.setEnd(s_end + downstream.length);
221    }
222  27 AlignmentI newAl = new jalview.datamodel.Alignment(
223    sq.toArray(new SequenceI[0]));
224  27 for (SequenceI s : sq)
225    {
226  131 if (s.getAnnotation() != null)
227    {
228  1 for (AlignmentAnnotation aa : s.getAnnotation())
229    {
230  1 aa.adjustForAlignment(); // JAL-1712 fix
231  1 newAl.addAnnotation(aa);
232    }
233    }
234    }
235  27 newAl.setDataset(core.getDataset());
236  27 return newAl;
237    }
238   
239    /**
240    * Returns the index (zero-based position) of a sequence in an alignment, or
241    * -1 if not found.
242    *
243    * @param al
244    * @param seq
245    * @return
246    */
 
247  63904 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
248    {
249  63904 int result = -1;
250  63904 int pos = 0;
251  63904 for (SequenceI alSeq : al.getSequences())
252    {
253  126268949 if (alSeq == seq)
254    {
255  63833 result = pos;
256  63833 break;
257    }
258  126205116 pos++;
259    }
260  63904 return result;
261    }
262   
263    /**
264    * Returns a map of lists of sequences in the alignment, keyed by sequence
265    * name. For use in mapping between different alignment views of the same
266    * sequences.
267    *
268    * @see jalview.datamodel.AlignmentI#getSequencesByName()
269    */
 
270  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
271    AlignmentI al)
272    {
273  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
274  1 for (SequenceI seq : al.getSequences())
275    {
276  3 String name = seq.getName();
277  3 if (name != null)
278    {
279  3 List<SequenceI> seqs = theMap.get(name);
280  3 if (seqs == null)
281    {
282  2 seqs = new ArrayList<>();
283  2 theMap.put(name, seqs);
284    }
285  3 seqs.add(seq);
286    }
287    }
288  1 return theMap;
289    }
290   
291    /**
292    * Build mapping of protein to cDNA alignment. Mappings are made between
293    * sequences where the cDNA translates to the protein sequence. Any new
294    * mappings are added to the protein alignment. Returns true if any mappings
295    * either already exist or were added, else false.
296    *
297    * @param proteinAlignment
298    * @param cdnaAlignment
299    * @return
300    */
 
301  4 toggle public static boolean mapProteinAlignmentToCdna(
302    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
303    {
304  4 if (proteinAlignment == null || cdnaAlignment == null)
305    {
306  0 return false;
307    }
308   
309  4 Set<SequenceI> mappedDna = new HashSet<>();
310  4 Set<SequenceI> mappedProtein = new HashSet<>();
311   
312    /*
313    * First pass - map sequences where cross-references exist. This include
314    * 1-to-many mappings to support, for example, variant cDNA.
315    */
316  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
317    cdnaAlignment, mappedDna, mappedProtein, true);
318   
319    /*
320    * Second pass - map sequences where no cross-references exist. This only
321    * does 1-to-1 mappings and assumes corresponding sequences are in the same
322    * order in the alignments.
323    */
324  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
325    mappedDna, mappedProtein, false);
326  4 return mappingPerformed;
327    }
328   
329    /**
330    * Make mappings between compatible sequences (where the cDNA translation
331    * matches the protein).
332    *
333    * @param proteinAlignment
334    * @param cdnaAlignment
335    * @param mappedDna
336    * a set of mapped DNA sequences (to add to)
337    * @param mappedProtein
338    * a set of mapped Protein sequences (to add to)
339    * @param xrefsOnly
340    * if true, only map sequences where xrefs exist
341    * @return
342    */
 
343  8 toggle protected static boolean mapProteinToCdna(
344    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
345    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
346    boolean xrefsOnly)
347    {
348  8 boolean mappingExistsOrAdded = false;
349  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
350  8 for (SequenceI aaSeq : thisSeqs)
351    {
352  22 boolean proteinMapped = false;
353  22 AlignedCodonFrame acf = new AlignedCodonFrame();
354   
355  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
356    {
357    /*
358    * Always try to map if sequences have xref to each other; this supports
359    * variant cDNA or alternative splicing for a protein sequence.
360    *
361    * If no xrefs, try to map progressively, assuming that alignments have
362    * mappable sequences in corresponding order. These are not
363    * many-to-many, as that would risk mixing species with similar cDNA
364    * sequences.
365    */
366  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
367    {
368  39 continue;
369    }
370   
371    /*
372    * Don't map non-xrefd sequences more than once each. This heuristic
373    * allows us to pair up similar sequences in ordered alignments.
374    */
375  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
376    || mappedDna.contains(cdnaSeq)))
377    {
378  29 continue;
379    }
380  18 if (mappingExists(proteinAlignment.getCodonFrames(),
381    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
382    {
383  0 mappingExistsOrAdded = true;
384    }
385    else
386    {
387  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
388  18 if (map != null)
389    {
390  12 acf.addMap(cdnaSeq, aaSeq, map);
391  12 mappingExistsOrAdded = true;
392  12 proteinMapped = true;
393  12 mappedDna.add(cdnaSeq);
394  12 mappedProtein.add(aaSeq);
395    }
396    }
397    }
398  22 if (proteinMapped)
399    {
400  11 proteinAlignment.addCodonFrame(acf);
401    }
402    }
403  8 return mappingExistsOrAdded;
404    }
405   
406    /**
407    * Answers true if the mappings include one between the given (dataset)
408    * sequences.
409    */
 
410  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
411    SequenceI aaSeq, SequenceI cdnaSeq)
412    {
413  18 if (mappings != null)
414    {
415  18 for (AlignedCodonFrame acf : mappings)
416    {
417  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
418    {
419  0 return true;
420    }
421    }
422    }
423  18 return false;
424    }
425   
426    /**
427    * Builds a mapping (if possible) of a cDNA to a protein sequence.
428    * <ul>
429    * <li>first checks if the cdna translates exactly to the protein
430    * sequence</li>
431    * <li>else checks for translation after removing a STOP codon</li>
432    * <li>else checks for translation after removing a START codon</li>
433    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
434    * </ul>
435    * Returns null if no mapping is determined.
436    *
437    * @param proteinSeq
438    * the aligned protein sequence
439    * @param cdnaSeq
440    * the aligned cdna sequence
441    * @return
442    */
 
443  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
444    SequenceI cdnaSeq)
445    {
446    /*
447    * Here we handle either dataset sequence set (desktop) or absent (applet).
448    * Use only the char[] form of the sequence to avoid creating possibly large
449    * String objects.
450    */
451  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
452  31 char[] aaSeqChars = proteinDataset != null
453    ? proteinDataset.getSequence()
454    : proteinSeq.getSequence();
455  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
456  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
457    : cdnaSeq.getSequence();
458  31 if (aaSeqChars == null || cdnaSeqChars == null)
459    {
460  0 return null;
461    }
462   
463    /*
464    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
465    */
466  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
467  31 int cdnaLength = cdnaSeqChars.length;
468  31 int cdnaStart = cdnaSeq.getStart();
469  31 int cdnaEnd = cdnaSeq.getEnd();
470  31 final int proteinStart = proteinSeq.getStart();
471  31 final int proteinEnd = proteinSeq.getEnd();
472   
473    /*
474    * If lengths don't match, try ignoring stop codon (if present)
475    */
476  31 if (cdnaLength != mappedLength && cdnaLength > 2)
477    {
478  16 String lastCodon = String.valueOf(cdnaSeqChars,
479    cdnaLength - CODON_LENGTH, CODON_LENGTH)
480    .toUpperCase(Locale.ROOT);
481  16 for (String stop : ResidueProperties.STOP_CODONS)
482    {
483  47 if (lastCodon.equals(stop))
484    {
485  2 cdnaEnd -= CODON_LENGTH;
486  2 cdnaLength -= CODON_LENGTH;
487  2 break;
488    }
489    }
490    }
491   
492    /*
493    * If lengths still don't match, try ignoring start codon.
494    */
495  31 int startOffset = 0;
496  31 if (cdnaLength != mappedLength && cdnaLength > 2
497    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
498    .toUpperCase(Locale.ROOT)
499    .equals(ResidueProperties.START))
500    {
501  5 startOffset += CODON_LENGTH;
502  5 cdnaStart += CODON_LENGTH;
503  5 cdnaLength -= CODON_LENGTH;
504    }
505   
506  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
507    {
508    /*
509    * protein is translation of dna (+/- start/stop codons)
510    */
511  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
512    new int[]
513    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
514  15 return map;
515    }
516   
517    /*
518    * translation failed - try mapping CDS annotated regions of dna
519    */
520  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
521    }
522   
523    /**
524    * Test whether the given cdna sequence, starting at the given offset,
525    * translates to the given amino acid sequence, using the standard translation
526    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
527    *
528    * @param cdnaSeqChars
529    * @param cdnaStart
530    * @param aaSeqChars
531    * @return
532    */
 
533  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
534    char[] aaSeqChars)
535    {
536  51 if (cdnaSeqChars == null || aaSeqChars == null)
537    {
538  3 return false;
539    }
540   
541  48 int aaPos = 0;
542  48 int dnaPos = cdnaStart;
543  161 for (; dnaPos < cdnaSeqChars.length - 2
544    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
545    {
546  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
547  130 final String translated = ResidueProperties.codonTranslate(codon);
548   
549    /*
550    * allow * in protein to match untranslatable in dna
551    */
552  130 final char aaRes = aaSeqChars[aaPos];
553  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
554    && aaRes == '*')
555    {
556  4 continue;
557    }
558  126 if (translated == null || !(aaRes == translated.charAt(0)))
559    {
560    // debug
561    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
562    // + ": "
563    // + codon + "(" + translated + ") != " + aaRes));
564  17 return false;
565    }
566    }
567   
568    /*
569    * check we matched all of the protein sequence
570    */
571  31 if (aaPos != aaSeqChars.length)
572    {
573  2 return false;
574    }
575   
576    /*
577    * check we matched all of the dna except
578    * for optional trailing STOP codon
579    */
580  29 if (dnaPos == cdnaSeqChars.length)
581    {
582  17 return true;
583    }
584  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
585    {
586  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
587  11 if (ResidueProperties.STOP
588    .equals(ResidueProperties.codonTranslate(codon)))
589    {
590  9 return true;
591    }
592    }
593  3 return false;
594    }
595   
596    /**
597    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
598    * currently assumes that we are aligning cDNA to match protein.
599    *
600    * @param seq
601    * the sequence to be realigned
602    * @param al
603    * the alignment whose sequence alignment is to be 'copied'
604    * @param gap
605    * character string represent a gap in the realigned sequence
606    * @param preserveUnmappedGaps
607    * @param preserveMappedGaps
608    * @return true if the sequence was realigned, false if it could not be
609    */
 
610  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
611    String gap, boolean preserveMappedGaps,
612    boolean preserveUnmappedGaps)
613    {
614    /*
615    * Get any mappings from the source alignment to the target (dataset)
616    * sequence.
617    */
618    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
619    // all mappings. Would it help to constrain this?
620  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
621  8 if (mappings == null || mappings.isEmpty())
622    {
623  0 return false;
624    }
625   
626    /*
627    * Locate the aligned source sequence whose dataset sequence is mapped. We
628    * just take the first match here (as we can't align like more than one
629    * sequence).
630    */
631  8 SequenceI alignFrom = null;
632  8 AlignedCodonFrame mapping = null;
633  8 for (AlignedCodonFrame mp : mappings)
634    {
635  8 alignFrom = mp.findAlignedSequence(seq, al);
636  8 if (alignFrom != null)
637    {
638  4 mapping = mp;
639  4 break;
640    }
641    }
642   
643  8 if (alignFrom == null)
644    {
645  4 return false;
646    }
647  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
648    preserveMappedGaps, preserveUnmappedGaps);
649  4 return true;
650    }
651   
652    /**
653    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
654    * match residues and codons. Flags control whether existing gaps in unmapped
655    * (intron) and mapped (exon) regions are preserved or not. Gaps between
656    * intron and exon are only retained if both flags are set.
657    *
658    * @param alignTo
659    * @param alignFrom
660    * @param mapping
661    * @param myGap
662    * @param sourceGap
663    * @param preserveUnmappedGaps
664    * @param preserveMappedGaps
665    */
 
666  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
667    AlignedCodonFrame mapping, String myGap, char sourceGap,
668    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
669    {
670    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
671   
672    // aligned and dataset sequence positions, all base zero
673  19 int thisSeqPos = 0;
674  19 int sourceDsPos = 0;
675   
676  19 int basesWritten = 0;
677  19 char myGapChar = myGap.charAt(0);
678  19 int ratio = myGap.length();
679   
680  19 int fromOffset = alignFrom.getStart() - 1;
681  19 int toOffset = alignTo.getStart() - 1;
682  19 int sourceGapMappedLength = 0;
683  19 boolean inExon = false;
684  19 final int toLength = alignTo.getLength();
685  19 final int fromLength = alignFrom.getLength();
686  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
687   
688    /*
689    * Traverse the 'model' aligned sequence
690    */
691  205 for (int i = 0; i < fromLength; i++)
692    {
693  186 char sourceChar = alignFrom.getCharAt(i);
694  186 if (sourceChar == sourceGap)
695    {
696  44 sourceGapMappedLength += ratio;
697  44 continue;
698    }
699   
700    /*
701    * Found a non-gap character. Locate its mapped region if any.
702    */
703  142 sourceDsPos++;
704    // Note mapping positions are base 1, our sequence positions base 0
705  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
706    sourceDsPos + fromOffset);
707  142 if (mappedPos == null)
708    {
709    /*
710    * unmapped position; treat like a gap
711    */
712  94 sourceGapMappedLength += ratio;
713    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
714    // residue "
715    // + sourceDsPos + "(" + sourceChar + ")");
716    // return;
717  94 continue;
718    }
719   
720  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
721  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
722  48 StringBuilder trailingCopiedGap = new StringBuilder();
723   
724    /*
725    * Copy dna sequence up to and including this codon. Optionally, include
726    * gaps before the codon starts (in introns) and/or after the codon starts
727    * (in exons).
728    *
729    * Note this only works for 'linear' splicing, not reverse or interleaved.
730    * But then 'align dna as protein' doesn't make much sense otherwise.
731    */
732  48 int intronLength = 0;
733  294 while (basesWritten + toOffset < mappedCodonEnd
734    && thisSeqPos < toLength)
735    {
736  246 final char c = alignTo.getCharAt(thisSeqPos++);
737  246 if (c != myGapChar)
738    {
739  146 basesWritten++;
740  146 int sourcePosition = basesWritten + toOffset;
741  146 if (sourcePosition < mappedCodonStart)
742    {
743    /*
744    * Found an unmapped (intron) base. First add in any preceding gaps
745    * (if wanted).
746    */
747  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
748    {
749  17 thisAligned.append(trailingCopiedGap.toString());
750  17 intronLength += trailingCopiedGap.length();
751  17 trailingCopiedGap = new StringBuilder();
752    }
753  48 intronLength++;
754  48 inExon = false;
755    }
756    else
757    {
758  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
759  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
760    preserveUnmappedGaps, sourceGapMappedLength, inExon,
761    trailingCopiedGap.length(), intronLength, startOfCodon);
762  215 for (int k = 0; k < gapsToAdd; k++)
763    {
764  117 thisAligned.append(myGapChar);
765    }
766  98 sourceGapMappedLength = 0;
767  98 inExon = true;
768    }
769  146 thisAligned.append(c);
770  146 trailingCopiedGap = new StringBuilder();
771    }
772    else
773    {
774  100 if (inExon && preserveMappedGaps)
775    {
776  32 trailingCopiedGap.append(myGapChar);
777    }
778  68 else if (!inExon && preserveUnmappedGaps)
779    {
780  27 trailingCopiedGap.append(myGapChar);
781    }
782    }
783    }
784    }
785   
786    /*
787    * At end of model aligned sequence. Copy any remaining target sequence, optionally
788    * including (intron) gaps.
789    */
790  129 while (thisSeqPos < toLength)
791    {
792  110 final char c = alignTo.getCharAt(thisSeqPos++);
793  110 if (c != myGapChar || preserveUnmappedGaps)
794    {
795  102 thisAligned.append(c);
796    }
797  110 sourceGapMappedLength--;
798    }
799   
800    /*
801    * finally add gaps to pad for any trailing source gaps or
802    * unmapped characters
803    */
804  19 if (preserveUnmappedGaps)
805    {
806  24 while (sourceGapMappedLength > 0)
807    {
808  12 thisAligned.append(myGapChar);
809  12 sourceGapMappedLength--;
810    }
811    }
812   
813    /*
814    * All done aligning, set the aligned sequence.
815    */
816  19 alignTo.setSequence(new String(thisAligned));
817    }
818   
819    /**
820    * Helper method to work out how many gaps to insert when realigning.
821    *
822    * @param preserveMappedGaps
823    * @param preserveUnmappedGaps
824    * @param sourceGapMappedLength
825    * @param inExon
826    * @param trailingCopiedGap
827    * @param intronLength
828    * @param startOfCodon
829    * @return
830    */
 
831  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
832    boolean preserveUnmappedGaps, int sourceGapMappedLength,
833    boolean inExon, int trailingGapLength, int intronLength,
834    final boolean startOfCodon)
835    {
836  98 int gapsToAdd = 0;
837  98 if (startOfCodon)
838    {
839    /*
840    * Reached start of codon. Ignore trailing gaps in intron unless we are
841    * preserving gaps in both exon and intron. Ignore them anyway if the
842    * protein alignment introduces a gap at least as large as the intronic
843    * region.
844    */
845  40 if (inExon && !preserveMappedGaps)
846    {
847  4 trailingGapLength = 0;
848    }
849  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
850    {
851  19 trailingGapLength = 0;
852    }
853  40 if (inExon)
854    {
855  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
856    }
857    else
858    {
859  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
860    {
861  20 gapsToAdd = sourceGapMappedLength - intronLength;
862    }
863    else
864    {
865  6 gapsToAdd = Math.min(
866    intronLength + trailingGapLength - sourceGapMappedLength,
867    trailingGapLength);
868    }
869    }
870    }
871    else
872    {
873    /*
874    * second or third base of codon; check for any gaps in dna
875    */
876  58 if (!preserveMappedGaps)
877    {
878  32 trailingGapLength = 0;
879    }
880  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
881    }
882  98 return gapsToAdd;
883    }
884   
885    /**
886    * Realigns the given protein to match the alignment of the dna, using codon
887    * mappings to translate aligned codon positions to protein residues.
888    *
889    * @param protein
890    * the alignment whose sequences are realigned by this method
891    * @param dna
892    * the dna alignment whose alignment we are 'copying'
893    * @return the number of sequences that were realigned
894    */
 
895  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
896    {
897  5 if (protein.isNucleotide() || !dna.isNucleotide())
898    {
899  0 jalview.bin.Console
900    .errPrintln("Wrong alignment type in alignProteinAsDna");
901  0 return 0;
902    }
903  5 List<SequenceI> unmappedProtein = new ArrayList<>();
904  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
905    protein, dna, unmappedProtein);
906  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
907    }
908   
909    /**
910    * Realigns the given dna to match the alignment of the protein, using codon
911    * mappings to translate aligned peptide positions to codons.
912    *
913    * Always produces a padded CDS alignment.
914    *
915    * @param dna
916    * the alignment whose sequences are realigned by this method
917    * @param protein
918    * the protein alignment whose alignment we are 'copying'
919    * @return the number of sequences that were realigned
920    */
 
921  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
922    {
923  4 if (protein.isNucleotide() || !dna.isNucleotide())
924    {
925  0 jalview.bin.Console
926    .errPrintln("Wrong alignment type in alignProteinAsDna");
927  0 return 0;
928    }
929    // todo: implement this
930  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
931  4 int alignedCount = 0;
932  4 int width = 0; // alignment width for padding CDS
933  4 for (SequenceI dnaSeq : dna.getSequences())
934    {
935  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
936    dna.getGapCharacter()))
937    {
938  5 alignedCount++;
939    }
940  5 width = Math.max(dnaSeq.getLength(), width);
941    }
942  4 int oldwidth;
943  4 int diff;
944  4 for (SequenceI dnaSeq : dna.getSequences())
945    {
946  5 oldwidth = dnaSeq.getLength();
947  5 diff = width - oldwidth;
948  5 if (diff > 0)
949    {
950  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
951    }
952    }
953  4 return alignedCount;
954    }
955   
956    /**
957    * Helper method to align (if possible) the dna sequence to match the
958    * alignment of a mapped protein sequence. This is currently limited to
959    * handling coding sequence only.
960    *
961    * @param cdsSeq
962    * @param protein
963    * @param mappings
964    * @param gapChar
965    * @return
966    */
 
967  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
968    AlignmentI protein, List<AlignedCodonFrame> mappings,
969    char gapChar)
970    {
971  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
972  5 if (cdsDss == null)
973    {
974  0 System.err
975    .println("alignCdsSequenceAsProtein needs aligned sequence!");
976  0 return false;
977    }
978   
979  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
980    .findMappingsForSequence(cdsSeq, mappings);
981  5 for (AlignedCodonFrame mapping : dnaMappings)
982    {
983  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
984  5 if (peptide != null)
985    {
986  5 final int peptideLength = peptide.getLength();
987  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
988  5 if (map != null)
989    {
990  5 MapList mapList = map.getMap();
991  5 if (map.getTo() == peptide.getDatasetSequence())
992    {
993  5 mapList = mapList.getInverse();
994    }
995  5 final int cdsLength = cdsDss.getLength();
996  5 int mappedFromLength = MappingUtils
997    .getLength(mapList.getFromRanges());
998  5 int mappedToLength = MappingUtils
999    .getLength(mapList.getToRanges());
1000  5 boolean addStopCodon = (cdsLength == mappedFromLength
1001    * CODON_LENGTH + CODON_LENGTH)
1002    || (peptide.getDatasetSequence()
1003    .getLength() == mappedFromLength - 1);
1004  5 if (cdsLength != mappedToLength && !addStopCodon)
1005    {
1006  0 jalview.bin.Console.errPrintln(String.format(
1007    "Can't align cds as protein (length mismatch %d/%d): %s",
1008    cdsLength, mappedToLength, cdsSeq.getName()));
1009    }
1010   
1011    /*
1012    * pre-fill the aligned cds sequence with gaps
1013    */
1014  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1015  5 + (addStopCodon ? CODON_LENGTH : 0)];
1016  5 Arrays.fill(alignedCds, gapChar);
1017   
1018    /*
1019    * walk over the aligned peptide sequence and insert mapped
1020    * codons for residues in the aligned cds sequence
1021    */
1022  5 int copiedBases = 0;
1023  5 int cdsStart = cdsDss.getStart();
1024  5 int proteinPos = peptide.getStart() - 1;
1025  5 int cdsCol = 0;
1026   
1027  33 for (int col = 0; col < peptideLength; col++)
1028    {
1029  28 char residue = peptide.getCharAt(col);
1030   
1031  28 if (Comparison.isGap(residue))
1032    {
1033  13 cdsCol += CODON_LENGTH;
1034    }
1035    else
1036    {
1037  15 proteinPos++;
1038  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1039  15 if (codon == null)
1040    {
1041    // e.g. incomplete start codon, X in peptide
1042  0 cdsCol += CODON_LENGTH;
1043    }
1044    else
1045    {
1046  60 for (int j = codon[0]; j <= codon[1]; j++)
1047    {
1048  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1049  45 alignedCds[cdsCol++] = mappedBase;
1050  45 copiedBases++;
1051    }
1052    }
1053    }
1054    }
1055   
1056    /*
1057    * append stop codon if not mapped from protein,
1058    * closing it up to the end of the mapped sequence
1059    */
1060  5 if (copiedBases == cdsLength - CODON_LENGTH)
1061    {
1062  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1063    {
1064  0 if (!Comparison.isGap(alignedCds[i]))
1065    {
1066  0 cdsCol = i + 1; // gap just after end of sequence
1067  0 break;
1068    }
1069    }
1070  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1071    {
1072  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1073    }
1074    }
1075  5 cdsSeq.setSequence(new String(alignedCds));
1076  5 return true;
1077    }
1078    }
1079    }
1080  0 return false;
1081    }
1082   
1083    /**
1084    * Builds a map whose key is an aligned codon position (3 alignment column
1085    * numbers base 0), and whose value is a map from protein sequence to each
1086    * protein's peptide residue for that codon. The map generates an ordering of
1087    * the codons, and allows us to read off the peptides at each position in
1088    * order to assemble 'aligned' protein sequences.
1089    *
1090    * @param protein
1091    * the protein alignment
1092    * @param dna
1093    * the coding dna alignment
1094    * @param unmappedProtein
1095    * any unmapped proteins are added to this list
1096    * @return
1097    */
 
1098  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1099    AlignmentI protein, AlignmentI dna,
1100    List<SequenceI> unmappedProtein)
1101    {
1102    /*
1103    * maintain a list of any proteins with no mappings - these will be
1104    * rendered 'as is' in the protein alignment as we can't align them
1105    */
1106  5 unmappedProtein.addAll(protein.getSequences());
1107   
1108  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1109   
1110    /*
1111    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1112    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1113    * comparator keeps the codon positions ordered.
1114    */
1115  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1116    new CodonComparator());
1117   
1118  5 for (SequenceI dnaSeq : dna.getSequences())
1119    {
1120  30 for (AlignedCodonFrame mapping : mappings)
1121    {
1122  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1123  516 if (prot != null)
1124    {
1125  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1126  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1127    alignedCodons);
1128  32 unmappedProtein.remove(prot);
1129    }
1130    }
1131    }
1132   
1133    /*
1134    * Finally add any unmapped peptide start residues (e.g. for incomplete
1135    * codons) as if at the codon position before the second residue
1136    */
1137    // TODO resolve JAL-2022 so this fudge can be removed
1138  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1139  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1140   
1141  5 return alignedCodons;
1142    }
1143   
1144    /**
1145    * Scans for any protein mapped from position 2 (meaning unmapped start
1146    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1147    * preceding position in the alignment
1148    *
1149    * @param alignedCodons
1150    * the codon-to-peptide map
1151    * @param mappedSequenceCount
1152    * the number of distinct sequences in the map
1153    */
 
1154  5 toggle protected static void addUnmappedPeptideStarts(
1155    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1156    int mappedSequenceCount)
1157    {
1158    // TODO delete this ugly hack once JAL-2022 is resolved
1159    // i.e. we can model startPhase > 0 (incomplete start codon)
1160   
1161  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1162  5 AlignedCodon lastCodon = null;
1163  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1164   
1165  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1166    .entrySet())
1167    {
1168  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1169    .entrySet())
1170    {
1171  10667 SequenceI seq = sequenceCodon.getKey();
1172  10667 if (sequencesChecked.contains(seq))
1173    {
1174  10637 continue;
1175    }
1176  30 sequencesChecked.add(seq);
1177  30 AlignedCodon codon = sequenceCodon.getValue();
1178  30 if (codon.peptideCol > 1)
1179    {
1180  0 jalview.bin.Console.errPrintln(
1181    "Problem mapping protein with >1 unmapped start positions: "
1182    + seq.getName());
1183    }
1184  30 else if (codon.peptideCol == 1)
1185    {
1186    /*
1187    * first position (peptideCol == 0) was unmapped - add it
1188    */
1189  6 if (lastCodon != null)
1190    {
1191  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1192    lastCodon.pos2, lastCodon.pos3,
1193    String.valueOf(seq.getCharAt(0)), 0);
1194  5 toAdd.put(seq, firstPeptide);
1195    }
1196    else
1197    {
1198    /*
1199    * unmapped residue at start of alignment (no prior column) -
1200    * 'insert' at nominal codon [0, 0, 0]
1201    */
1202  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1203    String.valueOf(seq.getCharAt(0)), 0);
1204  1 toAdd.put(seq, firstPeptide);
1205    }
1206    }
1207  30 if (sequencesChecked.size() == mappedSequenceCount)
1208    {
1209    // no need to check past first mapped position in all sequences
1210  5 break;
1211    }
1212    }
1213  1913 lastCodon = entry.getKey();
1214    }
1215   
1216    /*
1217    * add any new codons safely after iterating over the map
1218    */
1219  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1220    {
1221  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1222    startCodon.getKey());
1223    }
1224    }
1225   
1226    /**
1227    * Update the aligned protein sequences to match the codon alignments given in
1228    * the map.
1229    *
1230    * @param protein
1231    * @param alignedCodons
1232    * an ordered map of codon positions (columns), with sequence/peptide
1233    * values present in each column
1234    * @param unmappedProtein
1235    * @return
1236    */
 
1237  5 toggle protected static int alignProteinAs(AlignmentI protein,
1238    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1239    List<SequenceI> unmappedProtein)
1240    {
1241    /*
1242    * prefill peptide sequences with gaps
1243    */
1244  5 int alignedWidth = alignedCodons.size();
1245  5 char[] gaps = new char[alignedWidth];
1246  5 Arrays.fill(gaps, protein.getGapCharacter());
1247  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1248  5 for (SequenceI seq : protein.getSequences())
1249    {
1250  31 if (!unmappedProtein.contains(seq))
1251    {
1252  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1253    }
1254    }
1255   
1256    /*
1257    * Traverse the codons left to right (as defined by CodonComparator)
1258    * and insert peptides in each column where the sequence is mapped.
1259    * This gives a peptide 'alignment' where residues are aligned if their
1260    * corresponding codons occupy the same columns in the cdna alignment.
1261    */
1262  5 int column = 0;
1263  5 for (AlignedCodon codon : alignedCodons.keySet())
1264    {
1265  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1266    .get(codon);
1267  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1268    {
1269  10682 char residue = entry.getValue().product.charAt(0);
1270  10682 peptides.get(entry.getKey())[column] = residue;
1271    }
1272  1914 column++;
1273    }
1274   
1275    /*
1276    * and finally set the constructed sequences
1277    */
1278  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1279    {
1280  30 entry.getKey().setSequence(new String(entry.getValue()));
1281    }
1282   
1283  5 return 0;
1284    }
1285   
1286    /**
1287    * Populate the map of aligned codons by traversing the given sequence
1288    * mapping, locating the aligned positions of mapped codons, and adding those
1289    * positions and their translation products to the map.
1290    *
1291    * @param dna
1292    * the aligned sequence we are mapping from
1293    * @param protein
1294    * the sequence to be aligned to the codons
1295    * @param gapChar
1296    * the gap character in the dna sequence
1297    * @param seqMap
1298    * a mapping to a sequence translation
1299    * @param alignedCodons
1300    * the map we are building up
1301    */
 
1302  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1303    char gapChar, Mapping seqMap,
1304    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1305    {
1306  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1307   
1308    /*
1309    * add codon positions, and their peptide translations, to the alignment
1310    * map, while remembering the first codon mapped
1311    */
1312  10716 while (codons.hasNext())
1313    {
1314  10684 try
1315    {
1316  10684 AlignedCodon codon = codons.next();
1317  10684 addCodonToMap(alignedCodons, codon, protein);
1318    } catch (IncompleteCodonException e)
1319    {
1320    // possible incomplete trailing codon - ignore
1321    } catch (NoSuchElementException e)
1322    {
1323    // possibly peptide lacking STOP
1324    }
1325    }
1326    }
1327   
1328    /**
1329    * Helper method to add a codon-to-peptide entry to the aligned codons map
1330    *
1331    * @param alignedCodons
1332    * @param codon
1333    * @param protein
1334    */
 
1335  10690 toggle protected static void addCodonToMap(
1336    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1337    AlignedCodon codon, SequenceI protein)
1338    {
1339  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1340  10690 if (seqProduct == null)
1341    {
1342  1914 seqProduct = new HashMap<>();
1343  1914 alignedCodons.put(codon, seqProduct);
1344    }
1345  10690 seqProduct.put(protein, codon);
1346    }
1347   
1348    /**
1349    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1350    * between at least one pair of sequences in the two alignments. Currently,
1351    * the logic is:
1352    * <ul>
1353    * <li>One alignment must be nucleotide, and the other protein</li>
1354    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1355    * <li>Mappable means the nucleotide translation matches the protein
1356    * sequence</li>
1357    * <li>The translation may ignore start and stop codons if present in the
1358    * nucleotide</li>
1359    * </ul>
1360    *
1361    * @param al1
1362    * @param al2
1363    * @return
1364    */
 
1365  26 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1366    {
1367  26 if (al1 == null || al2 == null)
1368    {
1369  3 return false;
1370    }
1371   
1372    /*
1373    * Require one nucleotide and one protein
1374    */
1375  23 if (al1.isNucleotide() == al2.isNucleotide())
1376    {
1377  12 return false;
1378    }
1379  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1380  11 AlignmentI protein = dna == al1 ? al2 : al1;
1381  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1382  11 for (SequenceI dnaSeq : dna.getSequences())
1383    {
1384  12 for (SequenceI proteinSeq : protein.getSequences())
1385    {
1386  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1387    {
1388  2 return true;
1389    }
1390    }
1391    }
1392  9 return false;
1393    }
1394   
1395    /**
1396    * Returns true if the dna sequence is mapped, or could be mapped, to the
1397    * protein sequence.
1398    *
1399    * @param dnaSeq
1400    * @param proteinSeq
1401    * @param mappings
1402    * @return
1403    */
 
1404  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1405    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1406    {
1407  12 if (dnaSeq == null || proteinSeq == null)
1408    {
1409  0 return false;
1410    }
1411   
1412  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1413    : dnaSeq.getDatasetSequence();
1414  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1415    ? proteinSeq
1416    : proteinSeq.getDatasetSequence();
1417   
1418  12 for (AlignedCodonFrame mapping : mappings)
1419    {
1420  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1421    {
1422    /*
1423    * already mapped
1424    */
1425  0 return true;
1426    }
1427    }
1428   
1429    /*
1430    * Just try to make a mapping (it is not yet stored), test whether
1431    * successful.
1432    */
1433  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1434    }
1435   
1436    /**
1437    * Finds any reference annotations associated with the sequences in
1438    * sequenceScope, that are not already added to the alignment, and adds them
1439    * to the 'candidates' map. Also populates a lookup table of annotation
1440    * labels, keyed by calcId, for use in constructing tooltips or the like.
1441    *
1442    * @param sequenceScope
1443    * the sequences to scan for reference annotations
1444    * @param labelForCalcId
1445    * (optional) map to populate with label for calcId
1446    * @param candidates
1447    * map to populate with annotations for sequence
1448    * @param al
1449    * the alignment to check for presence of annotations
1450    */
 
1451  84 toggle public static void findAddableReferenceAnnotations(
1452    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1453    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1454    AlignmentI al)
1455    {
1456  84 if (sequenceScope == null)
1457    {
1458  1 return;
1459    }
1460   
1461    /*
1462    * For each sequence in scope, make a list of any annotations on the
1463    * underlying dataset sequence which are not already on the alignment.
1464    *
1465    * Add to a map of { alignmentSequence, <List of annotations to add> }
1466    */
1467  83 for (SequenceI seq : sequenceScope)
1468    {
1469  80 SequenceI dataset = seq.getDatasetSequence();
1470  80 if (dataset == null)
1471    {
1472  0 continue;
1473    }
1474  80 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1475  80 if (datasetAnnotations == null)
1476    {
1477  35 continue;
1478    }
1479  45 final List<AlignmentAnnotation> result = new ArrayList<>();
1480  45 for (AlignmentAnnotation dsann : datasetAnnotations)
1481    {
1482  156 if (dsann.annotations == null) {
1483  0 continue;// ignore non-positional annotation
1484    }
1485    /*
1486    * Find matching annotations on the alignment. If none is found, then
1487    * add this annotation to the list of 'addable' annotations for this
1488    * sequence.
1489    */
1490  156 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1491    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1492  156 boolean found = false;
1493  156 if (matchedAlignmentAnnotations != null)
1494    {
1495  153 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1496    {
1497  127 if (dsann.description.equals(matched.description))
1498    {
1499  49 found = true;
1500  49 break;
1501    }
1502    }
1503    }
1504  156 if (!found) {
1505  107 result.add(dsann);
1506  107 if (labelForCalcId != null)
1507    {
1508  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1509    }
1510    }
1511    }
1512    /*
1513    * Save any addable annotations for this sequence
1514    */
1515  45 if (!result.isEmpty())
1516    {
1517  43 candidates.put(seq, result);
1518    }
1519    }
1520    }
1521   
1522    /**
1523    * Adds annotations to the top of the alignment annotations, in the same order
1524    * as their related sequences. If you already have an annotation and want to
1525    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1526    *
1527    * @param annotations
1528    * the annotations to add
1529    * @param alignment
1530    * the alignment to add them to
1531    * @param selectionGroup
1532    * current selection group - may be null, if provided then any added
1533    * annotation will be trimmed to just those columns in the selection
1534    * group
1535    */
 
1536  38 toggle public static void addReferenceAnnotations(
1537    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1538    final AlignmentI alignment, final SequenceGroup selectionGroup)
1539    {
1540  38 for (SequenceI seq : annotations.keySet())
1541    {
1542  39 for (AlignmentAnnotation ann : annotations.get(seq))
1543    {
1544  101 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1545    }
1546    }
1547    }
1548   
 
1549  4 toggle public static boolean isSSAnnotationPresent(
1550    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1551    {
1552   
1553  4 for (SequenceI seq : annotations.keySet())
1554    {
1555  3 if (isSecondaryStructurePresent(
1556    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1557    {
1558  2 return true;
1559    }
1560    }
1561  2 return false;
1562    }
1563   
1564    /**
1565    * Make a copy of a reference annotation {@code ann} and add it to an
1566    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1567    * the extent of {@code selectionGroup}
1568    *
1569    * @param alignment
1570    * @param seq
1571    * @param ann
1572    * @param selectionGroup
1573    * current selection group - may be null, if provided then any added
1574    * annotation will be trimmed to just those columns in the selection
1575    * group
1576    * @return annotation added to {@code seq and {@code alignment}
1577    */
 
1578  105 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1579    final AlignmentI alignment, final SequenceI seq,
1580    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1581    {
1582  105 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1583  105 int startRes = 0;
1584  105 int endRes = ann.annotations.length;
1585  105 if (selectionGroup != null)
1586    {
1587  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1588    seq.findPosition(selectionGroup.getStartRes())));
1589  2 endRes = -1 + Math.min(seq.getEnd(),
1590    seq.findPosition(selectionGroup.getEndRes()));
1591   
1592    }
1593  105 copyAnn.restrict(startRes, endRes + 0);
1594   
1595    /*
1596    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1597    * original annotation is already on the sequence.
1598    */
1599  105 if (!seq.hasAnnotation(ann))
1600    {
1601  105 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1602  105 if (cm != null)
1603    {
1604  29 seq.addContactListFor(copyAnn, cm);
1605    }
1606  105 seq.addAlignmentAnnotation(copyAnn);
1607    }
1608    // adjust for gaps
1609  105 copyAnn.adjustForAlignment();
1610    // add to the alignment and set visible
1611  105 alignment.addAnnotation(copyAnn);
1612  105 copyAnn.visible = true;
1613   
1614  105 return copyAnn;
1615    }
1616   
1617    /**
1618    * Set visibility of alignment annotations of specified types (labels), for
1619    * specified sequences. This supports controls like "Show all secondary
1620    * structure", "Hide all Temp factor", etc.
1621    *
1622    * @al the alignment to scan for annotations
1623    * @param types
1624    * the types (labels) of annotations to be updated
1625    * @param forSequences
1626    * if not null, only annotations linked to one of these sequences are
1627    * in scope for update; if null, acts on all sequence annotations
1628    * @param anyType
1629    * if this flag is true, 'types' is ignored (label not checked)
1630    * @param doShow
1631    * if true, set visibility on, else set off
1632    */
 
1633  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1634    Collection<String> types, List<SequenceI> forSequences,
1635    boolean anyType, boolean doShow)
1636    {
1637  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1638  5 if (anns != null)
1639    {
1640  5 for (AlignmentAnnotation aa : anns)
1641    {
1642  30 if (anyType || types.contains(aa.label))
1643    {
1644  21 if ((aa.sequenceRef != null) && (forSequences == null
1645    || forSequences.contains(aa.sequenceRef)))
1646    {
1647  11 aa.visible = doShow;
1648    }
1649    }
1650    }
1651    }
1652    }
1653   
1654    /**
1655    * Shows or hides auto calculated annotations for a sequence group.
1656    *
1657    * @param al
1658    * The alignment object with the annotations.
1659    * @param type
1660    * The type of annotation to show or hide.
1661    * @param selectedGroup
1662    * The sequence group for which the annotations should be shown or
1663    * hidden.
1664    * @param anyType
1665    * If true, all types of annotations will be shown/hidden.
1666    * @param doShow
1667    * If true, the annotations will be shown; if false, annotations will
1668    * be hidden.
1669    */
 
1670  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1671    AlignmentI al, String type, SequenceGroup selectedGroup,
1672    boolean anyType, boolean doShow)
1673    {
1674    // Get all alignment annotations
1675  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1676   
1677  0 if (anns != null)
1678    {
1679  0 for (AlignmentAnnotation aa : anns)
1680    {
1681    // Check if anyType is true or if the annotation's label contains the
1682    // specified type (currently for secondary structure consensus)
1683  0 if ((anyType && aa.label
1684    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1685    || aa.label.startsWith(type))
1686    {
1687    // If the annotation's group reference is not null and matches the
1688    // selected group, update its visibility.
1689  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1690    {
1691  0 aa.visible = doShow;
1692    }
1693    }
1694    }
1695    }
1696    }
1697   
 
1698  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1699    AlignmentI al, int graphType)
1700    {
1701  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1702  0 if (anns != null)
1703    {
1704  0 for (AlignmentAnnotation aa : anns)
1705    {
1706  0 if (aa.sequenceRef != null && aa.graph == graphType)
1707  0 return aa;
1708    }
1709    }
1710  0 return null;
1711    }
1712   
1713    /**
1714    * Returns true if either sequence has a cross-reference to the other
1715    *
1716    * @param seq1
1717    * @param seq2
1718    * @return
1719    */
 
1720  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1721    {
1722    // Note: moved here from class CrossRef as the latter class has dependencies
1723    // not availability to the applet's classpath
1724  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1725    }
1726   
1727    /**
1728    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1729    * that sequence name is structured as Source|AccessionId.
1730    *
1731    * @param seq1
1732    * @param seq2
1733    * @return
1734    */
 
1735  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1736    {
1737  108 if (seq1 == null || seq2 == null)
1738    {
1739  8 return false;
1740    }
1741  100 String name = seq2.getName();
1742  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1743  100 if (xrefs != null)
1744    {
1745  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1746    {
1747  24 DBRefEntry xref = xrefs.get(ix);
1748  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1749    // case-insensitive test, consistent with DBRefEntry.equalRef()
1750  24 if (xrefName.equalsIgnoreCase(name))
1751    {
1752  12 return true;
1753    }
1754    }
1755    }
1756  88 return false;
1757    }
1758   
1759    /**
1760    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1761    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1762    * added to the original alignment's dataset, which is shared by the new
1763    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1764    * added to the alignment dataset.
1765    *
1766    * @param dna
1767    * aligned nucleotide (dna or cds) sequences
1768    * @param dataset
1769    * the alignment dataset the sequences belong to
1770    * @param products
1771    * (optional) to restrict results to CDS that map to specified
1772    * protein products
1773    * @return an alignment whose sequences are the cds-only parts of the dna
1774    * sequences (or null if no mappings are found)
1775    */
 
1776  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1777    AlignmentI dataset, SequenceI[] products)
1778    {
1779  6 if (dataset == null || dataset.getDataset() != null)
1780    {
1781  0 throw new IllegalArgumentException(
1782    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1783    }
1784  6 List<SequenceI> foundSeqs = new ArrayList<>();
1785  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1786  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1787  6 HashSet<SequenceI> productSeqs = null;
1788  6 if (products != null)
1789    {
1790  3 productSeqs = new HashSet<>();
1791  3 for (SequenceI seq : products)
1792    {
1793  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1794    : seq.getDatasetSequence());
1795    }
1796    }
1797   
1798    /*
1799    * Construct CDS sequences from mappings on the alignment dataset.
1800    * The logic is:
1801    * - find the protein product(s) mapped to from each dna sequence
1802    * - if the mapping covers the whole dna sequence (give or take start/stop
1803    * codon), take the dna as the CDS sequence
1804    * - else search dataset mappings for a suitable dna sequence, i.e. one
1805    * whose whole sequence is mapped to the protein
1806    * - if no sequence found, construct one from the dna sequence and mapping
1807    * (and add it to dataset so it is found if this is repeated)
1808    */
1809  6 for (SequenceI dnaSeq : dna)
1810    {
1811  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1812    : dnaSeq.getDatasetSequence();
1813   
1814  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1815    .findMappingsForSequence(dnaSeq, mappings);
1816  52 for (AlignedCodonFrame mapping : seqMappings)
1817    {
1818  42 List<Mapping> mappingsFromSequence = mapping
1819    .getMappingsFromSequence(dnaSeq);
1820   
1821  42 for (Mapping aMapping : mappingsFromSequence)
1822    {
1823  44 MapList mapList = aMapping.getMap();
1824  44 if (mapList.getFromRatio() == 1)
1825    {
1826    /*
1827    * not a dna-to-protein mapping (likely dna-to-cds)
1828    */
1829  11 continue;
1830    }
1831   
1832    /*
1833    * skip if mapping is not to one of the target set of proteins
1834    */
1835  33 SequenceI proteinProduct = aMapping.getTo();
1836  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1837    {
1838  2 continue;
1839    }
1840   
1841    /*
1842    * try to locate the CDS from the dataset mappings;
1843    * guard against duplicate results (for the case that protein has
1844    * dbrefs to both dna and cds sequences)
1845    */
1846  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1847    seqMappings, aMapping);
1848  31 if (cdsSeq != null)
1849    {
1850  11 if (!foundSeqs.contains(cdsSeq))
1851    {
1852  11 foundSeqs.add(cdsSeq);
1853  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1854  11 cdsSeqs.add(derivedSequence);
1855  11 if (!dataset.getSequences().contains(cdsSeq))
1856    {
1857  0 dataset.addSequence(cdsSeq);
1858    }
1859    }
1860  11 continue;
1861    }
1862   
1863    /*
1864    * didn't find mapped CDS sequence - construct it and add
1865    * its dataset sequence to the dataset
1866    */
1867  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1868    dataset).deriveSequence();
1869    // cdsSeq has a name constructed as CDS|<dbref>
1870    // <dbref> will be either the accession for the coding sequence,
1871    // marked in the /via/ dbref to the protein product accession
1872    // or it will be the original nucleotide accession.
1873  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1874   
1875  20 cdsSeqs.add(cdsSeq);
1876   
1877    /*
1878    * build the mapping from CDS to protein
1879    */
1880  20 List<int[]> cdsRange = Collections
1881    .singletonList(new int[]
1882    { cdsSeq.getStart(),
1883    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1884  20 MapList cdsToProteinMap = new MapList(cdsRange,
1885    mapList.getToRanges(), mapList.getFromRatio(),
1886    mapList.getToRatio());
1887   
1888  20 if (!dataset.getSequences().contains(cdsSeqDss))
1889    {
1890    /*
1891    * if this sequence is a newly created one, add it to the dataset
1892    * and made a CDS to protein mapping (if sequence already exists,
1893    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1894    */
1895  20 dataset.addSequence(cdsSeqDss);
1896  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1897  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1898    cdsToProteinMap);
1899   
1900    /*
1901    * guard against duplicating the mapping if repeating this action
1902    */
1903  20 if (!mappings.contains(cdsToProteinMapping))
1904    {
1905  20 mappings.add(cdsToProteinMapping);
1906    }
1907    }
1908   
1909  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1910    proteinProduct, aMapping);
1911    /*
1912    * add another mapping from original 'from' range to CDS
1913    */
1914  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1915  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1916    cdsRange, 1, 1);
1917  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1918    dnaToCdsMap);
1919  20 if (!mappings.contains(dnaToCdsMapping))
1920    {
1921  20 mappings.add(dnaToCdsMapping);
1922    }
1923   
1924    /*
1925    * transfer dna chromosomal loci (if known) to the CDS
1926    * sequence (via the mapping)
1927    */
1928  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1929  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1930   
1931    /*
1932    * add DBRef with mapping from protein to CDS
1933    * (this enables Get Cross-References from protein alignment)
1934    * This is tricky because we can't have two DBRefs with the
1935    * same source and accession, so need a different accession for
1936    * the CDS from the dna sequence
1937    */
1938   
1939    // specific use case:
1940    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1941    // ENSG02, ENSG03, with transcripts and products similarly named.
1942    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1943   
1944    // JBPNote: ?? can't actually create an example that demonstrates we
1945    // need to
1946    // synthesize an xref.
1947   
1948  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1949  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1950    {
1951  13 DBRefEntry primRef = primrefs.get(ip);
1952    /*
1953    * create a cross-reference from CDS to the source sequence's
1954    * primary reference and vice versa
1955    */
1956  13 String source = primRef.getSource();
1957  13 String version = primRef.getVersion();
1958  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1959    source + ":" + version, primRef.getAccessionId());
1960  13 cdsCrossRef
1961    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1962  13 cdsSeqDss.addDBRef(cdsCrossRef);
1963   
1964  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1965    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1966    // problem here is that the cross-reference is synthesized -
1967    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1968    // 'CDS|emblcdsacc'
1969    // assuming cds version same as dna ?!?
1970   
1971  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1972    cdsSeq.getName());
1973    //
1974  13 proteinToCdsRef.setMap(
1975    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1976  13 proteinProduct.addDBRef(proteinToCdsRef);
1977    }
1978    /*
1979    * transfer any features on dna that overlap the CDS
1980    */
1981  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1982    SequenceOntologyI.CDS);
1983    }
1984    }
1985    }
1986   
1987  6 AlignmentI cds = new Alignment(
1988    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1989  6 cds.setDataset(dataset);
1990   
1991  6 return cds;
1992    }
1993   
1994    /**
1995    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1996    * toSeq, mediated by the given mapping between the sequences
1997    *
1998    * @param fromSeq
1999    * @param targetToFrom
2000    * Map
2001    * @param targetSeq
2002    */
 
2003  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
2004    MapList targetToFrom, SequenceI targetSeq)
2005    {
2006  23 if (targetSeq.getGeneLoci() != null)
2007    {
2008    // already have - don't override
2009  1 return;
2010    }
2011  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
2012  22 if (fromLoci == null)
2013    {
2014  10 return;
2015    }
2016   
2017  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2018   
2019  12 if (newMap != null)
2020    {
2021  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2022    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2023    }
2024    }
2025   
2026    /**
2027    * A helper method that finds a CDS sequence in the alignment dataset that is
2028    * mapped to the given protein sequence, and either is, or has a mapping from,
2029    * the given dna sequence.
2030    *
2031    * @param mappings
2032    * set of all mappings on the dataset
2033    * @param dnaSeq
2034    * a dna (or cds) sequence we are searching from
2035    * @param seqMappings
2036    * the set of mappings involving dnaSeq
2037    * @param aMapping
2038    * a transcript-to-peptide mapping
2039    * @return
2040    */
 
2041  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2042    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2043    Mapping aMapping)
2044    {
2045    /*
2046    * TODO a better dna-cds-protein mapping data representation to allow easy
2047    * navigation; until then this clunky looping around lists of mappings
2048    */
2049  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2050    : dnaSeq.getDatasetSequence();
2051  38 SequenceI proteinProduct = aMapping.getTo();
2052   
2053    /*
2054    * is this mapping from the whole dna sequence (i.e. CDS)?
2055    * allowing for possible stop codon on dna but not peptide
2056    */
2057  38 int mappedFromLength = MappingUtils
2058    .getLength(aMapping.getMap().getFromRanges());
2059  38 int dnaLength = seqDss.getLength();
2060  38 if (mappedFromLength == dnaLength
2061    || mappedFromLength == dnaLength - CODON_LENGTH)
2062    {
2063    /*
2064    * if sequence has CDS features, this is a transcript with no UTR
2065    * - do not take this as the CDS sequence! (JAL-2789)
2066    */
2067  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2068    .isEmpty())
2069    {
2070  1 return seqDss;
2071    }
2072    }
2073   
2074    /*
2075    * looks like we found the dna-to-protein mapping; search for the
2076    * corresponding cds-to-protein mapping
2077    */
2078  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2079    .findMappingsForSequence(proteinProduct, mappings);
2080  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2081    {
2082  52 for (SequenceToSequenceMapping map : acf.getMappings())
2083    {
2084  276 Mapping mapping = map.getMapping();
2085  276 if (mapping != aMapping
2086    && mapping.getMap().getFromRatio() == CODON_LENGTH
2087    && proteinProduct == mapping.getTo()
2088    && seqDss != map.getFromSeq())
2089    {
2090  15 mappedFromLength = MappingUtils
2091    .getLength(mapping.getMap().getFromRanges());
2092  15 if (mappedFromLength == map.getFromSeq().getLength())
2093    {
2094    /*
2095    * found a 3:1 mapping to the protein product which covers
2096    * the whole dna sequence i.e. is from CDS; finally check the CDS
2097    * is mapped from the given dna start sequence
2098    */
2099  15 SequenceI cdsSeq = map.getFromSeq();
2100    // todo this test is weak if seqMappings contains multiple mappings;
2101    // we get away with it if transcript:cds relationship is 1:1
2102  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2103    .findMappingsForSequence(cdsSeq, seqMappings);
2104  15 if (!dnaToCdsMaps.isEmpty())
2105    {
2106  13 return cdsSeq;
2107    }
2108    }
2109    }
2110    }
2111    }
2112  24 return null;
2113    }
2114   
2115    /**
2116    * Helper method that makes a CDS sequence as defined by the mappings from the
2117    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2118    * forward or reverse strand).
2119    *
2120    * @param seq
2121    * @param mapping
2122    * @param dataset
2123    * - existing dataset. We check for sequences that look like the CDS
2124    * we are about to construct, if one exists already, then we will
2125    * just return that one.
2126    * @return CDS sequence (as a dataset sequence)
2127    */
 
2128  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2129    AlignmentI dataset)
2130    {
2131    /*
2132    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2133    * if set (e.g. EMBL protein_id), else sequence name appended
2134    */
2135  20 String mapFromId = mapping.getMappedFromId();
2136  20 final String seqId = "CDS|"
2137  20 + (mapFromId != null ? mapFromId : seq.getName());
2138   
2139  20 SequenceI newSeq = null;
2140   
2141    /*
2142    * construct CDS sequence by splicing mapped from ranges
2143    */
2144  20 char[] seqChars = seq.getSequence();
2145  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2146  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2147  20 char[] newSeqChars = new char[cdsWidth];
2148   
2149  20 int newPos = 0;
2150  20 for (int[] range : fromRanges)
2151    {
2152  32 if (range[0] <= range[1])
2153    {
2154    // forward strand mapping - just copy the range
2155  32 int length = range[1] - range[0] + 1;
2156  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2157    length);
2158  32 newPos += length;
2159    }
2160    else
2161    {
2162    // reverse strand mapping - copy and complement one by one
2163  0 for (int i = range[0]; i >= range[1]; i--)
2164    {
2165  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2166    }
2167    }
2168   
2169  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2170    }
2171   
2172  20 if (dataset != null)
2173    {
2174  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2175  20 if (matches != null)
2176    {
2177  20 boolean matched = false;
2178  20 for (SequenceI mtch : matches)
2179    {
2180  3 if (mtch.getStart() != newSeq.getStart())
2181    {
2182  0 continue;
2183    }
2184  3 if (mtch.getEnd() != newSeq.getEnd())
2185    {
2186  0 continue;
2187    }
2188  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2189    {
2190  3 continue;
2191    }
2192  0 if (!matched)
2193    {
2194  0 matched = true;
2195  0 newSeq = mtch;
2196    }
2197    else
2198    {
2199  0 Console.error(
2200    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2201    + mtch.toString());
2202    }
2203    }
2204    }
2205    }
2206    // newSeq.setDescription(mapFromId);
2207   
2208  20 return newSeq;
2209    }
2210   
2211    /**
2212    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2213    * the given mapping.
2214    *
2215    * @param cdsSeq
2216    * @param contig
2217    * @param proteinProduct
2218    * @param mapping
2219    * @return list of DBRefEntrys added
2220    */
 
2221  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2222    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2223    {
2224   
2225    // gather direct refs from contig congruent with mapping
2226  20 List<DBRefEntry> direct = new ArrayList<>();
2227  20 HashSet<String> directSources = new HashSet<>();
2228   
2229  20 List<DBRefEntry> refs = contig.getDBRefs();
2230  20 if (refs != null)
2231    {
2232  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2233    {
2234  279 DBRefEntry dbr = refs.get(ib);
2235  279 MapList map;
2236  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2237    {
2238    // check if map is the CDS mapping
2239  24 if (mapping.getMap().equals(map))
2240    {
2241  24 direct.add(dbr);
2242  24 directSources.add(dbr.getSource());
2243    }
2244    }
2245    }
2246    }
2247  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2248    proteinProduct.getDBRefs(),
2249    directSources.toArray(new String[0]));
2250  20 List<DBRefEntry> propagated = new ArrayList<>();
2251   
2252    // and generate appropriate mappings
2253  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2254    {
2255  24 DBRefEntry cdsref = direct.get(ic);
2256  24 Mapping m = cdsref.getMap();
2257    // clone maplist and mapping
2258  24 MapList cdsposmap = new MapList(
2259    Arrays.asList(new int[][]
2260    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2261    m.getMap().getToRanges(), 3, 1);
2262  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2263   
2264    // create dbref
2265  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2266    cdsref.getVersion(), cdsref.getAccessionId(),
2267    new Mapping(cdsmap.getTo(), cdsposmap));
2268   
2269    // and see if we can map to the protein product for this mapping.
2270    // onSource is the filtered set of accessions on protein that we are
2271    // tranferring, so we assume accession is the same.
2272  24 if (cdsmap.getTo() == null && onSource != null)
2273    {
2274  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2275    cdsref.getAccessionId());
2276  2 if (sourceRefs != null)
2277    {
2278  2 for (DBRefEntry srcref : sourceRefs)
2279    {
2280  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2281    {
2282    // we have found a complementary dbref on the protein product, so
2283    // update mapping's getTo
2284  2 newref.getMap().setTo(proteinProduct);
2285    }
2286    }
2287    }
2288    }
2289  24 cdsSeq.addDBRef(newref);
2290  24 propagated.add(newref);
2291    }
2292  20 return propagated;
2293    }
2294   
2295    /**
2296    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2297    * feature start/end ranges, optionally omitting specified feature types.
2298    * Returns the number of features copied.
2299    *
2300    * @param fromSeq
2301    * @param toSeq
2302    * @param mapping
2303    * the mapping from 'fromSeq' to 'toSeq'
2304    * @param select
2305    * if not null, only features of this type are copied (including
2306    * subtypes in the Sequence Ontology)
2307    * @param omitting
2308    */
 
2309  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2310    MapList mapping, String select, String... omitting)
2311    {
2312  23 SequenceI copyTo = toSeq;
2313  43 while (copyTo.getDatasetSequence() != null)
2314    {
2315  20 copyTo = copyTo.getDatasetSequence();
2316    }
2317  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2318    {
2319  0 return 0; // shared dataset sequence
2320    }
2321   
2322    /*
2323    * get features, optionally restricted by an ontology term
2324    */
2325  23 List<SequenceFeature> sfs = select == null
2326    ? fromSeq.getFeatures().getPositionalFeatures()
2327    : fromSeq.getFeatures().getFeaturesByOntology(select);
2328   
2329  23 int count = 0;
2330  23 for (SequenceFeature sf : sfs)
2331    {
2332  9610 String type = sf.getType();
2333  9610 boolean omit = false;
2334  9610 for (String toOmit : omitting)
2335    {
2336  9603 if (type.equals(toOmit))
2337    {
2338  134 omit = true;
2339    }
2340    }
2341  9610 if (omit)
2342    {
2343  134 continue;
2344    }
2345   
2346    /*
2347    * locate the mapped range - null if either start or end is
2348    * not mapped (no partial overlaps are calculated)
2349    */
2350  9476 int start = sf.getBegin();
2351  9476 int end = sf.getEnd();
2352  9476 int[] mappedTo = mapping.locateInTo(start, end);
2353    /*
2354    * if whole exon range doesn't map, try interpreting it
2355    * as 5' or 3' exon overlapping the CDS range
2356    */
2357  9476 if (mappedTo == null)
2358    {
2359  4447 mappedTo = mapping.locateInTo(end, end);
2360  4447 if (mappedTo != null)
2361    {
2362    /*
2363    * end of exon is in CDS range - 5' overlap
2364    * to a range from the start of the peptide
2365    */
2366  0 mappedTo[0] = 1;
2367    }
2368    }
2369  9476 if (mappedTo == null)
2370    {
2371  4447 mappedTo = mapping.locateInTo(start, start);
2372  4447 if (mappedTo != null)
2373    {
2374    /*
2375    * start of exon is in CDS range - 3' overlap
2376    * to a range up to the end of the peptide
2377    */
2378  0 mappedTo[1] = toSeq.getLength();
2379    }
2380    }
2381  9476 if (mappedTo != null)
2382    {
2383  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2384  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2385  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2386    sf.getFeatureGroup(), sf.getScore());
2387  5029 copyTo.addSequenceFeature(copy);
2388  5029 count++;
2389    }
2390    }
2391  23 return count;
2392    }
2393   
2394    /**
2395    * Returns a mapping from dna to protein by inspecting sequence features of
2396    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2397    * length is 3 times the peptide length (optionally after dropping a trailing
2398    * stop codon). This method does not check whether the CDS nucleotide sequence
2399    * translates to the peptide sequence.
2400    *
2401    * @param dnaSeq
2402    * @param proteinSeq
2403    * @return
2404    */
 
2405  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2406    SequenceI proteinSeq)
2407    {
2408  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2409  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2410   
2411    /*
2412    * if not a whole number of codons, truncate mapping
2413    */
2414  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2415  22 if (codonRemainder > 0)
2416    {
2417  2 mappedDnaLength -= codonRemainder;
2418  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2419    }
2420   
2421  22 int proteinLength = proteinSeq.getLength();
2422  22 int proteinStart = proteinSeq.getStart();
2423  22 int proteinEnd = proteinSeq.getEnd();
2424   
2425    /*
2426    * incomplete start codon may mean X at start of peptide
2427    * we ignore both for mapping purposes
2428    */
2429  22 if (proteinSeq.getCharAt(0) == 'X')
2430    {
2431    // todo JAL-2022 support startPhase > 0
2432  1 proteinStart++;
2433  1 proteinLength--;
2434    }
2435  22 List<int[]> proteinRange = new ArrayList<>();
2436   
2437    /*
2438    * dna length should map to protein (or protein plus stop codon)
2439    */
2440  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2441  22 if (codesForResidues == (proteinLength + 1))
2442    {
2443    // assuming extra codon is for STOP and not in peptide
2444    // todo: check trailing codon is indeed a STOP codon
2445  2 codesForResidues--;
2446  2 mappedDnaLength -= CODON_LENGTH;
2447  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2448    }
2449   
2450  22 if (codesForResidues == proteinLength)
2451    {
2452  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2453  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2454    }
2455  18 return null;
2456    }
2457   
2458    /**
2459    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2460    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2461    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2462    * position order, so this method is only valid for linear CDS in the same
2463    * sense as the protein product.
2464    *
2465    * @param dnaSeq
2466    * @return
2467    */
 
2468  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2469    {
2470  24 List<int[]> result = new ArrayList<>();
2471   
2472  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2473    .getFeaturesByOntology(SequenceOntologyI.CDS);
2474  24 if (sfs.isEmpty())
2475    {
2476  16 return result;
2477    }
2478  8 SequenceFeatures.sortFeatures(sfs, true);
2479   
2480  8 for (SequenceFeature sf : sfs)
2481    {
2482  16 int phase = 0;
2483  16 try
2484    {
2485  16 String s = sf.getPhase();
2486  16 if (s != null)
2487    {
2488  2 phase = Integer.parseInt(s);
2489    }
2490    } catch (NumberFormatException e)
2491    {
2492    // leave as zero
2493    }
2494    /*
2495    * phase > 0 on first codon means 5' incomplete - skip to the start
2496    * of the next codon; example ENST00000496384
2497    */
2498  16 int begin = sf.getBegin();
2499  16 int end = sf.getEnd();
2500  16 if (result.isEmpty() && phase > 0)
2501    {
2502  2 begin += phase;
2503  2 if (begin > end)
2504    {
2505    // shouldn't happen!
2506  0 System.err
2507    .println("Error: start phase extends beyond start CDS in "
2508    + dnaSeq.getName());
2509    }
2510    }
2511  16 result.add(new int[] { begin, end });
2512    }
2513   
2514    /*
2515    * Finally sort ranges by start position. This avoids a dependency on
2516    * keeping features in order on the sequence (if they are in order anyway,
2517    * the sort will have almost no work to do). The implicit assumption is CDS
2518    * ranges are assembled in order. Other cases should not use this method,
2519    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2520    */
2521  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2522  8 return result;
2523    }
2524   
2525    /**
2526    * Makes an alignment with a copy of the given sequences, adding in any
2527    * non-redundant sequences which are mapped to by the cross-referenced
2528    * sequences.
2529    *
2530    * @param seqs
2531    * @param xrefs
2532    * @param dataset
2533    * the alignment dataset shared by the new copy
2534    * @return
2535    */
 
2536  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2537    SequenceI[] xrefs, AlignmentI dataset)
2538    {
2539  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2540  0 copy.setDataset(dataset);
2541  0 boolean isProtein = !copy.isNucleotide();
2542  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2543  0 if (xrefs != null)
2544    {
2545    // BH 2019.01.25 recoded to remove iterators
2546   
2547  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2548    {
2549  0 SequenceI xref = xrefs[ix];
2550  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2551  0 if (dbrefs != null)
2552    {
2553  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2554    {
2555  0 DBRefEntry dbref = dbrefs.get(ir);
2556  0 Mapping map = dbref.getMap();
2557  0 SequenceI mto;
2558  0 if (map == null || (mto = map.getTo()) == null
2559    || mto.isProtein() != isProtein)
2560    {
2561  0 continue;
2562    }
2563  0 SequenceI mappedTo = mto;
2564  0 SequenceI match = matcher.findIdMatch(mappedTo);
2565  0 if (match == null)
2566    {
2567  0 matcher.add(mappedTo);
2568  0 copy.addSequence(mappedTo);
2569    }
2570    }
2571    }
2572    }
2573    }
2574  0 return copy;
2575    }
2576   
2577    /**
2578    * Try to align sequences in 'unaligned' to match the alignment of their
2579    * mapped regions in 'aligned'. For example, could use this to align CDS
2580    * sequences which are mapped to their parent cDNA sequences.
2581    *
2582    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2583    * dna-to-protein or protein-to-dna use alternative methods.
2584    *
2585    * @param unaligned
2586    * sequences to be aligned
2587    * @param aligned
2588    * holds aligned sequences and their mappings
2589    * @return
2590    */
 
2591  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2592    {
2593    /*
2594    * easy case - aligning a copy of aligned sequences
2595    */
2596  4 if (alignAsSameSequences(unaligned, aligned))
2597    {
2598  0 return unaligned.getHeight();
2599    }
2600   
2601    /*
2602    * fancy case - aligning via mappings between sequences
2603    */
2604  4 List<SequenceI> unmapped = new ArrayList<>();
2605  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2606    unaligned, aligned, unmapped);
2607  4 int width = columnMap.size();
2608  4 char gap = unaligned.getGapCharacter();
2609  4 int realignedCount = 0;
2610    // TODO: verify this loop scales sensibly for very wide/high alignments
2611   
2612  4 for (SequenceI seq : unaligned.getSequences())
2613    {
2614  26 if (!unmapped.contains(seq))
2615    {
2616  26 char[] newSeq = new char[width];
2617  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2618    // Integer iteration below
2619  26 int newCol = 0;
2620  26 int lastCol = 0;
2621   
2622    /*
2623    * traverse the map to find columns populated
2624    * by our sequence
2625    */
2626  26 for (Integer column : columnMap.keySet())
2627    {
2628  58976 Character c = columnMap.get(column).get(seq);
2629  58976 if (c != null)
2630    {
2631    /*
2632    * sequence has a character at this position
2633    *
2634    */
2635  31986 newSeq[newCol] = c;
2636  31986 lastCol = newCol;
2637    }
2638  58976 newCol++;
2639    }
2640   
2641    /*
2642    * trim trailing gaps
2643    */
2644  26 if (lastCol < width)
2645    {
2646  26 char[] tmp = new char[lastCol + 1];
2647  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2648  26 newSeq = tmp;
2649    }
2650    // TODO: optimise SequenceI to avoid char[]->String->char[]
2651  26 seq.setSequence(String.valueOf(newSeq));
2652  26 realignedCount++;
2653    }
2654    }
2655  4 return realignedCount;
2656    }
2657   
2658    /**
2659    * If unaligned and aligned sequences share the same dataset sequences, then
2660    * simply copies the aligned sequences to the unaligned sequences and returns
2661    * true; else returns false
2662    *
2663    * @param unaligned
2664    * - sequences to be aligned based on aligned
2665    * @param aligned
2666    * - 'guide' alignment containing sequences derived from same dataset
2667    * as unaligned
2668    * @return
2669    */
 
2670  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2671    AlignmentI aligned)
2672    {
2673  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2674    {
2675  0 return false; // should only pass alignments with datasets here
2676    }
2677   
2678    // map from dataset sequence to alignment sequence(s)
2679  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2680  8 for (SequenceI seq : aligned.getSequences())
2681    {
2682  59 SequenceI ds = seq.getDatasetSequence();
2683  59 if (alignedDatasets.get(ds) == null)
2684    {
2685  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2686    }
2687  59 alignedDatasets.get(ds).add(seq);
2688    }
2689   
2690    /*
2691    * first pass - check whether all sequences to be aligned share a
2692    * dataset sequence with an aligned sequence; also note the leftmost
2693    * ungapped column from which to copy
2694    */
2695  8 int leftmost = Integer.MAX_VALUE;
2696  8 for (SequenceI seq : unaligned.getSequences())
2697    {
2698  14 final SequenceI ds = seq.getDatasetSequence();
2699  14 if (!alignedDatasets.containsKey(ds))
2700    {
2701  5 return false;
2702    }
2703  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2704  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2705  9 leftmost = Math.min(leftmost, startCol);
2706    }
2707   
2708    /*
2709    * second pass - copy aligned sequences;
2710    * heuristic rule: pair off sequences in order for the case where
2711    * more than one shares the same dataset sequence
2712    */
2713  3 final char gapCharacter = aligned.getGapCharacter();
2714  3 for (SequenceI seq : unaligned.getSequences())
2715    {
2716  7 List<SequenceI> alignedSequences = alignedDatasets
2717    .get(seq.getDatasetSequence());
2718  7 if (alignedSequences.isEmpty())
2719    {
2720    /*
2721    * defensive check - shouldn't happen! (JAL-3536)
2722    */
2723  0 continue;
2724    }
2725  7 SequenceI alignedSeq = alignedSequences.get(0);
2726   
2727    /*
2728    * gap fill for leading (5') UTR if any
2729    */
2730    // TODO this copies intron columns - wrong!
2731  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2732  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2733  7 char[] seqchars = new char[endCol - leftmost + 1];
2734  7 Arrays.fill(seqchars, gapCharacter);
2735  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2736  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2737    toCopy.length);
2738  7 seq.setSequence(String.valueOf(seqchars));
2739  7 if (alignedSequences.size() > 0)
2740    {
2741    // pop off aligned sequences (except the last one)
2742  7 alignedSequences.remove(0);
2743    }
2744    }
2745   
2746    /*
2747    * finally remove gapped columns (e.g. introns)
2748    */
2749  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2750    unaligned.getWidth() - 1, unaligned);
2751   
2752  3 return true;
2753    }
2754   
2755    /**
2756    * Returns a map whose key is alignment column number (base 1), and whose
2757    * values are a map of sequence characters in that column.
2758    *
2759    * @param unaligned
2760    * @param aligned
2761    * @param unmapped
2762    * @return
2763    */
 
2764  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2765    AlignmentI unaligned, AlignmentI aligned,
2766    List<SequenceI> unmapped)
2767    {
2768    /*
2769    * Map will hold, for each aligned column position, a map of
2770    * {unalignedSequence, characterPerSequence} at that position.
2771    * TreeMap keeps the entries in ascending column order.
2772    */
2773  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2774   
2775    /*
2776    * record any sequences that have no mapping so can't be realigned
2777    */
2778  4 unmapped.addAll(unaligned.getSequences());
2779   
2780  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2781   
2782  4 for (SequenceI seq : unaligned.getSequences())
2783    {
2784  26 for (AlignedCodonFrame mapping : mappings)
2785    {
2786  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2787  510 if (fromSeq != null)
2788    {
2789  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2790  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2791    {
2792  26 unmapped.remove(seq);
2793    }
2794    }
2795    }
2796    }
2797  4 return map;
2798    }
2799   
2800    /**
2801    * Helper method that adds to a map the mapped column positions of a sequence.
2802    * <br>
2803    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2804    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2805    * sequence.
2806    *
2807    * @param seq
2808    * the sequence whose column positions we are recording
2809    * @param fromSeq
2810    * a sequence that is mapped to the first sequence
2811    * @param seqMap
2812    * the mapping from 'fromSeq' to 'seq'
2813    * @param map
2814    * a map to add the column positions (in fromSeq) of the mapped
2815    * positions of seq
2816    * @return
2817    */
 
2818  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2819    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2820    {
2821  28 if (seqMap == null)
2822    {
2823  0 return false;
2824    }
2825   
2826    /*
2827    * invert mapping if it is from unaligned to aligned sequence
2828    */
2829  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2830    {
2831  0 seqMap = new Mapping(seq.getDatasetSequence(),
2832    seqMap.getMap().getInverse());
2833    }
2834   
2835  28 int toStart = seq.getStart();
2836   
2837    /*
2838    * traverse [start, end, start, end...] ranges in fromSeq
2839    */
2840  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2841    {
2842  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2843    {
2844  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2845   
2846    /*
2847    * find the range mapped to (sequence positions base 1)
2848    */
2849  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2850    fromRange[i + 1]);
2851  31 if (range == null)
2852    {
2853  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2854    + " from " + fromSeq.getName());
2855  0 return false;
2856    }
2857  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2858  31 int mappedCharPos = range[0];
2859   
2860    /*
2861    * walk over the 'from' aligned sequence in forward or reverse
2862    * direction; when a non-gap is found, record the column position
2863    * of the next character of the mapped-to sequence; stop when all
2864    * the characters of the range have been counted
2865    */
2866  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2867    && fromCol >= 0)
2868    {
2869  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2870    {
2871    /*
2872    * mapped from sequence has a character in this column
2873    * record the column position for the mapped to character
2874    */
2875  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2876  31998 if (seqsMap == null)
2877    {
2878  5398 seqsMap = new HashMap<>();
2879  5398 map.put(fromCol, seqsMap);
2880    }
2881  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2882  31998 mappedCharPos++;
2883    }
2884  2794243 fromCol += (forward ? 1 : -1);
2885    }
2886    }
2887    }
2888  28 return true;
2889    }
2890   
2891    // strictly temporary hack until proper criteria for aligning protein to cds
2892    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2893  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2894    {
2895  4 for (SequenceI seq : alignment.getSequences())
2896    {
2897  88 String name = seq.getName();
2898  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2899    {
2900  0 return false;
2901    }
2902    }
2903  4 return true;
2904    }
2905   
2906    /**
2907    * Creates a deep copy of an alignment along with a dataset alignment,
2908    * alignment annotations, representative sequence and hidden columns.
2909    */
 
2910  0 toggle public static AlignmentI deepCopyAlignment(AlignmentI alignment)
2911    {
2912  0 var alnCpy = new Alignment(alignment);
2913  0 alnCpy.setGapCharacter(alignment.getGapCharacter());
2914  0 alnCpy.setDataset(alignment.getDataset());
2915  0 for (AlignmentAnnotation annotation : alignment.getAlignmentAnnotation())
2916  0 alnCpy.addAnnotation(new AlignmentAnnotation(annotation));
2917  0 if (alignment.hasSeqrep())
2918    {
2919  0 int idx = alignment.findIndex(alignment.getSeqrep());
2920  0 if (idx >= 0)
2921  0 alnCpy.setSeqrep(alnCpy.getSequenceAt(idx));
2922    }
2923  0 if (alignment.getHiddenColumns() != null)
2924  0 alnCpy.setHiddenColumns(new HiddenColumns(alignment.getHiddenColumns()));
2925  0 return alnCpy;
2926    }
2927   
2928    /**
2929    * This method filters and returns the secondary structure annotations
2930    * in an array of annotations
2931    *
2932    * @param annotations
2933    * @return array of secondary structure annotations
2934    */
 
2935  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnots(
2936    AlignmentAnnotation[] annotations)
2937    {
2938  0 List<AlignmentAnnotation> ssAnnotations = new ArrayList<>();
2939  0 if (annotations == null || annotations.length == 0)
2940    {
2941  0 return null;
2942    }
2943   
2944  0 synchronized (annotations)
2945    {
2946  0 for (AlignmentAnnotation aa : annotations)
2947    {
2948  0 if (aa == null)
2949    {
2950  0 continue;
2951    }
2952   
2953  0 if (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2954    .containsKey(aa.label))
2955    {
2956  0 ssAnnotations.add(aa);
2957    }
2958    }
2959    }
2960  0 return ssAnnotations;
2961   
2962    }
2963   
 
2964  19 toggle public static boolean isSecondaryStructurePresent(
2965    AlignmentAnnotation[] annotations)
2966    {
2967  19 boolean ssPresent = false;
2968  19 if (annotations == null || annotations.length == 0)
2969    {
2970  8 return false;
2971    }
2972   
2973  11 synchronized (annotations)
2974    {
2975  11 for (AlignmentAnnotation aa : annotations)
2976    {
2977  12 if (aa == null)
2978    {
2979  0 continue;
2980    }
2981    // TODO: @RENIA decide what exactly we should be testing for - not
2982    // strings/labels
2983  12 if (// aa.isValidStruc() || aa.hasIcons ||
2984  12 (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2985    .containsKey(aa.label)))
2986    {
2987  9 ssPresent = true;
2988  9 break;
2989    }
2990    }
2991    }
2992  11 return ssPresent;
2993   
2994    }
2995   
 
2996  4 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2997    {
2998   
2999  4 if (symbol == Constants.COIL)
3000    {
3001  1 return Color.gray;
3002    }
3003  3 if (symbol == Constants.SHEET)
3004    {
3005  1 return Color.green;
3006    }
3007  2 if (symbol == Constants.HELIX)
3008    {
3009  1 return Color.red;
3010    }
3011   
3012  1 return Color.white;
3013    }
3014   
 
3015  106089 toggle public static char findSSAnnotationForGivenSeqposition(
3016    AlignmentAnnotation aa, int seqPosition)
3017    {
3018  106091 char ss = '*';
3019   
3020  106081 if (aa != null)
3021    {
3022  106092 if (aa.getAnnotationForPosition(seqPosition) != null)
3023    {
3024  49840 Annotation a = aa.getAnnotationForPosition(seqPosition);
3025  49832 ss = a.secondaryStructure;
3026   
3027    // There is no representation for coil and it can be either ' ' or null.
3028  49832 if (ss == ' ' || ss == '-')
3029    {
3030  10921 ss = Constants.COIL;
3031    }
3032    }
3033    else
3034    {
3035  56248 ss = Constants.COIL;
3036    }
3037    }
3038   
3039  106052 return ss;
3040    }
3041   
 
3042  3956 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
3043    AlignmentAnnotation[] annotations)
3044    {
3045   
3046  3956 List<String> ssSources = new ArrayList<>();
3047  3956 Set<String> addedSources = new HashSet<>(); // to keep track of added
3048    // sources
3049   
3050  3956 if (annotations == null)
3051    {
3052  12 return ssSources;
3053    }
3054   
3055  3944 for (AlignmentAnnotation aa : annotations)
3056    {
3057   
3058  20483 String ssSource = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(aa);
3059   
3060  20483 if (ssSource != null && !addedSources.contains(ssSource))
3061    {
3062  300 ssSources.add(ssSource);
3063  300 addedSources.add(ssSource);
3064    }
3065   
3066    }
3067  3944 Collections.sort(ssSources);
3068   
3069  3944 return ssSources;
3070   
3071    }
3072   
3073   
3074    /**
3075    * Returns secondary structure annotations from the given sequence for a specified source.
3076    * @param seq the sequence
3077    * @param ssSource the source label ("PDB", "JPred")
3078    * @return list of matching annotations
3079    */
 
3080  6269449 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3081    SequenceI seq, String ssSource)
3082    {
3083   
3084  6269392 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3085   
3086    // Iterate through all known secondary structure labels
3087  6269400 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3088    {
3089   
3090  12534438 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3091  12534517 if (aa != null)
3092    {
3093    // If source is "ALL", include all annotations for this label
3094  169668 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3095    {
3096  63191 ssAnnots.addAll(Arrays.asList(aa));
3097  63183 continue;
3098    }
3099   
3100  106478 for (AlignmentAnnotation annot : aa)
3101    {
3102  128124 String ssSourceForAnnot = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(
3103    annot);
3104    // Add annotations from the matching source
3105  128124 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot)
3106    && annot.isForDisplay())
3107    {
3108  51456 ssAnnots.add(annot);
3109    }
3110    }
3111    }
3112    }
3113  6268988 if (ssAnnots.size() > 0)
3114    {
3115  96457 return ssAnnots;
3116    }
3117   
3118  6172623 return null;
3119   
3120    }
3121   
 
3122  48 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3123    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3124    {
3125   
3126  48 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3127  48 if (alignAnnotList == null || alignAnnotList.length == 0)
3128    {
3129  0 return ssAlignmentAnnotationForSequences;
3130    }
3131   
3132  48 for (AlignmentAnnotation aa : alignAnnotList)
3133    {
3134  251 if (aa.sequenceRef == null)
3135    {
3136  32 continue;
3137    }
3138   
3139  219 if (isSecondaryStructureFrom(selectedSSSource, aa))
3140    {
3141  78 ssAlignmentAnnotationForSequences
3142    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3143    k -> new ArrayList<>())
3144    .add(aa);
3145    }
3146    }
3147   
3148  48 return ssAlignmentAnnotationForSequences;
3149   
3150    }
3151   
3152    /**
3153    * param alignAnnotation
3154    * @param selectedSSSource
3155    * @return any alignment annotation that matches the given source
3156    */
 
3157  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnotionFor(List<AlignmentAnnotation> alignAnnotation, String selectedSSSource)
3158    {
3159  0 ArrayList<AlignmentAnnotation> annForSource = new ArrayList();
3160  0 for (AlignmentAnnotation alan:alignAnnotation)
3161    {
3162  0 if (isSecondaryStructureFrom(selectedSSSource, alan)) {
3163  0 annForSource.add(alan);
3164    }
3165    }
3166  0 return annForSource;
3167    }
3168   
3169    /**
3170    *
3171    * @param selectedSSSource
3172    * @param aa
3173    * @return true if aa is from a provider or all providers as specified by
3174    * selectedSSSource
3175    */
 
3176  219 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3177    AlignmentAnnotation aa)
3178    {
3179   
3180  219 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3181    {
3182   
3183  376 if (label.equals(aa.label))
3184    {
3185   
3186  170 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3187    {
3188  38 return true;
3189    }
3190  132 String ssSource = AlignmentAnnotationUtils
3191    .extractSSSourceFromAnnotationDescription(aa);
3192  132 if (ssSource != null && ssSource.equals(selectedSSSource))
3193    {
3194  40 return true;
3195    }
3196    }
3197    }
3198  141 return false;
3199    }
3200   
3201    // Method to get the key for a given provider value
 
3202  0 toggle public static String getSecondaryStructureProviderKey(
3203    String providerValue)
3204    {
3205  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS
3206    .entrySet())
3207    {
3208  0 if (entry.getValue().equals(providerValue))
3209    {
3210  0 return entry.getKey(); // Return the key (abbreviation) for the matching
3211    // provider value
3212    }
3213    }
3214  0 return null; // Return null if no match is found
3215    }
3216   
 
3217  0 toggle public static String reduceLabelLength(String label)
3218    {
3219    // Split the input by " | "
3220  0 StringBuilder reducedLabel = new StringBuilder();
3221  0 for (String part:label.split(" \\| ")) {
3222  0 String nextLabel=part;
3223  0 for (Entry<String,String> provider:Constants.STRUCTURE_PROVIDERS.entrySet()) {
3224  0 if (part.equals(provider.getValue())) {
3225  0 nextLabel = provider.getKey();
3226    }
3227    }
3228  0 if (reducedLabel.length()>0)
3229    {
3230  0 reducedLabel.append(" | ");
3231    }
3232  0 reducedLabel.append(nextLabel);
3233    }
3234  0 return reducedLabel.toString();
3235    }
3236   
 
3237  596 toggle public static HashMap<String, Color> assignColorsForSecondaryStructureProviders(
3238    List<String> labels)
3239    {
3240    // Loop through each unique label and add it to the map with a colour.
3241    // for (String label : uniqueLabels)
3242   
3243  596 HashMap<String, Color> secondaryStructureProviderColorMap = new HashMap<String, Color>();
3244  596 for (String label : labels)
3245    {
3246    // Generate or retrieve a colour for the label.
3247  544 String name = label.toUpperCase(Locale.ROOT).trim();
3248  544 secondaryStructureProviderColorMap.put(name,
3249    ColorUtils.getColourFromNameAndScheme(name, "NONE"));
3250    }
3251   
3252  596 return secondaryStructureProviderColorMap;
3253    }
3254   
 
3255  6 toggle public static int computeMaxShifts(SequenceI[] seqs, ShiftList inserts)
3256    {
3257   
3258  6 int oldwidth = 0;
3259  6 int p = 0, lastP = 0, lastIns = 0;
3260  6 for (SequenceI seq : seqs)
3261    {
3262  140 char[] sqs = seq.getSequence();
3263    // System.out.println(seq.getSequenceAsString());
3264  140 if (oldwidth < sqs.length)
3265    {
3266  13 oldwidth = sqs.length;
3267    }
3268  140 p = 0;
3269  140 lastP = 0;
3270  140 lastIns = 0;
3271  140 int modelOffset = 0; // shift between index and insertions encountered
3272  140 do
3273    {
3274  53877 if (sqs[p] >= 'a' && sqs[p] <= 'z')
3275    {
3276  2015 if (lastIns == 0)
3277    {
3278  589 lastP = (modelOffset + p) * 2;
3279    }
3280  2015 lastIns++;
3281    }
3282    else
3283    {
3284  51862 if (lastIns > 0)
3285    {
3286  586 inserts.extendShift(lastP, lastIns);
3287  586 modelOffset -= lastIns;
3288    }
3289  51862 lastIns = 0;
3290    }
3291  53877 } while (++p < sqs.length);
3292  140 if (lastIns > 0)
3293    {
3294  3 inserts.extendShift(lastP, lastIns);
3295    }
3296    }
3297  6 return oldwidth;
3298    }
3299   
3300    /**
3301    * quick and dirty lowercase - to - insert expansion
3302    *
3303    * @param seqs
3304    * - these will be direcly modified
3305    * @return message indicating how many gaps were inserted.
3306    */
 
3307  4 toggle public static String a3mToMSA(SequenceI[] seqs)
3308    {
3309  4 String response = "";
3310  4 ShiftList inserts = new ShiftList();
3311  4 int oldwidth = computeMaxShifts(seqs, inserts);
3312   
3313  4 int newwidth = insertShifts(seqs,inserts);
3314  4 if (oldwidth != newwidth)
3315    {
3316  4 response = "Added " + (newwidth - oldwidth) + " inserts.";
3317    }
3318  4 return response;
3319    }
3320   
 
3321  5 toggle public static int insertShifts(SequenceI[] seqs, ShiftList inserts)
3322    {
3323  5 int newwidth=0;
3324  5 for (SequenceI seq : seqs)
3325    {
3326  80 boolean changed=false;
3327  80 int posShift = 0, totalShift = 0;
3328  80 for (int[] shift : inserts.getShifts())
3329    {
3330    // locate this shift in the totalShift'ed sequence
3331  5912 posShift = (shift[0] / 2);
3332  5912 int len = shift[1];
3333   
3334    // account for any insertions in this sequence
3335  5912 int p = posShift + totalShift;
3336  7021 while (len > 0 && p < seq.getLength()
3337    && (seq.getCharAt(p) >= 'a' && seq.getCharAt(p) <= 'z'))
3338    {
3339  1109 p++;
3340  1109 len--;
3341    }
3342    // and mark any remaining after the insert
3343  5912 if (len > 0)
3344    {
3345  5670 changed=true;
3346  5670 seq.doInsert(p, len, '-');
3347    }
3348  5912 totalShift += shift[1]; // update shifted positions in this sequence
3349    }
3350  80 if (changed)
3351    {
3352  78 seq.sequenceChanged();
3353    }
3354  80 newwidth = (seq.getLength() > newwidth) ? seq.getLength() : newwidth;
3355    }
3356  5 return newwidth;
3357    }
3358    }