Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 20:42:03 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
12% of files have more coverage

Code metrics

468
928
60
2
3,283
1,991
358
0.39
15.47
30
5.97

Classes

Class Line # Actions
AlignmentUtils 80 922 352
0.819694982%
AlignmentUtils.DnaVariant 96 6 6
0.00%
 

Contributing tests

This file is covered by 286 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.GeneLociI;
55    import jalview.datamodel.IncompleteCodonException;
56    import jalview.datamodel.Mapping;
57    import jalview.datamodel.PDBEntry;
58    import jalview.datamodel.Sequence;
59    import jalview.datamodel.SequenceFeature;
60    import jalview.datamodel.SequenceGroup;
61    import jalview.datamodel.SequenceI;
62    import jalview.datamodel.features.SequenceFeatures;
63    import jalview.io.gff.SequenceOntologyI;
64    import jalview.schemes.ResidueProperties;
65    import jalview.util.ColorUtils;
66    import jalview.util.Comparison;
67    import jalview.util.Constants;
68    import jalview.util.DBRefUtils;
69    import jalview.util.IntRangeComparator;
70    import jalview.util.MapList;
71    import jalview.util.MappingUtils;
72   
73    /**
74    * grab bag of useful alignment manipulation operations Expect these to be
75    * refactored elsewhere at some point.
76    *
77    * @author jimp
78    *
79    */
 
80    public class AlignmentUtils
81    {
82    private static final int CODON_LENGTH = 3;
83   
84    private static final String SEQUENCE_VARIANT = "sequence_variant:";
85   
86    /*
87    * the 'id' attribute is provided for variant features fetched from
88    * Ensembl using its REST service with JSON format
89    */
90    public static final String VARIANT_ID = "id";
91   
92    /**
93    * A data model to hold the 'normal' base value at a position, and an optional
94    * sequence variant feature
95    */
 
96    static final class DnaVariant
97    {
98    final String base;
99   
100    SequenceFeature variant;
101   
 
102  0 toggle DnaVariant(String nuc)
103    {
104  0 base = nuc;
105  0 variant = null;
106    }
107   
 
108  0 toggle DnaVariant(String nuc, SequenceFeature var)
109    {
110  0 base = nuc;
111  0 variant = var;
112    }
113   
 
114  0 toggle public String getSource()
115    {
116  0 return variant == null ? null : variant.getFeatureGroup();
117    }
118   
119    /**
120    * toString for aid in the debugger only
121    */
 
122  0 toggle @Override
123    public String toString()
124    {
125  0 return base + ":" + (variant == null ? "" : variant.getDescription());
126    }
127    }
128   
129    /**
130    * given an existing alignment, create a new alignment including all, or up to
131    * flankSize additional symbols from each sequence's dataset sequence
132    *
133    * @param core
134    * @param flankSize
135    * @return AlignmentI
136    */
 
137  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
138    {
139  27 List<SequenceI> sq = new ArrayList<>();
140  27 int maxoffset = 0;
141  27 for (SequenceI s : core.getSequences())
142    {
143  131 SequenceI newSeq = s.deriveSequence();
144  131 final int newSeqStart = newSeq.getStart() - 1;
145  131 if (newSeqStart > maxoffset
146    && newSeq.getDatasetSequence().getStart() < s.getStart())
147    {
148  131 maxoffset = newSeqStart;
149    }
150  131 sq.add(newSeq);
151    }
152  27 if (flankSize > -1)
153    {
154  25 maxoffset = Math.min(maxoffset, flankSize);
155    }
156   
157    /*
158    * now add offset left and right to create an expanded alignment
159    */
160  27 for (SequenceI s : sq)
161    {
162  131 SequenceI ds = s;
163  262 while (ds.getDatasetSequence() != null)
164    {
165  131 ds = ds.getDatasetSequence();
166    }
167  131 int s_end = s.findPosition(s.getStart() + s.getLength());
168    // find available flanking residues for sequence
169  131 int ustream_ds = s.getStart() - ds.getStart();
170  131 int dstream_ds = ds.getEnd() - s_end;
171   
172    // build new flanked sequence
173   
174    // compute gap padding to start of flanking sequence
175  131 int offset = maxoffset - ustream_ds;
176   
177    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
178  131 if (flankSize >= 0)
179    {
180  125 if (flankSize < ustream_ds)
181    {
182    // take up to flankSize residues
183  40 offset = maxoffset - flankSize;
184  40 ustream_ds = flankSize;
185    }
186  125 if (flankSize <= dstream_ds)
187    {
188  116 dstream_ds = flankSize - 1;
189    }
190    }
191    // TODO use Character.toLowerCase to avoid creating String objects?
192  131 char[] upstream = new String(ds
193    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
194    .toLowerCase(Locale.ROOT).toCharArray();
195  131 char[] downstream = new String(
196    ds.getSequence(s_end - 1, s_end + dstream_ds))
197    .toLowerCase(Locale.ROOT).toCharArray();
198  131 char[] coreseq = s.getSequence();
199  131 char[] nseq = new char[offset + upstream.length + downstream.length
200    + coreseq.length];
201  131 char c = core.getGapCharacter();
202   
203  131 int p = 0;
204  461 for (; p < offset; p++)
205    {
206  330 nseq[p] = c;
207    }
208   
209  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
210  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
211    coreseq.length);
212  131 System.arraycopy(downstream, 0, nseq,
213    p + coreseq.length + upstream.length, downstream.length);
214  131 s.setSequence(new String(nseq));
215  131 s.setStart(s.getStart() - ustream_ds);
216  131 s.setEnd(s_end + downstream.length);
217    }
218  27 AlignmentI newAl = new jalview.datamodel.Alignment(
219    sq.toArray(new SequenceI[0]));
220  27 for (SequenceI s : sq)
221    {
222  131 if (s.getAnnotation() != null)
223    {
224  1 for (AlignmentAnnotation aa : s.getAnnotation())
225    {
226  1 aa.adjustForAlignment(); // JAL-1712 fix
227  1 newAl.addAnnotation(aa);
228    }
229    }
230    }
231  27 newAl.setDataset(core.getDataset());
232  27 return newAl;
233    }
234   
235    /**
236    * Returns the index (zero-based position) of a sequence in an alignment, or
237    * -1 if not found.
238    *
239    * @param al
240    * @param seq
241    * @return
242    */
 
243  59593 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
244    {
245  59593 int result = -1;
246  59593 int pos = 0;
247  59593 for (SequenceI alSeq : al.getSequences())
248    {
249  126234233 if (alSeq == seq)
250    {
251  59535 result = pos;
252  59535 break;
253    }
254  126174698 pos++;
255    }
256  59593 return result;
257    }
258   
259    /**
260    * Returns a map of lists of sequences in the alignment, keyed by sequence
261    * name. For use in mapping between different alignment views of the same
262    * sequences.
263    *
264    * @see jalview.datamodel.AlignmentI#getSequencesByName()
265    */
 
266  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
267    AlignmentI al)
268    {
269  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
270  1 for (SequenceI seq : al.getSequences())
271    {
272  3 String name = seq.getName();
273  3 if (name != null)
274    {
275  3 List<SequenceI> seqs = theMap.get(name);
276  3 if (seqs == null)
277    {
278  2 seqs = new ArrayList<>();
279  2 theMap.put(name, seqs);
280    }
281  3 seqs.add(seq);
282    }
283    }
284  1 return theMap;
285    }
286   
287    /**
288    * Build mapping of protein to cDNA alignment. Mappings are made between
289    * sequences where the cDNA translates to the protein sequence. Any new
290    * mappings are added to the protein alignment. Returns true if any mappings
291    * either already exist or were added, else false.
292    *
293    * @param proteinAlignment
294    * @param cdnaAlignment
295    * @return
296    */
 
297  4 toggle public static boolean mapProteinAlignmentToCdna(
298    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
299    {
300  4 if (proteinAlignment == null || cdnaAlignment == null)
301    {
302  0 return false;
303    }
304   
305  4 Set<SequenceI> mappedDna = new HashSet<>();
306  4 Set<SequenceI> mappedProtein = new HashSet<>();
307   
308    /*
309    * First pass - map sequences where cross-references exist. This include
310    * 1-to-many mappings to support, for example, variant cDNA.
311    */
312  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
313    cdnaAlignment, mappedDna, mappedProtein, true);
314   
315    /*
316    * Second pass - map sequences where no cross-references exist. This only
317    * does 1-to-1 mappings and assumes corresponding sequences are in the same
318    * order in the alignments.
319    */
320  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
321    mappedDna, mappedProtein, false);
322  4 return mappingPerformed;
323    }
324   
325    /**
326    * Make mappings between compatible sequences (where the cDNA translation
327    * matches the protein).
328    *
329    * @param proteinAlignment
330    * @param cdnaAlignment
331    * @param mappedDna
332    * a set of mapped DNA sequences (to add to)
333    * @param mappedProtein
334    * a set of mapped Protein sequences (to add to)
335    * @param xrefsOnly
336    * if true, only map sequences where xrefs exist
337    * @return
338    */
 
339  8 toggle protected static boolean mapProteinToCdna(
340    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
341    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
342    boolean xrefsOnly)
343    {
344  8 boolean mappingExistsOrAdded = false;
345  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
346  8 for (SequenceI aaSeq : thisSeqs)
347    {
348  22 boolean proteinMapped = false;
349  22 AlignedCodonFrame acf = new AlignedCodonFrame();
350   
351  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
352    {
353    /*
354    * Always try to map if sequences have xref to each other; this supports
355    * variant cDNA or alternative splicing for a protein sequence.
356    *
357    * If no xrefs, try to map progressively, assuming that alignments have
358    * mappable sequences in corresponding order. These are not
359    * many-to-many, as that would risk mixing species with similar cDNA
360    * sequences.
361    */
362  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
363    {
364  39 continue;
365    }
366   
367    /*
368    * Don't map non-xrefd sequences more than once each. This heuristic
369    * allows us to pair up similar sequences in ordered alignments.
370    */
371  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
372    || mappedDna.contains(cdnaSeq)))
373    {
374  29 continue;
375    }
376  18 if (mappingExists(proteinAlignment.getCodonFrames(),
377    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
378    {
379  0 mappingExistsOrAdded = true;
380    }
381    else
382    {
383  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
384  18 if (map != null)
385    {
386  12 acf.addMap(cdnaSeq, aaSeq, map);
387  12 mappingExistsOrAdded = true;
388  12 proteinMapped = true;
389  12 mappedDna.add(cdnaSeq);
390  12 mappedProtein.add(aaSeq);
391    }
392    }
393    }
394  22 if (proteinMapped)
395    {
396  11 proteinAlignment.addCodonFrame(acf);
397    }
398    }
399  8 return mappingExistsOrAdded;
400    }
401   
402    /**
403    * Answers true if the mappings include one between the given (dataset)
404    * sequences.
405    */
 
406  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
407    SequenceI aaSeq, SequenceI cdnaSeq)
408    {
409  18 if (mappings != null)
410    {
411  18 for (AlignedCodonFrame acf : mappings)
412    {
413  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
414    {
415  0 return true;
416    }
417    }
418    }
419  18 return false;
420    }
421   
422    /**
423    * Builds a mapping (if possible) of a cDNA to a protein sequence.
424    * <ul>
425    * <li>first checks if the cdna translates exactly to the protein
426    * sequence</li>
427    * <li>else checks for translation after removing a STOP codon</li>
428    * <li>else checks for translation after removing a START codon</li>
429    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
430    * </ul>
431    * Returns null if no mapping is determined.
432    *
433    * @param proteinSeq
434    * the aligned protein sequence
435    * @param cdnaSeq
436    * the aligned cdna sequence
437    * @return
438    */
 
439  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
440    SequenceI cdnaSeq)
441    {
442    /*
443    * Here we handle either dataset sequence set (desktop) or absent (applet).
444    * Use only the char[] form of the sequence to avoid creating possibly large
445    * String objects.
446    */
447  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
448  31 char[] aaSeqChars = proteinDataset != null
449    ? proteinDataset.getSequence()
450    : proteinSeq.getSequence();
451  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
452  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
453    : cdnaSeq.getSequence();
454  31 if (aaSeqChars == null || cdnaSeqChars == null)
455    {
456  0 return null;
457    }
458   
459    /*
460    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
461    */
462  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
463  31 int cdnaLength = cdnaSeqChars.length;
464  31 int cdnaStart = cdnaSeq.getStart();
465  31 int cdnaEnd = cdnaSeq.getEnd();
466  31 final int proteinStart = proteinSeq.getStart();
467  31 final int proteinEnd = proteinSeq.getEnd();
468   
469    /*
470    * If lengths don't match, try ignoring stop codon (if present)
471    */
472  31 if (cdnaLength != mappedLength && cdnaLength > 2)
473    {
474  16 String lastCodon = String.valueOf(cdnaSeqChars,
475    cdnaLength - CODON_LENGTH, CODON_LENGTH)
476    .toUpperCase(Locale.ROOT);
477  16 for (String stop : ResidueProperties.STOP_CODONS)
478    {
479  47 if (lastCodon.equals(stop))
480    {
481  2 cdnaEnd -= CODON_LENGTH;
482  2 cdnaLength -= CODON_LENGTH;
483  2 break;
484    }
485    }
486    }
487   
488    /*
489    * If lengths still don't match, try ignoring start codon.
490    */
491  31 int startOffset = 0;
492  31 if (cdnaLength != mappedLength && cdnaLength > 2
493    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
494    .toUpperCase(Locale.ROOT)
495    .equals(ResidueProperties.START))
496    {
497  5 startOffset += CODON_LENGTH;
498  5 cdnaStart += CODON_LENGTH;
499  5 cdnaLength -= CODON_LENGTH;
500    }
501   
502  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
503    {
504    /*
505    * protein is translation of dna (+/- start/stop codons)
506    */
507  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
508    new int[]
509    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
510  15 return map;
511    }
512   
513    /*
514    * translation failed - try mapping CDS annotated regions of dna
515    */
516  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
517    }
518   
519    /**
520    * Test whether the given cdna sequence, starting at the given offset,
521    * translates to the given amino acid sequence, using the standard translation
522    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
523    *
524    * @param cdnaSeqChars
525    * @param cdnaStart
526    * @param aaSeqChars
527    * @return
528    */
 
529  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
530    char[] aaSeqChars)
531    {
532  51 if (cdnaSeqChars == null || aaSeqChars == null)
533    {
534  3 return false;
535    }
536   
537  48 int aaPos = 0;
538  48 int dnaPos = cdnaStart;
539  161 for (; dnaPos < cdnaSeqChars.length - 2
540    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
541    {
542  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
543  130 final String translated = ResidueProperties.codonTranslate(codon);
544   
545    /*
546    * allow * in protein to match untranslatable in dna
547    */
548  130 final char aaRes = aaSeqChars[aaPos];
549  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
550    && aaRes == '*')
551    {
552  4 continue;
553    }
554  126 if (translated == null || !(aaRes == translated.charAt(0)))
555    {
556    // debug
557    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
558    // + ": "
559    // + codon + "(" + translated + ") != " + aaRes));
560  17 return false;
561    }
562    }
563   
564    /*
565    * check we matched all of the protein sequence
566    */
567  31 if (aaPos != aaSeqChars.length)
568    {
569  2 return false;
570    }
571   
572    /*
573    * check we matched all of the dna except
574    * for optional trailing STOP codon
575    */
576  29 if (dnaPos == cdnaSeqChars.length)
577    {
578  17 return true;
579    }
580  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
581    {
582  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
583  11 if (ResidueProperties.STOP
584    .equals(ResidueProperties.codonTranslate(codon)))
585    {
586  9 return true;
587    }
588    }
589  3 return false;
590    }
591   
592    /**
593    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
594    * currently assumes that we are aligning cDNA to match protein.
595    *
596    * @param seq
597    * the sequence to be realigned
598    * @param al
599    * the alignment whose sequence alignment is to be 'copied'
600    * @param gap
601    * character string represent a gap in the realigned sequence
602    * @param preserveUnmappedGaps
603    * @param preserveMappedGaps
604    * @return true if the sequence was realigned, false if it could not be
605    */
 
606  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
607    String gap, boolean preserveMappedGaps,
608    boolean preserveUnmappedGaps)
609    {
610    /*
611    * Get any mappings from the source alignment to the target (dataset)
612    * sequence.
613    */
614    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
615    // all mappings. Would it help to constrain this?
616  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
617  8 if (mappings == null || mappings.isEmpty())
618    {
619  0 return false;
620    }
621   
622    /*
623    * Locate the aligned source sequence whose dataset sequence is mapped. We
624    * just take the first match here (as we can't align like more than one
625    * sequence).
626    */
627  8 SequenceI alignFrom = null;
628  8 AlignedCodonFrame mapping = null;
629  8 for (AlignedCodonFrame mp : mappings)
630    {
631  8 alignFrom = mp.findAlignedSequence(seq, al);
632  8 if (alignFrom != null)
633    {
634  4 mapping = mp;
635  4 break;
636    }
637    }
638   
639  8 if (alignFrom == null)
640    {
641  4 return false;
642    }
643  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
644    preserveMappedGaps, preserveUnmappedGaps);
645  4 return true;
646    }
647   
648    /**
649    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
650    * match residues and codons. Flags control whether existing gaps in unmapped
651    * (intron) and mapped (exon) regions are preserved or not. Gaps between
652    * intron and exon are only retained if both flags are set.
653    *
654    * @param alignTo
655    * @param alignFrom
656    * @param mapping
657    * @param myGap
658    * @param sourceGap
659    * @param preserveUnmappedGaps
660    * @param preserveMappedGaps
661    */
 
662  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
663    AlignedCodonFrame mapping, String myGap, char sourceGap,
664    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
665    {
666    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
667   
668    // aligned and dataset sequence positions, all base zero
669  19 int thisSeqPos = 0;
670  19 int sourceDsPos = 0;
671   
672  19 int basesWritten = 0;
673  19 char myGapChar = myGap.charAt(0);
674  19 int ratio = myGap.length();
675   
676  19 int fromOffset = alignFrom.getStart() - 1;
677  19 int toOffset = alignTo.getStart() - 1;
678  19 int sourceGapMappedLength = 0;
679  19 boolean inExon = false;
680  19 final int toLength = alignTo.getLength();
681  19 final int fromLength = alignFrom.getLength();
682  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
683   
684    /*
685    * Traverse the 'model' aligned sequence
686    */
687  205 for (int i = 0; i < fromLength; i++)
688    {
689  186 char sourceChar = alignFrom.getCharAt(i);
690  186 if (sourceChar == sourceGap)
691    {
692  44 sourceGapMappedLength += ratio;
693  44 continue;
694    }
695   
696    /*
697    * Found a non-gap character. Locate its mapped region if any.
698    */
699  142 sourceDsPos++;
700    // Note mapping positions are base 1, our sequence positions base 0
701  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
702    sourceDsPos + fromOffset);
703  142 if (mappedPos == null)
704    {
705    /*
706    * unmapped position; treat like a gap
707    */
708  94 sourceGapMappedLength += ratio;
709    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
710    // residue "
711    // + sourceDsPos + "(" + sourceChar + ")");
712    // return;
713  94 continue;
714    }
715   
716  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
717  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
718  48 StringBuilder trailingCopiedGap = new StringBuilder();
719   
720    /*
721    * Copy dna sequence up to and including this codon. Optionally, include
722    * gaps before the codon starts (in introns) and/or after the codon starts
723    * (in exons).
724    *
725    * Note this only works for 'linear' splicing, not reverse or interleaved.
726    * But then 'align dna as protein' doesn't make much sense otherwise.
727    */
728  48 int intronLength = 0;
729  294 while (basesWritten + toOffset < mappedCodonEnd
730    && thisSeqPos < toLength)
731    {
732  246 final char c = alignTo.getCharAt(thisSeqPos++);
733  246 if (c != myGapChar)
734    {
735  146 basesWritten++;
736  146 int sourcePosition = basesWritten + toOffset;
737  146 if (sourcePosition < mappedCodonStart)
738    {
739    /*
740    * Found an unmapped (intron) base. First add in any preceding gaps
741    * (if wanted).
742    */
743  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
744    {
745  17 thisAligned.append(trailingCopiedGap.toString());
746  17 intronLength += trailingCopiedGap.length();
747  17 trailingCopiedGap = new StringBuilder();
748    }
749  48 intronLength++;
750  48 inExon = false;
751    }
752    else
753    {
754  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
755  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
756    preserveUnmappedGaps, sourceGapMappedLength, inExon,
757    trailingCopiedGap.length(), intronLength, startOfCodon);
758  215 for (int k = 0; k < gapsToAdd; k++)
759    {
760  117 thisAligned.append(myGapChar);
761    }
762  98 sourceGapMappedLength = 0;
763  98 inExon = true;
764    }
765  146 thisAligned.append(c);
766  146 trailingCopiedGap = new StringBuilder();
767    }
768    else
769    {
770  100 if (inExon && preserveMappedGaps)
771    {
772  32 trailingCopiedGap.append(myGapChar);
773    }
774  68 else if (!inExon && preserveUnmappedGaps)
775    {
776  27 trailingCopiedGap.append(myGapChar);
777    }
778    }
779    }
780    }
781   
782    /*
783    * At end of model aligned sequence. Copy any remaining target sequence, optionally
784    * including (intron) gaps.
785    */
786  129 while (thisSeqPos < toLength)
787    {
788  110 final char c = alignTo.getCharAt(thisSeqPos++);
789  110 if (c != myGapChar || preserveUnmappedGaps)
790    {
791  102 thisAligned.append(c);
792    }
793  110 sourceGapMappedLength--;
794    }
795   
796    /*
797    * finally add gaps to pad for any trailing source gaps or
798    * unmapped characters
799    */
800  19 if (preserveUnmappedGaps)
801    {
802  24 while (sourceGapMappedLength > 0)
803    {
804  12 thisAligned.append(myGapChar);
805  12 sourceGapMappedLength--;
806    }
807    }
808   
809    /*
810    * All done aligning, set the aligned sequence.
811    */
812  19 alignTo.setSequence(new String(thisAligned));
813    }
814   
815    /**
816    * Helper method to work out how many gaps to insert when realigning.
817    *
818    * @param preserveMappedGaps
819    * @param preserveUnmappedGaps
820    * @param sourceGapMappedLength
821    * @param inExon
822    * @param trailingCopiedGap
823    * @param intronLength
824    * @param startOfCodon
825    * @return
826    */
 
827  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
828    boolean preserveUnmappedGaps, int sourceGapMappedLength,
829    boolean inExon, int trailingGapLength, int intronLength,
830    final boolean startOfCodon)
831    {
832  98 int gapsToAdd = 0;
833  98 if (startOfCodon)
834    {
835    /*
836    * Reached start of codon. Ignore trailing gaps in intron unless we are
837    * preserving gaps in both exon and intron. Ignore them anyway if the
838    * protein alignment introduces a gap at least as large as the intronic
839    * region.
840    */
841  40 if (inExon && !preserveMappedGaps)
842    {
843  4 trailingGapLength = 0;
844    }
845  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
846    {
847  19 trailingGapLength = 0;
848    }
849  40 if (inExon)
850    {
851  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
852    }
853    else
854    {
855  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
856    {
857  20 gapsToAdd = sourceGapMappedLength - intronLength;
858    }
859    else
860    {
861  6 gapsToAdd = Math.min(
862    intronLength + trailingGapLength - sourceGapMappedLength,
863    trailingGapLength);
864    }
865    }
866    }
867    else
868    {
869    /*
870    * second or third base of codon; check for any gaps in dna
871    */
872  58 if (!preserveMappedGaps)
873    {
874  32 trailingGapLength = 0;
875    }
876  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
877    }
878  98 return gapsToAdd;
879    }
880   
881    /**
882    * Realigns the given protein to match the alignment of the dna, using codon
883    * mappings to translate aligned codon positions to protein residues.
884    *
885    * @param protein
886    * the alignment whose sequences are realigned by this method
887    * @param dna
888    * the dna alignment whose alignment we are 'copying'
889    * @return the number of sequences that were realigned
890    */
 
891  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
892    {
893  5 if (protein.isNucleotide() || !dna.isNucleotide())
894    {
895  0 jalview.bin.Console
896    .errPrintln("Wrong alignment type in alignProteinAsDna");
897  0 return 0;
898    }
899  5 List<SequenceI> unmappedProtein = new ArrayList<>();
900  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
901    protein, dna, unmappedProtein);
902  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
903    }
904   
905    /**
906    * Realigns the given dna to match the alignment of the protein, using codon
907    * mappings to translate aligned peptide positions to codons.
908    *
909    * Always produces a padded CDS alignment.
910    *
911    * @param dna
912    * the alignment whose sequences are realigned by this method
913    * @param protein
914    * the protein alignment whose alignment we are 'copying'
915    * @return the number of sequences that were realigned
916    */
 
917  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
918    {
919  4 if (protein.isNucleotide() || !dna.isNucleotide())
920    {
921  0 jalview.bin.Console
922    .errPrintln("Wrong alignment type in alignProteinAsDna");
923  0 return 0;
924    }
925    // todo: implement this
926  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
927  4 int alignedCount = 0;
928  4 int width = 0; // alignment width for padding CDS
929  4 for (SequenceI dnaSeq : dna.getSequences())
930    {
931  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
932    dna.getGapCharacter()))
933    {
934  5 alignedCount++;
935    }
936  5 width = Math.max(dnaSeq.getLength(), width);
937    }
938  4 int oldwidth;
939  4 int diff;
940  4 for (SequenceI dnaSeq : dna.getSequences())
941    {
942  5 oldwidth = dnaSeq.getLength();
943  5 diff = width - oldwidth;
944  5 if (diff > 0)
945    {
946  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
947    }
948    }
949  4 return alignedCount;
950    }
951   
952    /**
953    * Helper method to align (if possible) the dna sequence to match the
954    * alignment of a mapped protein sequence. This is currently limited to
955    * handling coding sequence only.
956    *
957    * @param cdsSeq
958    * @param protein
959    * @param mappings
960    * @param gapChar
961    * @return
962    */
 
963  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
964    AlignmentI protein, List<AlignedCodonFrame> mappings,
965    char gapChar)
966    {
967  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
968  5 if (cdsDss == null)
969    {
970  0 System.err
971    .println("alignCdsSequenceAsProtein needs aligned sequence!");
972  0 return false;
973    }
974   
975  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
976    .findMappingsForSequence(cdsSeq, mappings);
977  5 for (AlignedCodonFrame mapping : dnaMappings)
978    {
979  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
980  5 if (peptide != null)
981    {
982  5 final int peptideLength = peptide.getLength();
983  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
984  5 if (map != null)
985    {
986  5 MapList mapList = map.getMap();
987  5 if (map.getTo() == peptide.getDatasetSequence())
988    {
989  5 mapList = mapList.getInverse();
990    }
991  5 final int cdsLength = cdsDss.getLength();
992  5 int mappedFromLength = MappingUtils
993    .getLength(mapList.getFromRanges());
994  5 int mappedToLength = MappingUtils
995    .getLength(mapList.getToRanges());
996  5 boolean addStopCodon = (cdsLength == mappedFromLength
997    * CODON_LENGTH + CODON_LENGTH)
998    || (peptide.getDatasetSequence()
999    .getLength() == mappedFromLength - 1);
1000  5 if (cdsLength != mappedToLength && !addStopCodon)
1001    {
1002  0 jalview.bin.Console.errPrintln(String.format(
1003    "Can't align cds as protein (length mismatch %d/%d): %s",
1004    cdsLength, mappedToLength, cdsSeq.getName()));
1005    }
1006   
1007    /*
1008    * pre-fill the aligned cds sequence with gaps
1009    */
1010  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1011  5 + (addStopCodon ? CODON_LENGTH : 0)];
1012  5 Arrays.fill(alignedCds, gapChar);
1013   
1014    /*
1015    * walk over the aligned peptide sequence and insert mapped
1016    * codons for residues in the aligned cds sequence
1017    */
1018  5 int copiedBases = 0;
1019  5 int cdsStart = cdsDss.getStart();
1020  5 int proteinPos = peptide.getStart() - 1;
1021  5 int cdsCol = 0;
1022   
1023  33 for (int col = 0; col < peptideLength; col++)
1024    {
1025  28 char residue = peptide.getCharAt(col);
1026   
1027  28 if (Comparison.isGap(residue))
1028    {
1029  13 cdsCol += CODON_LENGTH;
1030    }
1031    else
1032    {
1033  15 proteinPos++;
1034  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1035  15 if (codon == null)
1036    {
1037    // e.g. incomplete start codon, X in peptide
1038  0 cdsCol += CODON_LENGTH;
1039    }
1040    else
1041    {
1042  60 for (int j = codon[0]; j <= codon[1]; j++)
1043    {
1044  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1045  45 alignedCds[cdsCol++] = mappedBase;
1046  45 copiedBases++;
1047    }
1048    }
1049    }
1050    }
1051   
1052    /*
1053    * append stop codon if not mapped from protein,
1054    * closing it up to the end of the mapped sequence
1055    */
1056  5 if (copiedBases == cdsLength - CODON_LENGTH)
1057    {
1058  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1059    {
1060  0 if (!Comparison.isGap(alignedCds[i]))
1061    {
1062  0 cdsCol = i + 1; // gap just after end of sequence
1063  0 break;
1064    }
1065    }
1066  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1067    {
1068  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1069    }
1070    }
1071  5 cdsSeq.setSequence(new String(alignedCds));
1072  5 return true;
1073    }
1074    }
1075    }
1076  0 return false;
1077    }
1078   
1079    /**
1080    * Builds a map whose key is an aligned codon position (3 alignment column
1081    * numbers base 0), and whose value is a map from protein sequence to each
1082    * protein's peptide residue for that codon. The map generates an ordering of
1083    * the codons, and allows us to read off the peptides at each position in
1084    * order to assemble 'aligned' protein sequences.
1085    *
1086    * @param protein
1087    * the protein alignment
1088    * @param dna
1089    * the coding dna alignment
1090    * @param unmappedProtein
1091    * any unmapped proteins are added to this list
1092    * @return
1093    */
 
1094  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1095    AlignmentI protein, AlignmentI dna,
1096    List<SequenceI> unmappedProtein)
1097    {
1098    /*
1099    * maintain a list of any proteins with no mappings - these will be
1100    * rendered 'as is' in the protein alignment as we can't align them
1101    */
1102  5 unmappedProtein.addAll(protein.getSequences());
1103   
1104  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1105   
1106    /*
1107    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1108    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1109    * comparator keeps the codon positions ordered.
1110    */
1111  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1112    new CodonComparator());
1113   
1114  5 for (SequenceI dnaSeq : dna.getSequences())
1115    {
1116  30 for (AlignedCodonFrame mapping : mappings)
1117    {
1118  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1119  516 if (prot != null)
1120    {
1121  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1122  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1123    alignedCodons);
1124  32 unmappedProtein.remove(prot);
1125    }
1126    }
1127    }
1128   
1129    /*
1130    * Finally add any unmapped peptide start residues (e.g. for incomplete
1131    * codons) as if at the codon position before the second residue
1132    */
1133    // TODO resolve JAL-2022 so this fudge can be removed
1134  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1135  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1136   
1137  5 return alignedCodons;
1138    }
1139   
1140    /**
1141    * Scans for any protein mapped from position 2 (meaning unmapped start
1142    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1143    * preceding position in the alignment
1144    *
1145    * @param alignedCodons
1146    * the codon-to-peptide map
1147    * @param mappedSequenceCount
1148    * the number of distinct sequences in the map
1149    */
 
1150  5 toggle protected static void addUnmappedPeptideStarts(
1151    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1152    int mappedSequenceCount)
1153    {
1154    // TODO delete this ugly hack once JAL-2022 is resolved
1155    // i.e. we can model startPhase > 0 (incomplete start codon)
1156   
1157  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1158  5 AlignedCodon lastCodon = null;
1159  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1160   
1161  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1162    .entrySet())
1163    {
1164  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1165    .entrySet())
1166    {
1167  10669 SequenceI seq = sequenceCodon.getKey();
1168  10669 if (sequencesChecked.contains(seq))
1169    {
1170  10639 continue;
1171    }
1172  30 sequencesChecked.add(seq);
1173  30 AlignedCodon codon = sequenceCodon.getValue();
1174  30 if (codon.peptideCol > 1)
1175    {
1176  0 jalview.bin.Console.errPrintln(
1177    "Problem mapping protein with >1 unmapped start positions: "
1178    + seq.getName());
1179    }
1180  30 else if (codon.peptideCol == 1)
1181    {
1182    /*
1183    * first position (peptideCol == 0) was unmapped - add it
1184    */
1185  6 if (lastCodon != null)
1186    {
1187  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1188    lastCodon.pos2, lastCodon.pos3,
1189    String.valueOf(seq.getCharAt(0)), 0);
1190  5 toAdd.put(seq, firstPeptide);
1191    }
1192    else
1193    {
1194    /*
1195    * unmapped residue at start of alignment (no prior column) -
1196    * 'insert' at nominal codon [0, 0, 0]
1197    */
1198  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1199    String.valueOf(seq.getCharAt(0)), 0);
1200  1 toAdd.put(seq, firstPeptide);
1201    }
1202    }
1203  30 if (sequencesChecked.size() == mappedSequenceCount)
1204    {
1205    // no need to check past first mapped position in all sequences
1206  5 break;
1207    }
1208    }
1209  1913 lastCodon = entry.getKey();
1210    }
1211   
1212    /*
1213    * add any new codons safely after iterating over the map
1214    */
1215  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1216    {
1217  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1218    startCodon.getKey());
1219    }
1220    }
1221   
1222    /**
1223    * Update the aligned protein sequences to match the codon alignments given in
1224    * the map.
1225    *
1226    * @param protein
1227    * @param alignedCodons
1228    * an ordered map of codon positions (columns), with sequence/peptide
1229    * values present in each column
1230    * @param unmappedProtein
1231    * @return
1232    */
 
1233  5 toggle protected static int alignProteinAs(AlignmentI protein,
1234    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1235    List<SequenceI> unmappedProtein)
1236    {
1237    /*
1238    * prefill peptide sequences with gaps
1239    */
1240  5 int alignedWidth = alignedCodons.size();
1241  5 char[] gaps = new char[alignedWidth];
1242  5 Arrays.fill(gaps, protein.getGapCharacter());
1243  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1244  5 for (SequenceI seq : protein.getSequences())
1245    {
1246  31 if (!unmappedProtein.contains(seq))
1247    {
1248  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1249    }
1250    }
1251   
1252    /*
1253    * Traverse the codons left to right (as defined by CodonComparator)
1254    * and insert peptides in each column where the sequence is mapped.
1255    * This gives a peptide 'alignment' where residues are aligned if their
1256    * corresponding codons occupy the same columns in the cdna alignment.
1257    */
1258  5 int column = 0;
1259  5 for (AlignedCodon codon : alignedCodons.keySet())
1260    {
1261  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1262    .get(codon);
1263  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1264    {
1265  10682 char residue = entry.getValue().product.charAt(0);
1266  10682 peptides.get(entry.getKey())[column] = residue;
1267    }
1268  1914 column++;
1269    }
1270   
1271    /*
1272    * and finally set the constructed sequences
1273    */
1274  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1275    {
1276  30 entry.getKey().setSequence(new String(entry.getValue()));
1277    }
1278   
1279  5 return 0;
1280    }
1281   
1282    /**
1283    * Populate the map of aligned codons by traversing the given sequence
1284    * mapping, locating the aligned positions of mapped codons, and adding those
1285    * positions and their translation products to the map.
1286    *
1287    * @param dna
1288    * the aligned sequence we are mapping from
1289    * @param protein
1290    * the sequence to be aligned to the codons
1291    * @param gapChar
1292    * the gap character in the dna sequence
1293    * @param seqMap
1294    * a mapping to a sequence translation
1295    * @param alignedCodons
1296    * the map we are building up
1297    */
 
1298  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1299    char gapChar, Mapping seqMap,
1300    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1301    {
1302  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1303   
1304    /*
1305    * add codon positions, and their peptide translations, to the alignment
1306    * map, while remembering the first codon mapped
1307    */
1308  10716 while (codons.hasNext())
1309    {
1310  10684 try
1311    {
1312  10684 AlignedCodon codon = codons.next();
1313  10684 addCodonToMap(alignedCodons, codon, protein);
1314    } catch (IncompleteCodonException e)
1315    {
1316    // possible incomplete trailing codon - ignore
1317    } catch (NoSuchElementException e)
1318    {
1319    // possibly peptide lacking STOP
1320    }
1321    }
1322    }
1323   
1324    /**
1325    * Helper method to add a codon-to-peptide entry to the aligned codons map
1326    *
1327    * @param alignedCodons
1328    * @param codon
1329    * @param protein
1330    */
 
1331  10690 toggle protected static void addCodonToMap(
1332    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1333    AlignedCodon codon, SequenceI protein)
1334    {
1335  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1336  10690 if (seqProduct == null)
1337    {
1338  1914 seqProduct = new HashMap<>();
1339  1914 alignedCodons.put(codon, seqProduct);
1340    }
1341  10690 seqProduct.put(protein, codon);
1342    }
1343   
1344    /**
1345    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1346    * between at least one pair of sequences in the two alignments. Currently,
1347    * the logic is:
1348    * <ul>
1349    * <li>One alignment must be nucleotide, and the other protein</li>
1350    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1351    * <li>Mappable means the nucleotide translation matches the protein
1352    * sequence</li>
1353    * <li>The translation may ignore start and stop codons if present in the
1354    * nucleotide</li>
1355    * </ul>
1356    *
1357    * @param al1
1358    * @param al2
1359    * @return
1360    */
 
1361  28 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1362    {
1363  28 if (al1 == null || al2 == null)
1364    {
1365  3 return false;
1366    }
1367   
1368    /*
1369    * Require one nucleotide and one protein
1370    */
1371  25 if (al1.isNucleotide() == al2.isNucleotide())
1372    {
1373  14 return false;
1374    }
1375  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1376  11 AlignmentI protein = dna == al1 ? al2 : al1;
1377  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1378  11 for (SequenceI dnaSeq : dna.getSequences())
1379    {
1380  12 for (SequenceI proteinSeq : protein.getSequences())
1381    {
1382  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1383    {
1384  2 return true;
1385    }
1386    }
1387    }
1388  9 return false;
1389    }
1390   
1391    /**
1392    * Returns true if the dna sequence is mapped, or could be mapped, to the
1393    * protein sequence.
1394    *
1395    * @param dnaSeq
1396    * @param proteinSeq
1397    * @param mappings
1398    * @return
1399    */
 
1400  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1401    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1402    {
1403  12 if (dnaSeq == null || proteinSeq == null)
1404    {
1405  0 return false;
1406    }
1407   
1408  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1409    : dnaSeq.getDatasetSequence();
1410  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1411    ? proteinSeq
1412    : proteinSeq.getDatasetSequence();
1413   
1414  12 for (AlignedCodonFrame mapping : mappings)
1415    {
1416  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1417    {
1418    /*
1419    * already mapped
1420    */
1421  0 return true;
1422    }
1423    }
1424   
1425    /*
1426    * Just try to make a mapping (it is not yet stored), test whether
1427    * successful.
1428    */
1429  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1430    }
1431   
1432    /**
1433    * Finds any reference annotations associated with the sequences in
1434    * sequenceScope, that are not already added to the alignment, and adds them
1435    * to the 'candidates' map. Also populates a lookup table of annotation
1436    * labels, keyed by calcId, for use in constructing tooltips or the like.
1437    *
1438    * @param sequenceScope
1439    * the sequences to scan for reference annotations
1440    * @param labelForCalcId
1441    * (optional) map to populate with label for calcId
1442    * @param candidates
1443    * map to populate with annotations for sequence
1444    * @param al
1445    * the alignment to check for presence of annotations
1446    */
 
1447  86 toggle public static void findAddableReferenceAnnotations(
1448    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1449    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1450    AlignmentI al)
1451    {
1452  86 if (sequenceScope == null)
1453    {
1454  1 return;
1455    }
1456   
1457    /*
1458    * For each sequence in scope, make a list of any annotations on the
1459    * underlying dataset sequence which are not already on the alignment.
1460    *
1461    * Add to a map of { alignmentSequence, <List of annotations to add> }
1462    */
1463  85 for (SequenceI seq : sequenceScope)
1464    {
1465  82 SequenceI dataset = seq.getDatasetSequence();
1466  82 if (dataset == null)
1467    {
1468  0 continue;
1469    }
1470  82 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1471  82 if (datasetAnnotations == null)
1472    {
1473  35 continue;
1474    }
1475  47 final List<AlignmentAnnotation> result = new ArrayList<>();
1476  47 for (AlignmentAnnotation dsann : datasetAnnotations)
1477    {
1478    /*
1479    * Find matching annotations on the alignment. If none is found, then
1480    * add this annotation to the list of 'addable' annotations for this
1481    * sequence.
1482    */
1483  155 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1484    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1485  155 boolean found = false;
1486  155 if (matchedAlignmentAnnotations != null)
1487    {
1488  152 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1489    {
1490  135 if (dsann.description.equals(matched.description))
1491    {
1492  54 found = true;
1493  54 break;
1494    }
1495    }
1496    }
1497  155 if (!found)
1498    {
1499  101 result.add(dsann);
1500  101 if (labelForCalcId != null)
1501    {
1502  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1503    }
1504    }
1505    }
1506    /*
1507    * Save any addable annotations for this sequence
1508    */
1509  47 if (!result.isEmpty())
1510    {
1511  43 candidates.put(seq, result);
1512    }
1513    }
1514    }
1515   
1516    /**
1517    * Adds annotations to the top of the alignment annotations, in the same order
1518    * as their related sequences. If you already have an annotation and want to
1519    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1520    *
1521    * @param annotations
1522    * the annotations to add
1523    * @param alignment
1524    * the alignment to add them to
1525    * @param selectionGroup
1526    * current selection group - may be null, if provided then any added
1527    * annotation will be trimmed to just those columns in the selection
1528    * group
1529    */
 
1530  40 toggle public static void addReferenceAnnotations(
1531    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1532    final AlignmentI alignment, final SequenceGroup selectionGroup)
1533    {
1534  40 for (SequenceI seq : annotations.keySet())
1535    {
1536  39 for (AlignmentAnnotation ann : annotations.get(seq))
1537    {
1538  95 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1539    }
1540    }
1541    }
1542   
 
1543  0 toggle public static boolean isSSAnnotationPresent(
1544    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1545    {
1546   
1547  0 for (SequenceI seq : annotations.keySet())
1548    {
1549  0 if (isSecondaryStructurePresent(
1550    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1551    {
1552  0 return true;
1553    }
1554    }
1555  0 return false;
1556    }
1557   
1558    /**
1559    * Make a copy of a reference annotation {@code ann} and add it to an
1560    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1561    * the extent of {@code selectionGroup}
1562    *
1563    * @param alignment
1564    * @param seq
1565    * @param ann
1566    * @param selectionGroup
1567    * current selection group - may be null, if provided then any added
1568    * annotation will be trimmed to just those columns in the selection
1569    * group
1570    * @return annotation added to {@code seq and {@code alignment}
1571    */
 
1572  99 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1573    final AlignmentI alignment, final SequenceI seq,
1574    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1575    {
1576  99 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1577  99 int startRes = 0;
1578  99 int endRes = ann.annotations.length;
1579  99 if (selectionGroup != null)
1580    {
1581  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1582    seq.findPosition(selectionGroup.getStartRes())));
1583  2 endRes = -1 + Math.min(seq.getEnd(),
1584    seq.findPosition(selectionGroup.getEndRes()));
1585   
1586    }
1587  99 copyAnn.restrict(startRes, endRes + 0);
1588   
1589    /*
1590    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1591    * original annotation is already on the sequence.
1592    */
1593  99 if (!seq.hasAnnotation(ann))
1594    {
1595  99 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1596  99 if (cm != null)
1597    {
1598  25 seq.addContactListFor(copyAnn, cm);
1599    }
1600  99 seq.addAlignmentAnnotation(copyAnn);
1601    }
1602    // adjust for gaps
1603  99 copyAnn.adjustForAlignment();
1604    // add to the alignment and set visible
1605  99 alignment.addAnnotation(copyAnn);
1606  99 copyAnn.visible = true;
1607   
1608  99 return copyAnn;
1609    }
1610   
1611    /**
1612    * Set visibility of alignment annotations of specified types (labels), for
1613    * specified sequences. This supports controls like "Show all secondary
1614    * structure", "Hide all Temp factor", etc.
1615    *
1616    * @al the alignment to scan for annotations
1617    * @param types
1618    * the types (labels) of annotations to be updated
1619    * @param forSequences
1620    * if not null, only annotations linked to one of these sequences are
1621    * in scope for update; if null, acts on all sequence annotations
1622    * @param anyType
1623    * if this flag is true, 'types' is ignored (label not checked)
1624    * @param doShow
1625    * if true, set visibility on, else set off
1626    */
 
1627  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1628    Collection<String> types, List<SequenceI> forSequences,
1629    boolean anyType, boolean doShow)
1630    {
1631  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1632  5 if (anns != null)
1633    {
1634  5 for (AlignmentAnnotation aa : anns)
1635    {
1636  30 if (anyType || types.contains(aa.label))
1637    {
1638  21 if ((aa.sequenceRef != null) && (forSequences == null
1639    || forSequences.contains(aa.sequenceRef)))
1640    {
1641  11 aa.visible = doShow;
1642    }
1643    }
1644    }
1645    }
1646    }
1647   
1648    /**
1649    * Shows or hides auto calculated annotations for a sequence group.
1650    *
1651    * @param al
1652    * The alignment object with the annotations.
1653    * @param type
1654    * The type of annotation to show or hide.
1655    * @param selectedGroup
1656    * The sequence group for which the annotations should be shown or
1657    * hidden.
1658    * @param anyType
1659    * If true, all types of annotations will be shown/hidden.
1660    * @param doShow
1661    * If true, the annotations will be shown; if false, annotations will
1662    * be hidden.
1663    */
 
1664  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1665    AlignmentI al, String type, SequenceGroup selectedGroup,
1666    boolean anyType, boolean doShow)
1667    {
1668    // Get all alignment annotations
1669  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1670   
1671  0 if (anns != null)
1672    {
1673  0 for (AlignmentAnnotation aa : anns)
1674    {
1675    // Check if anyType is true or if the annotation's label contains the
1676    // specified type (currently for secondary structure consensus)
1677  0 if ((anyType && aa.label
1678    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1679    || aa.label.startsWith(type))
1680    {
1681    // If the annotation's group reference is not null and matches the
1682    // selected group, update its visibility.
1683  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1684    {
1685  0 aa.visible = doShow;
1686    }
1687    }
1688    }
1689    }
1690    }
1691   
 
1692  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1693    AlignmentI al, int graphType)
1694    {
1695  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1696  0 if (anns != null)
1697    {
1698  0 for (AlignmentAnnotation aa : anns)
1699    {
1700  0 if (aa.sequenceRef != null && aa.graph == graphType)
1701  0 return aa;
1702    }
1703    }
1704  0 return null;
1705    }
1706   
1707    /**
1708    * Returns true if either sequence has a cross-reference to the other
1709    *
1710    * @param seq1
1711    * @param seq2
1712    * @return
1713    */
 
1714  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1715    {
1716    // Note: moved here from class CrossRef as the latter class has dependencies
1717    // not availability to the applet's classpath
1718  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1719    }
1720   
1721    /**
1722    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1723    * that sequence name is structured as Source|AccessionId.
1724    *
1725    * @param seq1
1726    * @param seq2
1727    * @return
1728    */
 
1729  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1730    {
1731  108 if (seq1 == null || seq2 == null)
1732    {
1733  8 return false;
1734    }
1735  100 String name = seq2.getName();
1736  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1737  100 if (xrefs != null)
1738    {
1739  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1740    {
1741  24 DBRefEntry xref = xrefs.get(ix);
1742  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1743    // case-insensitive test, consistent with DBRefEntry.equalRef()
1744  24 if (xrefName.equalsIgnoreCase(name))
1745    {
1746  12 return true;
1747    }
1748    }
1749    }
1750  88 return false;
1751    }
1752   
1753    /**
1754    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1755    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1756    * added to the original alignment's dataset, which is shared by the new
1757    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1758    * added to the alignment dataset.
1759    *
1760    * @param dna
1761    * aligned nucleotide (dna or cds) sequences
1762    * @param dataset
1763    * the alignment dataset the sequences belong to
1764    * @param products
1765    * (optional) to restrict results to CDS that map to specified
1766    * protein products
1767    * @return an alignment whose sequences are the cds-only parts of the dna
1768    * sequences (or null if no mappings are found)
1769    */
 
1770  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1771    AlignmentI dataset, SequenceI[] products)
1772    {
1773  6 if (dataset == null || dataset.getDataset() != null)
1774    {
1775  0 throw new IllegalArgumentException(
1776    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1777    }
1778  6 List<SequenceI> foundSeqs = new ArrayList<>();
1779  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1780  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1781  6 HashSet<SequenceI> productSeqs = null;
1782  6 if (products != null)
1783    {
1784  3 productSeqs = new HashSet<>();
1785  3 for (SequenceI seq : products)
1786    {
1787  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1788    : seq.getDatasetSequence());
1789    }
1790    }
1791   
1792    /*
1793    * Construct CDS sequences from mappings on the alignment dataset.
1794    * The logic is:
1795    * - find the protein product(s) mapped to from each dna sequence
1796    * - if the mapping covers the whole dna sequence (give or take start/stop
1797    * codon), take the dna as the CDS sequence
1798    * - else search dataset mappings for a suitable dna sequence, i.e. one
1799    * whose whole sequence is mapped to the protein
1800    * - if no sequence found, construct one from the dna sequence and mapping
1801    * (and add it to dataset so it is found if this is repeated)
1802    */
1803  6 for (SequenceI dnaSeq : dna)
1804    {
1805  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1806    : dnaSeq.getDatasetSequence();
1807   
1808  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1809    .findMappingsForSequence(dnaSeq, mappings);
1810  52 for (AlignedCodonFrame mapping : seqMappings)
1811    {
1812  42 List<Mapping> mappingsFromSequence = mapping
1813    .getMappingsFromSequence(dnaSeq);
1814   
1815  42 for (Mapping aMapping : mappingsFromSequence)
1816    {
1817  44 MapList mapList = aMapping.getMap();
1818  44 if (mapList.getFromRatio() == 1)
1819    {
1820    /*
1821    * not a dna-to-protein mapping (likely dna-to-cds)
1822    */
1823  11 continue;
1824    }
1825   
1826    /*
1827    * skip if mapping is not to one of the target set of proteins
1828    */
1829  33 SequenceI proteinProduct = aMapping.getTo();
1830  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1831    {
1832  2 continue;
1833    }
1834   
1835    /*
1836    * try to locate the CDS from the dataset mappings;
1837    * guard against duplicate results (for the case that protein has
1838    * dbrefs to both dna and cds sequences)
1839    */
1840  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1841    seqMappings, aMapping);
1842  31 if (cdsSeq != null)
1843    {
1844  11 if (!foundSeqs.contains(cdsSeq))
1845    {
1846  11 foundSeqs.add(cdsSeq);
1847  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1848  11 cdsSeqs.add(derivedSequence);
1849  11 if (!dataset.getSequences().contains(cdsSeq))
1850    {
1851  0 dataset.addSequence(cdsSeq);
1852    }
1853    }
1854  11 continue;
1855    }
1856   
1857    /*
1858    * didn't find mapped CDS sequence - construct it and add
1859    * its dataset sequence to the dataset
1860    */
1861  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1862    dataset).deriveSequence();
1863    // cdsSeq has a name constructed as CDS|<dbref>
1864    // <dbref> will be either the accession for the coding sequence,
1865    // marked in the /via/ dbref to the protein product accession
1866    // or it will be the original nucleotide accession.
1867  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1868   
1869  20 cdsSeqs.add(cdsSeq);
1870   
1871    /*
1872    * build the mapping from CDS to protein
1873    */
1874  20 List<int[]> cdsRange = Collections
1875    .singletonList(new int[]
1876    { cdsSeq.getStart(),
1877    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1878  20 MapList cdsToProteinMap = new MapList(cdsRange,
1879    mapList.getToRanges(), mapList.getFromRatio(),
1880    mapList.getToRatio());
1881   
1882  20 if (!dataset.getSequences().contains(cdsSeqDss))
1883    {
1884    /*
1885    * if this sequence is a newly created one, add it to the dataset
1886    * and made a CDS to protein mapping (if sequence already exists,
1887    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1888    */
1889  20 dataset.addSequence(cdsSeqDss);
1890  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1891  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1892    cdsToProteinMap);
1893   
1894    /*
1895    * guard against duplicating the mapping if repeating this action
1896    */
1897  20 if (!mappings.contains(cdsToProteinMapping))
1898    {
1899  20 mappings.add(cdsToProteinMapping);
1900    }
1901    }
1902   
1903  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1904    proteinProduct, aMapping);
1905    /*
1906    * add another mapping from original 'from' range to CDS
1907    */
1908  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1909  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1910    cdsRange, 1, 1);
1911  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1912    dnaToCdsMap);
1913  20 if (!mappings.contains(dnaToCdsMapping))
1914    {
1915  20 mappings.add(dnaToCdsMapping);
1916    }
1917   
1918    /*
1919    * transfer dna chromosomal loci (if known) to the CDS
1920    * sequence (via the mapping)
1921    */
1922  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1923  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1924   
1925    /*
1926    * add DBRef with mapping from protein to CDS
1927    * (this enables Get Cross-References from protein alignment)
1928    * This is tricky because we can't have two DBRefs with the
1929    * same source and accession, so need a different accession for
1930    * the CDS from the dna sequence
1931    */
1932   
1933    // specific use case:
1934    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1935    // ENSG02, ENSG03, with transcripts and products similarly named.
1936    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1937   
1938    // JBPNote: ?? can't actually create an example that demonstrates we
1939    // need to
1940    // synthesize an xref.
1941   
1942  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1943  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1944    {
1945  13 DBRefEntry primRef = primrefs.get(ip);
1946    /*
1947    * create a cross-reference from CDS to the source sequence's
1948    * primary reference and vice versa
1949    */
1950  13 String source = primRef.getSource();
1951  13 String version = primRef.getVersion();
1952  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1953    source + ":" + version, primRef.getAccessionId());
1954  13 cdsCrossRef
1955    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1956  13 cdsSeqDss.addDBRef(cdsCrossRef);
1957   
1958  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1959    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1960    // problem here is that the cross-reference is synthesized -
1961    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1962    // 'CDS|emblcdsacc'
1963    // assuming cds version same as dna ?!?
1964   
1965  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1966    cdsSeq.getName());
1967    //
1968  13 proteinToCdsRef.setMap(
1969    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1970  13 proteinProduct.addDBRef(proteinToCdsRef);
1971    }
1972    /*
1973    * transfer any features on dna that overlap the CDS
1974    */
1975  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1976    SequenceOntologyI.CDS);
1977    }
1978    }
1979    }
1980   
1981  6 AlignmentI cds = new Alignment(
1982    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
1983  6 cds.setDataset(dataset);
1984   
1985  6 return cds;
1986    }
1987   
1988    /**
1989    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1990    * toSeq, mediated by the given mapping between the sequences
1991    *
1992    * @param fromSeq
1993    * @param targetToFrom
1994    * Map
1995    * @param targetSeq
1996    */
 
1997  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
1998    MapList targetToFrom, SequenceI targetSeq)
1999    {
2000  23 if (targetSeq.getGeneLoci() != null)
2001    {
2002    // already have - don't override
2003  1 return;
2004    }
2005  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
2006  22 if (fromLoci == null)
2007    {
2008  10 return;
2009    }
2010   
2011  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2012   
2013  12 if (newMap != null)
2014    {
2015  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2016    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2017    }
2018    }
2019   
2020    /**
2021    * A helper method that finds a CDS sequence in the alignment dataset that is
2022    * mapped to the given protein sequence, and either is, or has a mapping from,
2023    * the given dna sequence.
2024    *
2025    * @param mappings
2026    * set of all mappings on the dataset
2027    * @param dnaSeq
2028    * a dna (or cds) sequence we are searching from
2029    * @param seqMappings
2030    * the set of mappings involving dnaSeq
2031    * @param aMapping
2032    * a transcript-to-peptide mapping
2033    * @return
2034    */
 
2035  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2036    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2037    Mapping aMapping)
2038    {
2039    /*
2040    * TODO a better dna-cds-protein mapping data representation to allow easy
2041    * navigation; until then this clunky looping around lists of mappings
2042    */
2043  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2044    : dnaSeq.getDatasetSequence();
2045  38 SequenceI proteinProduct = aMapping.getTo();
2046   
2047    /*
2048    * is this mapping from the whole dna sequence (i.e. CDS)?
2049    * allowing for possible stop codon on dna but not peptide
2050    */
2051  38 int mappedFromLength = MappingUtils
2052    .getLength(aMapping.getMap().getFromRanges());
2053  38 int dnaLength = seqDss.getLength();
2054  38 if (mappedFromLength == dnaLength
2055    || mappedFromLength == dnaLength - CODON_LENGTH)
2056    {
2057    /*
2058    * if sequence has CDS features, this is a transcript with no UTR
2059    * - do not take this as the CDS sequence! (JAL-2789)
2060    */
2061  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2062    .isEmpty())
2063    {
2064  1 return seqDss;
2065    }
2066    }
2067   
2068    /*
2069    * looks like we found the dna-to-protein mapping; search for the
2070    * corresponding cds-to-protein mapping
2071    */
2072  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2073    .findMappingsForSequence(proteinProduct, mappings);
2074  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2075    {
2076  52 for (SequenceToSequenceMapping map : acf.getMappings())
2077    {
2078  276 Mapping mapping = map.getMapping();
2079  276 if (mapping != aMapping
2080    && mapping.getMap().getFromRatio() == CODON_LENGTH
2081    && proteinProduct == mapping.getTo()
2082    && seqDss != map.getFromSeq())
2083    {
2084  15 mappedFromLength = MappingUtils
2085    .getLength(mapping.getMap().getFromRanges());
2086  15 if (mappedFromLength == map.getFromSeq().getLength())
2087    {
2088    /*
2089    * found a 3:1 mapping to the protein product which covers
2090    * the whole dna sequence i.e. is from CDS; finally check the CDS
2091    * is mapped from the given dna start sequence
2092    */
2093  15 SequenceI cdsSeq = map.getFromSeq();
2094    // todo this test is weak if seqMappings contains multiple mappings;
2095    // we get away with it if transcript:cds relationship is 1:1
2096  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2097    .findMappingsForSequence(cdsSeq, seqMappings);
2098  15 if (!dnaToCdsMaps.isEmpty())
2099    {
2100  13 return cdsSeq;
2101    }
2102    }
2103    }
2104    }
2105    }
2106  24 return null;
2107    }
2108   
2109    /**
2110    * Helper method that makes a CDS sequence as defined by the mappings from the
2111    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2112    * forward or reverse strand).
2113    *
2114    * @param seq
2115    * @param mapping
2116    * @param dataset
2117    * - existing dataset. We check for sequences that look like the CDS
2118    * we are about to construct, if one exists already, then we will
2119    * just return that one.
2120    * @return CDS sequence (as a dataset sequence)
2121    */
 
2122  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2123    AlignmentI dataset)
2124    {
2125    /*
2126    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2127    * if set (e.g. EMBL protein_id), else sequence name appended
2128    */
2129  20 String mapFromId = mapping.getMappedFromId();
2130  20 final String seqId = "CDS|"
2131  20 + (mapFromId != null ? mapFromId : seq.getName());
2132   
2133  20 SequenceI newSeq = null;
2134   
2135    /*
2136    * construct CDS sequence by splicing mapped from ranges
2137    */
2138  20 char[] seqChars = seq.getSequence();
2139  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2140  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2141  20 char[] newSeqChars = new char[cdsWidth];
2142   
2143  20 int newPos = 0;
2144  20 for (int[] range : fromRanges)
2145    {
2146  32 if (range[0] <= range[1])
2147    {
2148    // forward strand mapping - just copy the range
2149  32 int length = range[1] - range[0] + 1;
2150  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2151    length);
2152  32 newPos += length;
2153    }
2154    else
2155    {
2156    // reverse strand mapping - copy and complement one by one
2157  0 for (int i = range[0]; i >= range[1]; i--)
2158    {
2159  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2160    }
2161    }
2162   
2163  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2164    }
2165   
2166  20 if (dataset != null)
2167    {
2168  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2169  20 if (matches != null)
2170    {
2171  20 boolean matched = false;
2172  20 for (SequenceI mtch : matches)
2173    {
2174  3 if (mtch.getStart() != newSeq.getStart())
2175    {
2176  0 continue;
2177    }
2178  3 if (mtch.getEnd() != newSeq.getEnd())
2179    {
2180  0 continue;
2181    }
2182  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2183    {
2184  3 continue;
2185    }
2186  0 if (!matched)
2187    {
2188  0 matched = true;
2189  0 newSeq = mtch;
2190    }
2191    else
2192    {
2193  0 Console.error(
2194    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2195    + mtch.toString());
2196    }
2197    }
2198    }
2199    }
2200    // newSeq.setDescription(mapFromId);
2201   
2202  20 return newSeq;
2203    }
2204   
2205    /**
2206    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2207    * the given mapping.
2208    *
2209    * @param cdsSeq
2210    * @param contig
2211    * @param proteinProduct
2212    * @param mapping
2213    * @return list of DBRefEntrys added
2214    */
 
2215  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2216    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2217    {
2218   
2219    // gather direct refs from contig congruent with mapping
2220  20 List<DBRefEntry> direct = new ArrayList<>();
2221  20 HashSet<String> directSources = new HashSet<>();
2222   
2223  20 List<DBRefEntry> refs = contig.getDBRefs();
2224  20 if (refs != null)
2225    {
2226  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2227    {
2228  279 DBRefEntry dbr = refs.get(ib);
2229  279 MapList map;
2230  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2231    {
2232    // check if map is the CDS mapping
2233  24 if (mapping.getMap().equals(map))
2234    {
2235  24 direct.add(dbr);
2236  24 directSources.add(dbr.getSource());
2237    }
2238    }
2239    }
2240    }
2241  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2242    proteinProduct.getDBRefs(),
2243    directSources.toArray(new String[0]));
2244  20 List<DBRefEntry> propagated = new ArrayList<>();
2245   
2246    // and generate appropriate mappings
2247  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2248    {
2249  24 DBRefEntry cdsref = direct.get(ic);
2250  24 Mapping m = cdsref.getMap();
2251    // clone maplist and mapping
2252  24 MapList cdsposmap = new MapList(
2253    Arrays.asList(new int[][]
2254    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2255    m.getMap().getToRanges(), 3, 1);
2256  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2257   
2258    // create dbref
2259  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2260    cdsref.getVersion(), cdsref.getAccessionId(),
2261    new Mapping(cdsmap.getTo(), cdsposmap));
2262   
2263    // and see if we can map to the protein product for this mapping.
2264    // onSource is the filtered set of accessions on protein that we are
2265    // tranferring, so we assume accession is the same.
2266  24 if (cdsmap.getTo() == null && onSource != null)
2267    {
2268  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2269    cdsref.getAccessionId());
2270  2 if (sourceRefs != null)
2271    {
2272  2 for (DBRefEntry srcref : sourceRefs)
2273    {
2274  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2275    {
2276    // we have found a complementary dbref on the protein product, so
2277    // update mapping's getTo
2278  2 newref.getMap().setTo(proteinProduct);
2279    }
2280    }
2281    }
2282    }
2283  24 cdsSeq.addDBRef(newref);
2284  24 propagated.add(newref);
2285    }
2286  20 return propagated;
2287    }
2288   
2289    /**
2290    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2291    * feature start/end ranges, optionally omitting specified feature types.
2292    * Returns the number of features copied.
2293    *
2294    * @param fromSeq
2295    * @param toSeq
2296    * @param mapping
2297    * the mapping from 'fromSeq' to 'toSeq'
2298    * @param select
2299    * if not null, only features of this type are copied (including
2300    * subtypes in the Sequence Ontology)
2301    * @param omitting
2302    */
 
2303  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2304    MapList mapping, String select, String... omitting)
2305    {
2306  23 SequenceI copyTo = toSeq;
2307  43 while (copyTo.getDatasetSequence() != null)
2308    {
2309  20 copyTo = copyTo.getDatasetSequence();
2310    }
2311  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2312    {
2313  0 return 0; // shared dataset sequence
2314    }
2315   
2316    /*
2317    * get features, optionally restricted by an ontology term
2318    */
2319  23 List<SequenceFeature> sfs = select == null
2320    ? fromSeq.getFeatures().getPositionalFeatures()
2321    : fromSeq.getFeatures().getFeaturesByOntology(select);
2322   
2323  23 int count = 0;
2324  23 for (SequenceFeature sf : sfs)
2325    {
2326  9610 String type = sf.getType();
2327  9610 boolean omit = false;
2328  9610 for (String toOmit : omitting)
2329    {
2330  9603 if (type.equals(toOmit))
2331    {
2332  134 omit = true;
2333    }
2334    }
2335  9610 if (omit)
2336    {
2337  134 continue;
2338    }
2339   
2340    /*
2341    * locate the mapped range - null if either start or end is
2342    * not mapped (no partial overlaps are calculated)
2343    */
2344  9476 int start = sf.getBegin();
2345  9476 int end = sf.getEnd();
2346  9476 int[] mappedTo = mapping.locateInTo(start, end);
2347    /*
2348    * if whole exon range doesn't map, try interpreting it
2349    * as 5' or 3' exon overlapping the CDS range
2350    */
2351  9476 if (mappedTo == null)
2352    {
2353  4447 mappedTo = mapping.locateInTo(end, end);
2354  4447 if (mappedTo != null)
2355    {
2356    /*
2357    * end of exon is in CDS range - 5' overlap
2358    * to a range from the start of the peptide
2359    */
2360  0 mappedTo[0] = 1;
2361    }
2362    }
2363  9476 if (mappedTo == null)
2364    {
2365  4447 mappedTo = mapping.locateInTo(start, start);
2366  4447 if (mappedTo != null)
2367    {
2368    /*
2369    * start of exon is in CDS range - 3' overlap
2370    * to a range up to the end of the peptide
2371    */
2372  0 mappedTo[1] = toSeq.getLength();
2373    }
2374    }
2375  9476 if (mappedTo != null)
2376    {
2377  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2378  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2379  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2380    sf.getFeatureGroup(), sf.getScore());
2381  5029 copyTo.addSequenceFeature(copy);
2382  5029 count++;
2383    }
2384    }
2385  23 return count;
2386    }
2387   
2388    /**
2389    * Returns a mapping from dna to protein by inspecting sequence features of
2390    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2391    * length is 3 times the peptide length (optionally after dropping a trailing
2392    * stop codon). This method does not check whether the CDS nucleotide sequence
2393    * translates to the peptide sequence.
2394    *
2395    * @param dnaSeq
2396    * @param proteinSeq
2397    * @return
2398    */
 
2399  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2400    SequenceI proteinSeq)
2401    {
2402  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2403  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2404   
2405    /*
2406    * if not a whole number of codons, truncate mapping
2407    */
2408  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2409  22 if (codonRemainder > 0)
2410    {
2411  2 mappedDnaLength -= codonRemainder;
2412  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2413    }
2414   
2415  22 int proteinLength = proteinSeq.getLength();
2416  22 int proteinStart = proteinSeq.getStart();
2417  22 int proteinEnd = proteinSeq.getEnd();
2418   
2419    /*
2420    * incomplete start codon may mean X at start of peptide
2421    * we ignore both for mapping purposes
2422    */
2423  22 if (proteinSeq.getCharAt(0) == 'X')
2424    {
2425    // todo JAL-2022 support startPhase > 0
2426  1 proteinStart++;
2427  1 proteinLength--;
2428    }
2429  22 List<int[]> proteinRange = new ArrayList<>();
2430   
2431    /*
2432    * dna length should map to protein (or protein plus stop codon)
2433    */
2434  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2435  22 if (codesForResidues == (proteinLength + 1))
2436    {
2437    // assuming extra codon is for STOP and not in peptide
2438    // todo: check trailing codon is indeed a STOP codon
2439  2 codesForResidues--;
2440  2 mappedDnaLength -= CODON_LENGTH;
2441  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2442    }
2443   
2444  22 if (codesForResidues == proteinLength)
2445    {
2446  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2447  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2448    }
2449  18 return null;
2450    }
2451   
2452    /**
2453    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2454    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2455    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2456    * position order, so this method is only valid for linear CDS in the same
2457    * sense as the protein product.
2458    *
2459    * @param dnaSeq
2460    * @return
2461    */
 
2462  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2463    {
2464  24 List<int[]> result = new ArrayList<>();
2465   
2466  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2467    .getFeaturesByOntology(SequenceOntologyI.CDS);
2468  24 if (sfs.isEmpty())
2469    {
2470  16 return result;
2471    }
2472  8 SequenceFeatures.sortFeatures(sfs, true);
2473   
2474  8 for (SequenceFeature sf : sfs)
2475    {
2476  16 int phase = 0;
2477  16 try
2478    {
2479  16 String s = sf.getPhase();
2480  16 if (s != null)
2481    {
2482  2 phase = Integer.parseInt(s);
2483    }
2484    } catch (NumberFormatException e)
2485    {
2486    // leave as zero
2487    }
2488    /*
2489    * phase > 0 on first codon means 5' incomplete - skip to the start
2490    * of the next codon; example ENST00000496384
2491    */
2492  16 int begin = sf.getBegin();
2493  16 int end = sf.getEnd();
2494  16 if (result.isEmpty() && phase > 0)
2495    {
2496  2 begin += phase;
2497  2 if (begin > end)
2498    {
2499    // shouldn't happen!
2500  0 System.err
2501    .println("Error: start phase extends beyond start CDS in "
2502    + dnaSeq.getName());
2503    }
2504    }
2505  16 result.add(new int[] { begin, end });
2506    }
2507   
2508    /*
2509    * Finally sort ranges by start position. This avoids a dependency on
2510    * keeping features in order on the sequence (if they are in order anyway,
2511    * the sort will have almost no work to do). The implicit assumption is CDS
2512    * ranges are assembled in order. Other cases should not use this method,
2513    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2514    */
2515  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2516  8 return result;
2517    }
2518   
2519    /**
2520    * Makes an alignment with a copy of the given sequences, adding in any
2521    * non-redundant sequences which are mapped to by the cross-referenced
2522    * sequences.
2523    *
2524    * @param seqs
2525    * @param xrefs
2526    * @param dataset
2527    * the alignment dataset shared by the new copy
2528    * @return
2529    */
 
2530  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2531    SequenceI[] xrefs, AlignmentI dataset)
2532    {
2533  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2534  0 copy.setDataset(dataset);
2535  0 boolean isProtein = !copy.isNucleotide();
2536  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2537  0 if (xrefs != null)
2538    {
2539    // BH 2019.01.25 recoded to remove iterators
2540   
2541  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2542    {
2543  0 SequenceI xref = xrefs[ix];
2544  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2545  0 if (dbrefs != null)
2546    {
2547  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2548    {
2549  0 DBRefEntry dbref = dbrefs.get(ir);
2550  0 Mapping map = dbref.getMap();
2551  0 SequenceI mto;
2552  0 if (map == null || (mto = map.getTo()) == null
2553    || mto.isProtein() != isProtein)
2554    {
2555  0 continue;
2556    }
2557  0 SequenceI mappedTo = mto;
2558  0 SequenceI match = matcher.findIdMatch(mappedTo);
2559  0 if (match == null)
2560    {
2561  0 matcher.add(mappedTo);
2562  0 copy.addSequence(mappedTo);
2563    }
2564    }
2565    }
2566    }
2567    }
2568  0 return copy;
2569    }
2570   
2571    /**
2572    * Try to align sequences in 'unaligned' to match the alignment of their
2573    * mapped regions in 'aligned'. For example, could use this to align CDS
2574    * sequences which are mapped to their parent cDNA sequences.
2575    *
2576    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2577    * dna-to-protein or protein-to-dna use alternative methods.
2578    *
2579    * @param unaligned
2580    * sequences to be aligned
2581    * @param aligned
2582    * holds aligned sequences and their mappings
2583    * @return
2584    */
 
2585  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2586    {
2587    /*
2588    * easy case - aligning a copy of aligned sequences
2589    */
2590  4 if (alignAsSameSequences(unaligned, aligned))
2591    {
2592  0 return unaligned.getHeight();
2593    }
2594   
2595    /*
2596    * fancy case - aligning via mappings between sequences
2597    */
2598  4 List<SequenceI> unmapped = new ArrayList<>();
2599  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2600    unaligned, aligned, unmapped);
2601  4 int width = columnMap.size();
2602  4 char gap = unaligned.getGapCharacter();
2603  4 int realignedCount = 0;
2604    // TODO: verify this loop scales sensibly for very wide/high alignments
2605   
2606  4 for (SequenceI seq : unaligned.getSequences())
2607    {
2608  26 if (!unmapped.contains(seq))
2609    {
2610  26 char[] newSeq = new char[width];
2611  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2612    // Integer iteration below
2613  26 int newCol = 0;
2614  26 int lastCol = 0;
2615   
2616    /*
2617    * traverse the map to find columns populated
2618    * by our sequence
2619    */
2620  26 for (Integer column : columnMap.keySet())
2621    {
2622  58976 Character c = columnMap.get(column).get(seq);
2623  58976 if (c != null)
2624    {
2625    /*
2626    * sequence has a character at this position
2627    *
2628    */
2629  31986 newSeq[newCol] = c;
2630  31986 lastCol = newCol;
2631    }
2632  58976 newCol++;
2633    }
2634   
2635    /*
2636    * trim trailing gaps
2637    */
2638  26 if (lastCol < width)
2639    {
2640  26 char[] tmp = new char[lastCol + 1];
2641  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2642  26 newSeq = tmp;
2643    }
2644    // TODO: optimise SequenceI to avoid char[]->String->char[]
2645  26 seq.setSequence(String.valueOf(newSeq));
2646  26 realignedCount++;
2647    }
2648    }
2649  4 return realignedCount;
2650    }
2651   
2652    /**
2653    * If unaligned and aligned sequences share the same dataset sequences, then
2654    * simply copies the aligned sequences to the unaligned sequences and returns
2655    * true; else returns false
2656    *
2657    * @param unaligned
2658    * - sequences to be aligned based on aligned
2659    * @param aligned
2660    * - 'guide' alignment containing sequences derived from same dataset
2661    * as unaligned
2662    * @return
2663    */
 
2664  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2665    AlignmentI aligned)
2666    {
2667  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2668    {
2669  0 return false; // should only pass alignments with datasets here
2670    }
2671   
2672    // map from dataset sequence to alignment sequence(s)
2673  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2674  8 for (SequenceI seq : aligned.getSequences())
2675    {
2676  59 SequenceI ds = seq.getDatasetSequence();
2677  59 if (alignedDatasets.get(ds) == null)
2678    {
2679  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2680    }
2681  59 alignedDatasets.get(ds).add(seq);
2682    }
2683   
2684    /*
2685    * first pass - check whether all sequences to be aligned share a
2686    * dataset sequence with an aligned sequence; also note the leftmost
2687    * ungapped column from which to copy
2688    */
2689  8 int leftmost = Integer.MAX_VALUE;
2690  8 for (SequenceI seq : unaligned.getSequences())
2691    {
2692  14 final SequenceI ds = seq.getDatasetSequence();
2693  14 if (!alignedDatasets.containsKey(ds))
2694    {
2695  5 return false;
2696    }
2697  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2698  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2699  9 leftmost = Math.min(leftmost, startCol);
2700    }
2701   
2702    /*
2703    * second pass - copy aligned sequences;
2704    * heuristic rule: pair off sequences in order for the case where
2705    * more than one shares the same dataset sequence
2706    */
2707  3 final char gapCharacter = aligned.getGapCharacter();
2708  3 for (SequenceI seq : unaligned.getSequences())
2709    {
2710  7 List<SequenceI> alignedSequences = alignedDatasets
2711    .get(seq.getDatasetSequence());
2712  7 if (alignedSequences.isEmpty())
2713    {
2714    /*
2715    * defensive check - shouldn't happen! (JAL-3536)
2716    */
2717  0 continue;
2718    }
2719  7 SequenceI alignedSeq = alignedSequences.get(0);
2720   
2721    /*
2722    * gap fill for leading (5') UTR if any
2723    */
2724    // TODO this copies intron columns - wrong!
2725  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2726  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2727  7 char[] seqchars = new char[endCol - leftmost + 1];
2728  7 Arrays.fill(seqchars, gapCharacter);
2729  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2730  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2731    toCopy.length);
2732  7 seq.setSequence(String.valueOf(seqchars));
2733  7 if (alignedSequences.size() > 0)
2734    {
2735    // pop off aligned sequences (except the last one)
2736  7 alignedSequences.remove(0);
2737    }
2738    }
2739   
2740    /*
2741    * finally remove gapped columns (e.g. introns)
2742    */
2743  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2744    unaligned.getWidth() - 1, unaligned);
2745   
2746  3 return true;
2747    }
2748   
2749    /**
2750    * Returns a map whose key is alignment column number (base 1), and whose
2751    * values are a map of sequence characters in that column.
2752    *
2753    * @param unaligned
2754    * @param aligned
2755    * @param unmapped
2756    * @return
2757    */
 
2758  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2759    AlignmentI unaligned, AlignmentI aligned,
2760    List<SequenceI> unmapped)
2761    {
2762    /*
2763    * Map will hold, for each aligned column position, a map of
2764    * {unalignedSequence, characterPerSequence} at that position.
2765    * TreeMap keeps the entries in ascending column order.
2766    */
2767  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2768   
2769    /*
2770    * record any sequences that have no mapping so can't be realigned
2771    */
2772  4 unmapped.addAll(unaligned.getSequences());
2773   
2774  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2775   
2776  4 for (SequenceI seq : unaligned.getSequences())
2777    {
2778  26 for (AlignedCodonFrame mapping : mappings)
2779    {
2780  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2781  510 if (fromSeq != null)
2782    {
2783  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2784  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2785    {
2786  26 unmapped.remove(seq);
2787    }
2788    }
2789    }
2790    }
2791  4 return map;
2792    }
2793   
2794    /**
2795    * Helper method that adds to a map the mapped column positions of a sequence.
2796    * <br>
2797    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2798    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2799    * sequence.
2800    *
2801    * @param seq
2802    * the sequence whose column positions we are recording
2803    * @param fromSeq
2804    * a sequence that is mapped to the first sequence
2805    * @param seqMap
2806    * the mapping from 'fromSeq' to 'seq'
2807    * @param map
2808    * a map to add the column positions (in fromSeq) of the mapped
2809    * positions of seq
2810    * @return
2811    */
 
2812  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2813    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2814    {
2815  28 if (seqMap == null)
2816    {
2817  0 return false;
2818    }
2819   
2820    /*
2821    * invert mapping if it is from unaligned to aligned sequence
2822    */
2823  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2824    {
2825  0 seqMap = new Mapping(seq.getDatasetSequence(),
2826    seqMap.getMap().getInverse());
2827    }
2828   
2829  28 int toStart = seq.getStart();
2830   
2831    /*
2832    * traverse [start, end, start, end...] ranges in fromSeq
2833    */
2834  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2835    {
2836  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2837    {
2838  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2839   
2840    /*
2841    * find the range mapped to (sequence positions base 1)
2842    */
2843  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2844    fromRange[i + 1]);
2845  31 if (range == null)
2846    {
2847  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2848    + " from " + fromSeq.getName());
2849  0 return false;
2850    }
2851  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2852  31 int mappedCharPos = range[0];
2853   
2854    /*
2855    * walk over the 'from' aligned sequence in forward or reverse
2856    * direction; when a non-gap is found, record the column position
2857    * of the next character of the mapped-to sequence; stop when all
2858    * the characters of the range have been counted
2859    */
2860  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2861    && fromCol >= 0)
2862    {
2863  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2864    {
2865    /*
2866    * mapped from sequence has a character in this column
2867    * record the column position for the mapped to character
2868    */
2869  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2870  31998 if (seqsMap == null)
2871    {
2872  5398 seqsMap = new HashMap<>();
2873  5398 map.put(fromCol, seqsMap);
2874    }
2875  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2876  31998 mappedCharPos++;
2877    }
2878  2794243 fromCol += (forward ? 1 : -1);
2879    }
2880    }
2881    }
2882  28 return true;
2883    }
2884   
2885    // strictly temporary hack until proper criteria for aligning protein to cds
2886    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2887  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2888    {
2889  4 for (SequenceI seq : alignment.getSequences())
2890    {
2891  88 String name = seq.getName();
2892  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2893    {
2894  0 return false;
2895    }
2896    }
2897  4 return true;
2898    }
2899   
 
2900  5 toggle public static boolean isSecondaryStructurePresent(
2901    AlignmentAnnotation[] annotations)
2902    {
2903  5 boolean ssPresent = false;
2904   
2905  5 for (AlignmentAnnotation aa : annotations)
2906    {
2907  5 if (ssPresent)
2908    {
2909  0 break;
2910    }
2911   
2912  5 if (Constants.SECONDARY_STRUCTURE_LABELS.containsKey(aa.label))
2913    {
2914  3 ssPresent = true;
2915  3 break;
2916    }
2917    }
2918   
2919  5 return ssPresent;
2920   
2921    }
2922   
 
2923  0 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2924    {
2925   
2926  0 if (symbol == Constants.COIL)
2927    {
2928  0 return Color.gray;
2929    }
2930  0 if (symbol == Constants.SHEET)
2931    {
2932  0 return Color.green;
2933    }
2934  0 if (symbol == Constants.HELIX)
2935    {
2936  0 return Color.red;
2937    }
2938   
2939  0 return Color.white;
2940    }
2941   
 
2942  70406 toggle public static char findSSAnnotationForGivenSeqposition(
2943    AlignmentAnnotation aa, int seqPosition)
2944    {
2945  70406 char ss = '*';
2946   
2947  70406 if (aa != null)
2948    {
2949  70406 if (aa.getAnnotationForPosition(seqPosition) != null)
2950    {
2951  41428 Annotation a = aa.getAnnotationForPosition(seqPosition);
2952  41425 ss = a.secondaryStructure;
2953   
2954    // There is no representation for coil and it can be either ' ' or null.
2955  41428 if (ss == ' ' || ss == '-')
2956    {
2957  9747 ss = Constants.COIL;
2958    }
2959    }
2960    else
2961    {
2962  28978 ss = Constants.COIL;
2963    }
2964    }
2965   
2966  70406 return ss;
2967    }
2968   
 
2969  1674 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
2970    AlignmentAnnotation[] annotations)
2971    {
2972   
2973  1674 List<String> ssSources = new ArrayList<>();
2974  1674 Set<String> addedSources = new HashSet<>(); // to keep track of added
2975    // sources
2976   
2977  1674 if (annotations == null)
2978    {
2979  12 return ssSources;
2980    }
2981   
2982  1662 for (AlignmentAnnotation aa : annotations)
2983    {
2984   
2985  7817 String ssSource = extractSSSourceFromAnnotationDescription(aa);
2986   
2987  7817 if (ssSource != null && !addedSources.contains(ssSource))
2988    {
2989  65 ssSources.add(ssSource);
2990  65 addedSources.add(ssSource);
2991    }
2992   
2993    }
2994  1662 Collections.sort(ssSources);
2995   
2996  1662 return ssSources;
2997   
2998    }
2999   
 
3000  61843 toggle public static String extractSSSourceFromAnnotationDescription(
3001    AlignmentAnnotation aa)
3002    {
3003   
3004  61843 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3005    {
3006   
3007  69036 if (label.equals(aa.label))
3008    {
3009   
3010  54652 if (aa.getProperty(Constants.SS_PROVIDER_PROPERTY) != null)
3011    {
3012   
3013  0 return aa.getProperty(Constants.SS_PROVIDER_PROPERTY);
3014   
3015    }
3016   
3017    // For JPred
3018  54653 if (Constants.SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label))
3019    {
3020   
3021  0 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
3022   
3023    }
3024   
3025    // For input with secondary structure
3026  54653 if (Constants.SS_ANNOTATION_LABEL.equals(aa.label)
3027    && aa.description != null
3028    && Constants.SS_ANNOTATION_LABEL.equals(aa.description))
3029    {
3030   
3031  17329 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
3032   
3033    }
3034   
3035    // For other sources
3036  37324 if (aa.sequenceRef == null)
3037    {
3038  168 return null;
3039    }
3040  37156 else if (aa.sequenceRef.getDatasetSequence() == null)
3041    {
3042  0 return null;
3043    }
3044  37156 Vector<PDBEntry> pdbEntries = aa.sequenceRef.getDatasetSequence()
3045    .getAllPDBEntries();
3046   
3047    // TODO: this is an incredibly fragile mechanism
3048  37156 for (PDBEntry entry : pdbEntries)
3049    {
3050   
3051  45270 String entryProvider = entry.getProvider();
3052  45270 if (entryProvider == null)
3053    {
3054    // No provider - so this is either an old Jalview project, or not
3055    // retrieved from recognised source
3056  45270 entryProvider = "PDB";
3057    }
3058   
3059    // Should (re)use a standard mechanism for extracting the PDB ID as it
3060    // is written 1QWXTUV:CHAIN
3061    // Trim the string from first occurrence of colon
3062  45270 String entryID = entry.getId();
3063  45270 int index = entryID.indexOf(':');
3064   
3065    // Check if colon exists
3066  45270 if (index != -1)
3067    {
3068   
3069    // Trim the string from first occurrence of colon
3070  0 entryID = entryID.substring(0, index);
3071   
3072    }
3073   
3074    // TODO: shouldn't need to extract from description what the
3075    // originating ID is for this annotation!
3076  45270 if (entryProvider == "PDB" && aa.description.toLowerCase()
3077    .contains("secondary structure for "
3078    + entryID.toLowerCase()))
3079    {
3080   
3081  37156 return entryProvider;
3082   
3083    }
3084   
3085  8114 else if (entryProvider != "PDB" && aa.description.toLowerCase()
3086    .contains(entryID.toLowerCase()))
3087    {
3088   
3089  0 return entryProvider;
3090   
3091    }
3092   
3093    }
3094    }
3095    }
3096   
3097  7192 return null;
3098   
3099    }
3100   
3101    // to do set priority for labels
 
3102  10826552 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3103    SequenceI seq, String ssSource)
3104    {
3105   
3106  10827920 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3107  10835360 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3108    {
3109   
3110  21475438 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3111  21189853 if (aa != null)
3112    {
3113   
3114  99300 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3115    {
3116  49656 ssAnnots.addAll(Arrays.asList(aa));
3117  49656 continue;
3118    }
3119   
3120  49644 for (AlignmentAnnotation annot : aa)
3121    {
3122   
3123  54028 String ssSourceForAnnot = extractSSSourceFromAnnotationDescription(
3124    annot);
3125  54028 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot))
3126    {
3127  54028 ssAnnots.add(annot);
3128    }
3129    }
3130    }
3131    }
3132  10852554 if (ssAnnots.size() > 0)
3133    {
3134  99297 return ssAnnots;
3135    }
3136   
3137  10742578 return null;
3138   
3139    }
3140   
 
3141  3 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3142    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3143    {
3144   
3145  3 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3146  3 if (alignAnnotList == null || alignAnnotList.length == 0)
3147    {
3148  0 return ssAlignmentAnnotationForSequences;
3149    }
3150   
3151  3 for (AlignmentAnnotation aa : alignAnnotList)
3152    {
3153  12 if (aa.sequenceRef == null)
3154    {
3155  12 continue;
3156    }
3157   
3158  0 if (isSecondaryStructureFrom(selectedSSSource, aa))
3159    {
3160  0 ssAlignmentAnnotationForSequences
3161    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3162    k -> new ArrayList<>())
3163    .add(aa);
3164    }
3165    }
3166   
3167  3 return ssAlignmentAnnotationForSequences;
3168   
3169    }
3170   
3171    /**
3172    *
3173    * @param selectedSSSource
3174    * @param aa
3175    * @return true if aa is from a provider or all providers as specified by
3176    * selectedSSSource
3177    */
 
3178  0 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3179    AlignmentAnnotation aa)
3180    {
3181   
3182  0 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3183    {
3184   
3185  0 if (label.equals(aa.label))
3186    {
3187   
3188  0 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3189    {
3190  0 return true;
3191    }
3192  0 String ssSource = AlignmentUtils
3193    .extractSSSourceFromAnnotationDescription(aa);
3194  0 if (ssSource != null && ssSource.equals(selectedSSSource))
3195    {
3196  0 return true;
3197    }
3198    }
3199    }
3200  0 return false;
3201    }
3202   
3203    // Method to get the key for a given provider value
 
3204  0 toggle public static String getSecondaryStructureProviderKey(
3205    String providerValue)
3206    {
3207  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS
3208    .entrySet())
3209    {
3210  0 if (entry.getValue().equals(providerValue))
3211    {
3212  0 return entry.getKey(); // Return the key (abbreviation) for the matching
3213    // provider value
3214    }
3215    }
3216  0 return null; // Return null if no match is found
3217    }
3218   
 
3219  0 toggle public static String reduceLabelLength(String label)
3220    {
3221    // Split the input by " | "
3222  0 String[] parts = label.split(" \\| ");
3223   
3224    // Map the full names to their abbreviations
3225  0 String reducedLabel = Arrays.stream(parts)
3226    .map(fullName -> Constants.STRUCTURE_PROVIDERS.entrySet()
3227    .stream()
3228    .filter(entry -> entry.getValue().equals(fullName))
3229    .map(Map.Entry::getKey).findFirst().orElse(fullName)) // Use
3230    // fullName
3231    // if
3232    // no
3233    // abbreviation
3234    // is
3235    // found
3236    .collect(Collectors.joining(" | "));
3237   
3238  0 return reducedLabel; // Return the reduced label if abbreviations were
3239    // applied
3240    }
3241   
 
3242  0 toggle public static Color getSecondaryStructureProviderColor(String label)
3243    {
3244   
3245    // return Constants.STRUCTURE_PROVIDERS_COLOR.getOrDefault(label,
3246    // Color.BLACK);
3247  0 Color c = Constants.STRUCTURE_PROVIDERS_COLOR.get(label.trim());
3248  0 if (c == null)
3249  0 c = Color.BLACK;
3250  0 return c;
3251    }
3252   
 
3253  0 toggle public static void assignSecondaryStructureProviderColor(
3254    Map<String, Color> secondaryStructureProviderColorMap,
3255    List<String> labels)
3256    {
3257    /*
3258    // Use a Set to track unique labels
3259    Set<String> uniqueLabels = new HashSet<>(labels);
3260   
3261    Color[] palette = ColorBrewer.Paired
3262    .getColorPalette(uniqueLabels.size());
3263   
3264    List<Color> colorList = new ArrayList<>();
3265    Collections.addAll(colorList, palette);
3266    Collections.shuffle(colorList);
3267    int i = 0;
3268    */
3269    // Loop through each unique label and add it to the map with a color.
3270    // for (String label : uniqueLabels)
3271  0 for (String label : labels)
3272    {
3273    // Generate or retrieve a color for the label.
3274  0 String name = label.toUpperCase(Locale.ROOT).trim();
3275  0 secondaryStructureProviderColorMap.put(name,
3276    // ColorUtils.getDefaultColourFromName(name));
3277    ColorUtils.getColourFromNameAndScheme(name, "DESATURATED"));
3278    // secondaryStructureProviderColorMap.put(label.toUpperCase().trim(),
3279    // colorList.get(i));
3280    // i++;
3281    }
3282    }
3283    }