Clover icon

Coverage Report

  1. Project Clover database Fri Jun 19 2026 11:35:32 BST
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

482
977
64
2
3,348
2,076
374
0.38
15.27
32
5.84

Classes

Class Line # Actions
AlignmentUtils 83 971 368
0.845593184.6%
AlignmentUtils.DnaVariant 99 6 6
0.00%
 

Contributing tests

This file is covered by 326 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import jalview.bin.Console;
44    import jalview.commands.RemoveGapColCommand;
45    import jalview.datamodel.AlignedCodon;
46    import jalview.datamodel.AlignedCodonFrame;
47    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
48    import jalview.datamodel.Alignment;
49    import jalview.datamodel.AlignmentAnnotation;
50    import jalview.datamodel.AlignmentI;
51    import jalview.datamodel.Annotation;
52    import jalview.datamodel.ContactMatrixI;
53    import jalview.datamodel.DBRefEntry;
54    import jalview.datamodel.DBRefSource;
55    import jalview.datamodel.GeneLociI;
56    import jalview.datamodel.IncompleteCodonException;
57    import jalview.datamodel.Mapping;
58    import jalview.datamodel.PDBEntry;
59    import jalview.datamodel.Sequence;
60    import jalview.datamodel.SequenceFeature;
61    import jalview.datamodel.SequenceGroup;
62    import jalview.datamodel.SequenceI;
63    import jalview.datamodel.features.SequenceFeatures;
64    import jalview.io.gff.SequenceOntologyI;
65    import jalview.schemes.ResidueProperties;
66    import jalview.util.ColorUtils;
67    import jalview.util.Comparison;
68    import jalview.util.Constants;
69    import jalview.util.DBRefUtils;
70    import jalview.util.IntRangeComparator;
71    import jalview.util.MapList;
72    import jalview.util.MappingUtils;
73    import jalview.util.ShiftList;
74    import uk.ac.vamsas.objects.core.descriptors.NewickDescriptor;
75   
76    /**
77    * grab bag of useful alignment manipulation operations Expect these to be
78    * refactored elsewhere at some point.
79    *
80    * @author jimp
81    *
82    */
 
83    public class AlignmentUtils
84    {
85    private static final int CODON_LENGTH = 3;
86   
87    private static final String SEQUENCE_VARIANT = "sequence_variant:";
88   
89    /*
90    * the 'id' attribute is provided for variant features fetched from
91    * Ensembl using its REST service with JSON format
92    */
93    public static final String VARIANT_ID = "id";
94   
95    /**
96    * A data model to hold the 'normal' base value at a position, and an optional
97    * sequence variant feature
98    */
 
99    static final class DnaVariant
100    {
101    final String base;
102   
103    SequenceFeature variant;
104   
 
105  0 toggle DnaVariant(String nuc)
106    {
107  0 base = nuc;
108  0 variant = null;
109    }
110   
 
111  0 toggle DnaVariant(String nuc, SequenceFeature var)
112    {
113  0 base = nuc;
114  0 variant = var;
115    }
116   
 
117  0 toggle public String getSource()
118    {
119  0 return variant == null ? null : variant.getFeatureGroup();
120    }
121   
122    /**
123    * toString for aid in the debugger only
124    */
 
125  0 toggle @Override
126    public String toString()
127    {
128  0 return base + ":" + (variant == null ? "" : variant.getDescription());
129    }
130    }
131   
132    /**
133    * given an existing alignment, create a new alignment including all, or up to
134    * flankSize additional symbols from each sequence's dataset sequence
135    *
136    * @param core
137    * @param flankSize
138    * @return AlignmentI
139    */
 
140  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
141    {
142  27 List<SequenceI> sq = new ArrayList<>();
143  27 int maxoffset = 0;
144  27 for (SequenceI s : core.getSequences())
145    {
146  131 SequenceI newSeq = s.deriveSequence();
147  131 final int newSeqStart = newSeq.getStart() - 1;
148  131 if (newSeqStart > maxoffset
149    && newSeq.getDatasetSequence().getStart() < s.getStart())
150    {
151  131 maxoffset = newSeqStart;
152    }
153  131 sq.add(newSeq);
154    }
155  27 if (flankSize > -1)
156    {
157  25 maxoffset = Math.min(maxoffset, flankSize);
158    }
159   
160    /*
161    * now add offset left and right to create an expanded alignment
162    */
163  27 for (SequenceI s : sq)
164    {
165  131 SequenceI ds = s;
166  262 while (ds.getDatasetSequence() != null)
167    {
168  131 ds = ds.getDatasetSequence();
169    }
170  131 int s_end = s.findPosition(s.getStart() + s.getLength());
171    // find available flanking residues for sequence
172  131 int ustream_ds = s.getStart() - ds.getStart();
173  131 int dstream_ds = ds.getEnd() - s_end;
174   
175    // build new flanked sequence
176   
177    // compute gap padding to start of flanking sequence
178  131 int offset = maxoffset - ustream_ds;
179   
180    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
181  131 if (flankSize >= 0)
182    {
183  125 if (flankSize < ustream_ds)
184    {
185    // take up to flankSize residues
186  40 offset = maxoffset - flankSize;
187  40 ustream_ds = flankSize;
188    }
189  125 if (flankSize <= dstream_ds)
190    {
191  116 dstream_ds = flankSize - 1;
192    }
193    }
194    // TODO use Character.toLowerCase to avoid creating String objects?
195  131 char[] upstream = new String(ds
196    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
197    .toLowerCase(Locale.ROOT).toCharArray();
198  131 char[] downstream = new String(
199    ds.getSequence(s_end - 1, s_end + dstream_ds))
200    .toLowerCase(Locale.ROOT).toCharArray();
201  131 char[] coreseq = s.getSequence();
202  131 char[] nseq = new char[offset + upstream.length + downstream.length
203    + coreseq.length];
204  131 char c = core.getGapCharacter();
205   
206  131 int p = 0;
207  461 for (; p < offset; p++)
208    {
209  330 nseq[p] = c;
210    }
211   
212  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
213  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
214    coreseq.length);
215  131 System.arraycopy(downstream, 0, nseq,
216    p + coreseq.length + upstream.length, downstream.length);
217  131 s.setSequence(new String(nseq));
218  131 s.setStart(s.getStart() - ustream_ds);
219  131 s.setEnd(s_end + downstream.length);
220    }
221  27 AlignmentI newAl = new jalview.datamodel.Alignment(
222    sq.toArray(new SequenceI[0]));
223  27 for (SequenceI s : sq)
224    {
225  131 if (s.getAnnotation() != null)
226    {
227  1 for (AlignmentAnnotation aa : s.getAnnotation())
228    {
229  1 aa.adjustForAlignment(); // JAL-1712 fix
230  1 newAl.addAnnotation(aa);
231    }
232    }
233    }
234  27 newAl.setDataset(core.getDataset());
235  27 return newAl;
236    }
237   
238    /**
239    * Returns the index (zero-based position) of a sequence in an alignment, or
240    * -1 if not found.
241    *
242    * @param al
243    * @param seq
244    * @return
245    */
 
246  67199 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
247    {
248  67199 int result = -1;
249  67199 int pos = 0;
250  67199 for (SequenceI alSeq : al.getSequences())
251    {
252  126433495 if (alSeq == seq)
253    {
254  67128 result = pos;
255  67128 break;
256    }
257  126366367 pos++;
258    }
259  67199 return result;
260    }
261   
262    /**
263    * Returns a map of lists of sequences in the alignment, keyed by sequence
264    * name. For use in mapping between different alignment views of the same
265    * sequences.
266    *
267    * @see jalview.datamodel.AlignmentI#getSequencesByName()
268    */
 
269  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
270    AlignmentI al)
271    {
272  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
273  1 for (SequenceI seq : al.getSequences())
274    {
275  3 String name = seq.getName();
276  3 if (name != null)
277    {
278  3 List<SequenceI> seqs = theMap.get(name);
279  3 if (seqs == null)
280    {
281  2 seqs = new ArrayList<>();
282  2 theMap.put(name, seqs);
283    }
284  3 seqs.add(seq);
285    }
286    }
287  1 return theMap;
288    }
289   
290    /**
291    * Build mapping of protein to cDNA alignment. Mappings are made between
292    * sequences where the cDNA translates to the protein sequence. Any new
293    * mappings are added to the protein alignment. Returns true if any mappings
294    * either already exist or were added, else false.
295    *
296    * @param proteinAlignment
297    * @param cdnaAlignment
298    * @return
299    */
 
300  4 toggle public static boolean mapProteinAlignmentToCdna(
301    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
302    {
303  4 if (proteinAlignment == null || cdnaAlignment == null)
304    {
305  0 return false;
306    }
307   
308  4 Set<SequenceI> mappedDna = new HashSet<>();
309  4 Set<SequenceI> mappedProtein = new HashSet<>();
310   
311    /*
312    * First pass - map sequences where cross-references exist. This include
313    * 1-to-many mappings to support, for example, variant cDNA.
314    */
315  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
316    cdnaAlignment, mappedDna, mappedProtein, true);
317   
318    /*
319    * Second pass - map sequences where no cross-references exist. This only
320    * does 1-to-1 mappings and assumes corresponding sequences are in the same
321    * order in the alignments.
322    */
323  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
324    mappedDna, mappedProtein, false);
325  4 return mappingPerformed;
326    }
327   
328    /**
329    * Make mappings between compatible sequences (where the cDNA translation
330    * matches the protein).
331    *
332    * @param proteinAlignment
333    * @param cdnaAlignment
334    * @param mappedDna
335    * a set of mapped DNA sequences (to add to)
336    * @param mappedProtein
337    * a set of mapped Protein sequences (to add to)
338    * @param xrefsOnly
339    * if true, only map sequences where xrefs exist
340    * @return
341    */
 
342  8 toggle protected static boolean mapProteinToCdna(
343    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
344    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
345    boolean xrefsOnly)
346    {
347  8 boolean mappingExistsOrAdded = false;
348  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
349  8 for (SequenceI aaSeq : thisSeqs)
350    {
351  22 boolean proteinMapped = false;
352  22 AlignedCodonFrame acf = new AlignedCodonFrame();
353   
354  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
355    {
356    /*
357    * Always try to map if sequences have xref to each other; this supports
358    * variant cDNA or alternative splicing for a protein sequence.
359    *
360    * If no xrefs, try to map progressively, assuming that alignments have
361    * mappable sequences in corresponding order. These are not
362    * many-to-many, as that would risk mixing species with similar cDNA
363    * sequences.
364    */
365  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
366    {
367  39 continue;
368    }
369   
370    /*
371    * Don't map non-xrefd sequences more than once each. This heuristic
372    * allows us to pair up similar sequences in ordered alignments.
373    */
374  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
375    || mappedDna.contains(cdnaSeq)))
376    {
377  29 continue;
378    }
379  18 if (mappingExists(proteinAlignment.getCodonFrames(),
380    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
381    {
382  0 mappingExistsOrAdded = true;
383    }
384    else
385    {
386  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
387  18 if (map != null)
388    {
389  12 acf.addMap(cdnaSeq, aaSeq, map);
390  12 mappingExistsOrAdded = true;
391  12 proteinMapped = true;
392  12 mappedDna.add(cdnaSeq);
393  12 mappedProtein.add(aaSeq);
394    }
395    }
396    }
397  22 if (proteinMapped)
398    {
399  11 proteinAlignment.addCodonFrame(acf);
400    }
401    }
402  8 return mappingExistsOrAdded;
403    }
404   
405    /**
406    * Answers true if the mappings include one between the given (dataset)
407    * sequences.
408    */
 
409  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
410    SequenceI aaSeq, SequenceI cdnaSeq)
411    {
412  18 if (mappings != null)
413    {
414  18 for (AlignedCodonFrame acf : mappings)
415    {
416  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
417    {
418  0 return true;
419    }
420    }
421    }
422  18 return false;
423    }
424   
425    /**
426    * Builds a mapping (if possible) of a cDNA to a protein sequence.
427    * <ul>
428    * <li>first checks if the cdna translates exactly to the protein
429    * sequence</li>
430    * <li>else checks for translation after removing a STOP codon</li>
431    * <li>else checks for translation after removing a START codon</li>
432    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
433    * </ul>
434    * Returns null if no mapping is determined.
435    *
436    * @param proteinSeq
437    * the aligned protein sequence
438    * @param cdnaSeq
439    * the aligned cdna sequence
440    * @return
441    */
 
442  31 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
443    SequenceI cdnaSeq)
444    {
445    /*
446    * Here we handle either dataset sequence set (desktop) or absent (applet).
447    * Use only the char[] form of the sequence to avoid creating possibly large
448    * String objects.
449    */
450  31 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
451  31 char[] aaSeqChars = proteinDataset != null
452    ? proteinDataset.getSequence()
453    : proteinSeq.getSequence();
454  31 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
455  31 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
456    : cdnaSeq.getSequence();
457  31 if (aaSeqChars == null || cdnaSeqChars == null)
458    {
459  0 return null;
460    }
461   
462    /*
463    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
464    */
465  31 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
466  31 int cdnaLength = cdnaSeqChars.length;
467  31 int cdnaStart = cdnaSeq.getStart();
468  31 int cdnaEnd = cdnaSeq.getEnd();
469  31 final int proteinStart = proteinSeq.getStart();
470  31 final int proteinEnd = proteinSeq.getEnd();
471   
472    /*
473    * If lengths don't match, try ignoring stop codon (if present)
474    */
475  31 if (cdnaLength != mappedLength && cdnaLength > 2)
476    {
477  16 String lastCodon = String.valueOf(cdnaSeqChars,
478    cdnaLength - CODON_LENGTH, CODON_LENGTH)
479    .toUpperCase(Locale.ROOT);
480  16 for (String stop : ResidueProperties.STOP_CODONS)
481    {
482  47 if (lastCodon.equals(stop))
483    {
484  2 cdnaEnd -= CODON_LENGTH;
485  2 cdnaLength -= CODON_LENGTH;
486  2 break;
487    }
488    }
489    }
490   
491    /*
492    * If lengths still don't match, try ignoring start codon.
493    */
494  31 int startOffset = 0;
495  31 if (cdnaLength != mappedLength && cdnaLength > 2
496    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
497    .toUpperCase(Locale.ROOT)
498    .equals(ResidueProperties.START))
499    {
500  5 startOffset += CODON_LENGTH;
501  5 cdnaStart += CODON_LENGTH;
502  5 cdnaLength -= CODON_LENGTH;
503    }
504   
505  31 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
506    {
507    /*
508    * protein is translation of dna (+/- start/stop codons)
509    */
510  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
511    new int[]
512    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
513  15 return map;
514    }
515   
516    /*
517    * translation failed - try mapping CDS annotated regions of dna
518    */
519  16 return mapCdsToProtein(cdnaSeq, proteinSeq);
520    }
521   
522    /**
523    * Test whether the given cdna sequence, starting at the given offset,
524    * translates to the given amino acid sequence, using the standard translation
525    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
526    *
527    * @param cdnaSeqChars
528    * @param cdnaStart
529    * @param aaSeqChars
530    * @return
531    */
 
532  51 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
533    char[] aaSeqChars)
534    {
535  51 if (cdnaSeqChars == null || aaSeqChars == null)
536    {
537  3 return false;
538    }
539   
540  48 int aaPos = 0;
541  48 int dnaPos = cdnaStart;
542  161 for (; dnaPos < cdnaSeqChars.length - 2
543    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
544    {
545  130 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
546  130 final String translated = ResidueProperties.codonTranslate(codon);
547   
548    /*
549    * allow * in protein to match untranslatable in dna
550    */
551  130 final char aaRes = aaSeqChars[aaPos];
552  130 if ((translated == null || ResidueProperties.STOP.equals(translated))
553    && aaRes == '*')
554    {
555  4 continue;
556    }
557  126 if (translated == null || !(aaRes == translated.charAt(0)))
558    {
559    // debug
560    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
561    // + ": "
562    // + codon + "(" + translated + ") != " + aaRes));
563  17 return false;
564    }
565    }
566   
567    /*
568    * check we matched all of the protein sequence
569    */
570  31 if (aaPos != aaSeqChars.length)
571    {
572  2 return false;
573    }
574   
575    /*
576    * check we matched all of the dna except
577    * for optional trailing STOP codon
578    */
579  29 if (dnaPos == cdnaSeqChars.length)
580    {
581  17 return true;
582    }
583  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
584    {
585  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
586  11 if (ResidueProperties.STOP
587    .equals(ResidueProperties.codonTranslate(codon)))
588    {
589  9 return true;
590    }
591    }
592  3 return false;
593    }
594   
595    /**
596    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
597    * currently assumes that we are aligning cDNA to match protein.
598    *
599    * @param seq
600    * the sequence to be realigned
601    * @param al
602    * the alignment whose sequence alignment is to be 'copied'
603    * @param gap
604    * character string represent a gap in the realigned sequence
605    * @param preserveUnmappedGaps
606    * @param preserveMappedGaps
607    * @return true if the sequence was realigned, false if it could not be
608    */
 
609  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
610    String gap, boolean preserveMappedGaps,
611    boolean preserveUnmappedGaps)
612    {
613    /*
614    * Get any mappings from the source alignment to the target (dataset)
615    * sequence.
616    */
617    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
618    // all mappings. Would it help to constrain this?
619  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
620  8 if (mappings == null || mappings.isEmpty())
621    {
622  0 return false;
623    }
624   
625    /*
626    * Locate the aligned source sequence whose dataset sequence is mapped. We
627    * just take the first match here (as we can't align like more than one
628    * sequence).
629    */
630  8 SequenceI alignFrom = null;
631  8 AlignedCodonFrame mapping = null;
632  8 for (AlignedCodonFrame mp : mappings)
633    {
634  8 alignFrom = mp.findAlignedSequence(seq, al);
635  8 if (alignFrom != null)
636    {
637  4 mapping = mp;
638  4 break;
639    }
640    }
641   
642  8 if (alignFrom == null)
643    {
644  4 return false;
645    }
646  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
647    preserveMappedGaps, preserveUnmappedGaps);
648  4 return true;
649    }
650   
651    /**
652    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
653    * match residues and codons. Flags control whether existing gaps in unmapped
654    * (intron) and mapped (exon) regions are preserved or not. Gaps between
655    * intron and exon are only retained if both flags are set.
656    *
657    * @param alignTo
658    * @param alignFrom
659    * @param mapping
660    * @param myGap
661    * @param sourceGap
662    * @param preserveUnmappedGaps
663    * @param preserveMappedGaps
664    */
 
665  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
666    AlignedCodonFrame mapping, String myGap, char sourceGap,
667    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
668    {
669    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
670   
671    // aligned and dataset sequence positions, all base zero
672  19 int thisSeqPos = 0;
673  19 int sourceDsPos = 0;
674   
675  19 int basesWritten = 0;
676  19 char myGapChar = myGap.charAt(0);
677  19 int ratio = myGap.length();
678   
679  19 int fromOffset = alignFrom.getStart() - 1;
680  19 int toOffset = alignTo.getStart() - 1;
681  19 int sourceGapMappedLength = 0;
682  19 boolean inExon = false;
683  19 final int toLength = alignTo.getLength();
684  19 final int fromLength = alignFrom.getLength();
685  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
686   
687    /*
688    * Traverse the 'model' aligned sequence
689    */
690  205 for (int i = 0; i < fromLength; i++)
691    {
692  186 char sourceChar = alignFrom.getCharAt(i);
693  186 if (sourceChar == sourceGap)
694    {
695  44 sourceGapMappedLength += ratio;
696  44 continue;
697    }
698   
699    /*
700    * Found a non-gap character. Locate its mapped region if any.
701    */
702  142 sourceDsPos++;
703    // Note mapping positions are base 1, our sequence positions base 0
704  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
705    sourceDsPos + fromOffset);
706  142 if (mappedPos == null)
707    {
708    /*
709    * unmapped position; treat like a gap
710    */
711  94 sourceGapMappedLength += ratio;
712    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
713    // residue "
714    // + sourceDsPos + "(" + sourceChar + ")");
715    // return;
716  94 continue;
717    }
718   
719  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
720  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
721  48 StringBuilder trailingCopiedGap = new StringBuilder();
722   
723    /*
724    * Copy dna sequence up to and including this codon. Optionally, include
725    * gaps before the codon starts (in introns) and/or after the codon starts
726    * (in exons).
727    *
728    * Note this only works for 'linear' splicing, not reverse or interleaved.
729    * But then 'align dna as protein' doesn't make much sense otherwise.
730    */
731  48 int intronLength = 0;
732  294 while (basesWritten + toOffset < mappedCodonEnd
733    && thisSeqPos < toLength)
734    {
735  246 final char c = alignTo.getCharAt(thisSeqPos++);
736  246 if (c != myGapChar)
737    {
738  146 basesWritten++;
739  146 int sourcePosition = basesWritten + toOffset;
740  146 if (sourcePosition < mappedCodonStart)
741    {
742    /*
743    * Found an unmapped (intron) base. First add in any preceding gaps
744    * (if wanted).
745    */
746  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
747    {
748  17 thisAligned.append(trailingCopiedGap.toString());
749  17 intronLength += trailingCopiedGap.length();
750  17 trailingCopiedGap = new StringBuilder();
751    }
752  48 intronLength++;
753  48 inExon = false;
754    }
755    else
756    {
757  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
758  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
759    preserveUnmappedGaps, sourceGapMappedLength, inExon,
760    trailingCopiedGap.length(), intronLength, startOfCodon);
761  215 for (int k = 0; k < gapsToAdd; k++)
762    {
763  117 thisAligned.append(myGapChar);
764    }
765  98 sourceGapMappedLength = 0;
766  98 inExon = true;
767    }
768  146 thisAligned.append(c);
769  146 trailingCopiedGap = new StringBuilder();
770    }
771    else
772    {
773  100 if (inExon && preserveMappedGaps)
774    {
775  32 trailingCopiedGap.append(myGapChar);
776    }
777  68 else if (!inExon && preserveUnmappedGaps)
778    {
779  27 trailingCopiedGap.append(myGapChar);
780    }
781    }
782    }
783    }
784   
785    /*
786    * At end of model aligned sequence. Copy any remaining target sequence, optionally
787    * including (intron) gaps.
788    */
789  129 while (thisSeqPos < toLength)
790    {
791  110 final char c = alignTo.getCharAt(thisSeqPos++);
792  110 if (c != myGapChar || preserveUnmappedGaps)
793    {
794  102 thisAligned.append(c);
795    }
796  110 sourceGapMappedLength--;
797    }
798   
799    /*
800    * finally add gaps to pad for any trailing source gaps or
801    * unmapped characters
802    */
803  19 if (preserveUnmappedGaps)
804    {
805  24 while (sourceGapMappedLength > 0)
806    {
807  12 thisAligned.append(myGapChar);
808  12 sourceGapMappedLength--;
809    }
810    }
811   
812    /*
813    * All done aligning, set the aligned sequence.
814    */
815  19 alignTo.setSequence(new String(thisAligned));
816    }
817   
818    /**
819    * Helper method to work out how many gaps to insert when realigning.
820    *
821    * @param preserveMappedGaps
822    * @param preserveUnmappedGaps
823    * @param sourceGapMappedLength
824    * @param inExon
825    * @param trailingCopiedGap
826    * @param intronLength
827    * @param startOfCodon
828    * @return
829    */
 
830  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
831    boolean preserveUnmappedGaps, int sourceGapMappedLength,
832    boolean inExon, int trailingGapLength, int intronLength,
833    final boolean startOfCodon)
834    {
835  98 int gapsToAdd = 0;
836  98 if (startOfCodon)
837    {
838    /*
839    * Reached start of codon. Ignore trailing gaps in intron unless we are
840    * preserving gaps in both exon and intron. Ignore them anyway if the
841    * protein alignment introduces a gap at least as large as the intronic
842    * region.
843    */
844  40 if (inExon && !preserveMappedGaps)
845    {
846  4 trailingGapLength = 0;
847    }
848  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
849    {
850  19 trailingGapLength = 0;
851    }
852  40 if (inExon)
853    {
854  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
855    }
856    else
857    {
858  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
859    {
860  20 gapsToAdd = sourceGapMappedLength - intronLength;
861    }
862    else
863    {
864  6 gapsToAdd = Math.min(
865    intronLength + trailingGapLength - sourceGapMappedLength,
866    trailingGapLength);
867    }
868    }
869    }
870    else
871    {
872    /*
873    * second or third base of codon; check for any gaps in dna
874    */
875  58 if (!preserveMappedGaps)
876    {
877  32 trailingGapLength = 0;
878    }
879  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
880    }
881  98 return gapsToAdd;
882    }
883   
884    /**
885    * Realigns the given protein to match the alignment of the dna, using codon
886    * mappings to translate aligned codon positions to protein residues.
887    *
888    * @param protein
889    * the alignment whose sequences are realigned by this method
890    * @param dna
891    * the dna alignment whose alignment we are 'copying'
892    * @return the number of sequences that were realigned
893    */
 
894  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
895    {
896  5 if (protein.isNucleotide() || !dna.isNucleotide())
897    {
898  0 jalview.bin.Console
899    .errPrintln("Wrong alignment type in alignProteinAsDna");
900  0 return 0;
901    }
902  5 List<SequenceI> unmappedProtein = new ArrayList<>();
903  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
904    protein, dna, unmappedProtein);
905  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
906    }
907   
908    /**
909    * Realigns the given dna to match the alignment of the protein, using codon
910    * mappings to translate aligned peptide positions to codons.
911    *
912    * Always produces a padded CDS alignment.
913    *
914    * @param dna
915    * the alignment whose sequences are realigned by this method
916    * @param protein
917    * the protein alignment whose alignment we are 'copying'
918    * @return the number of sequences that were realigned
919    */
 
920  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
921    {
922  4 if (protein.isNucleotide() || !dna.isNucleotide())
923    {
924  0 jalview.bin.Console
925    .errPrintln("Wrong alignment type in alignProteinAsDna");
926  0 return 0;
927    }
928    // todo: implement this
929  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
930  4 int alignedCount = 0;
931  4 int width = 0; // alignment width for padding CDS
932  4 for (SequenceI dnaSeq : dna.getSequences())
933    {
934  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
935    dna.getGapCharacter()))
936    {
937  5 alignedCount++;
938    }
939  5 width = Math.max(dnaSeq.getLength(), width);
940    }
941  4 int oldwidth;
942  4 int diff;
943  4 for (SequenceI dnaSeq : dna.getSequences())
944    {
945  5 oldwidth = dnaSeq.getLength();
946  5 diff = width - oldwidth;
947  5 if (diff > 0)
948    {
949  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
950    }
951    }
952  4 return alignedCount;
953    }
954   
955    /**
956    * Helper method to align (if possible) the dna sequence to match the
957    * alignment of a mapped protein sequence. This is currently limited to
958    * handling coding sequence only.
959    *
960    * @param cdsSeq
961    * @param protein
962    * @param mappings
963    * @param gapChar
964    * @return
965    */
 
966  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
967    AlignmentI protein, List<AlignedCodonFrame> mappings,
968    char gapChar)
969    {
970  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
971  5 if (cdsDss == null)
972    {
973  0 System.err
974    .println("alignCdsSequenceAsProtein needs aligned sequence!");
975  0 return false;
976    }
977   
978  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
979    .findMappingsForSequence(cdsSeq, mappings);
980  5 for (AlignedCodonFrame mapping : dnaMappings)
981    {
982  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
983  5 if (peptide != null)
984    {
985  5 final int peptideLength = peptide.getLength();
986  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
987  5 if (map != null)
988    {
989  5 MapList mapList = map.getMap();
990  5 if (map.getTo() == peptide.getDatasetSequence())
991    {
992  5 mapList = mapList.getInverse();
993    }
994  5 final int cdsLength = cdsDss.getLength();
995  5 int mappedFromLength = MappingUtils
996    .getLength(mapList.getFromRanges());
997  5 int mappedToLength = MappingUtils
998    .getLength(mapList.getToRanges());
999  5 boolean addStopCodon = (cdsLength == mappedFromLength
1000    * CODON_LENGTH + CODON_LENGTH)
1001    || (peptide.getDatasetSequence()
1002    .getLength() == mappedFromLength - 1);
1003  5 if (cdsLength != mappedToLength && !addStopCodon)
1004    {
1005  0 jalview.bin.Console.errPrintln(String.format(
1006    "Can't align cds as protein (length mismatch %d/%d): %s",
1007    cdsLength, mappedToLength, cdsSeq.getName()));
1008    }
1009   
1010    /*
1011    * pre-fill the aligned cds sequence with gaps
1012    */
1013  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1014  5 + (addStopCodon ? CODON_LENGTH : 0)];
1015  5 Arrays.fill(alignedCds, gapChar);
1016   
1017    /*
1018    * walk over the aligned peptide sequence and insert mapped
1019    * codons for residues in the aligned cds sequence
1020    */
1021  5 int copiedBases = 0;
1022  5 int cdsStart = cdsDss.getStart();
1023  5 int proteinPos = peptide.getStart() - 1;
1024  5 int cdsCol = 0;
1025   
1026  33 for (int col = 0; col < peptideLength; col++)
1027    {
1028  28 char residue = peptide.getCharAt(col);
1029   
1030  28 if (Comparison.isGap(residue))
1031    {
1032  13 cdsCol += CODON_LENGTH;
1033    }
1034    else
1035    {
1036  15 proteinPos++;
1037  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1038  15 if (codon == null)
1039    {
1040    // e.g. incomplete start codon, X in peptide
1041  0 cdsCol += CODON_LENGTH;
1042    }
1043    else
1044    {
1045  60 for (int j = codon[0]; j <= codon[1]; j++)
1046    {
1047  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1048  45 alignedCds[cdsCol++] = mappedBase;
1049  45 copiedBases++;
1050    }
1051    }
1052    }
1053    }
1054   
1055    /*
1056    * append stop codon if not mapped from protein,
1057    * closing it up to the end of the mapped sequence
1058    */
1059  5 if (copiedBases == cdsLength - CODON_LENGTH)
1060    {
1061  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1062    {
1063  0 if (!Comparison.isGap(alignedCds[i]))
1064    {
1065  0 cdsCol = i + 1; // gap just after end of sequence
1066  0 break;
1067    }
1068    }
1069  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1070    {
1071  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1072    }
1073    }
1074  5 cdsSeq.setSequence(new String(alignedCds));
1075  5 return true;
1076    }
1077    }
1078    }
1079  0 return false;
1080    }
1081   
1082    /**
1083    * Builds a map whose key is an aligned codon position (3 alignment column
1084    * numbers base 0), and whose value is a map from protein sequence to each
1085    * protein's peptide residue for that codon. The map generates an ordering of
1086    * the codons, and allows us to read off the peptides at each position in
1087    * order to assemble 'aligned' protein sequences.
1088    *
1089    * @param protein
1090    * the protein alignment
1091    * @param dna
1092    * the coding dna alignment
1093    * @param unmappedProtein
1094    * any unmapped proteins are added to this list
1095    * @return
1096    */
 
1097  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1098    AlignmentI protein, AlignmentI dna,
1099    List<SequenceI> unmappedProtein)
1100    {
1101    /*
1102    * maintain a list of any proteins with no mappings - these will be
1103    * rendered 'as is' in the protein alignment as we can't align them
1104    */
1105  5 unmappedProtein.addAll(protein.getSequences());
1106   
1107  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1108   
1109    /*
1110    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1111    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1112    * comparator keeps the codon positions ordered.
1113    */
1114  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1115    new CodonComparator());
1116   
1117  5 for (SequenceI dnaSeq : dna.getSequences())
1118    {
1119  30 for (AlignedCodonFrame mapping : mappings)
1120    {
1121  516 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1122  516 if (prot != null)
1123    {
1124  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1125  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1126    alignedCodons);
1127  32 unmappedProtein.remove(prot);
1128    }
1129    }
1130    }
1131   
1132    /*
1133    * Finally add any unmapped peptide start residues (e.g. for incomplete
1134    * codons) as if at the codon position before the second residue
1135    */
1136    // TODO resolve JAL-2022 so this fudge can be removed
1137  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1138  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1139   
1140  5 return alignedCodons;
1141    }
1142   
1143    /**
1144    * Scans for any protein mapped from position 2 (meaning unmapped start
1145    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1146    * preceding position in the alignment
1147    *
1148    * @param alignedCodons
1149    * the codon-to-peptide map
1150    * @param mappedSequenceCount
1151    * the number of distinct sequences in the map
1152    */
 
1153  5 toggle protected static void addUnmappedPeptideStarts(
1154    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1155    int mappedSequenceCount)
1156    {
1157    // TODO delete this ugly hack once JAL-2022 is resolved
1158    // i.e. we can model startPhase > 0 (incomplete start codon)
1159   
1160  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1161  5 AlignedCodon lastCodon = null;
1162  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1163   
1164  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1165    .entrySet())
1166    {
1167  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1168    .entrySet())
1169    {
1170  10671 SequenceI seq = sequenceCodon.getKey();
1171  10671 if (sequencesChecked.contains(seq))
1172    {
1173  10641 continue;
1174    }
1175  30 sequencesChecked.add(seq);
1176  30 AlignedCodon codon = sequenceCodon.getValue();
1177  30 if (codon.peptideCol > 1)
1178    {
1179  0 jalview.bin.Console.errPrintln(
1180    "Problem mapping protein with >1 unmapped start positions: "
1181    + seq.getName());
1182    }
1183  30 else if (codon.peptideCol == 1)
1184    {
1185    /*
1186    * first position (peptideCol == 0) was unmapped - add it
1187    */
1188  6 if (lastCodon != null)
1189    {
1190  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1191    lastCodon.pos2, lastCodon.pos3,
1192    String.valueOf(seq.getCharAt(0)), 0);
1193  5 toAdd.put(seq, firstPeptide);
1194    }
1195    else
1196    {
1197    /*
1198    * unmapped residue at start of alignment (no prior column) -
1199    * 'insert' at nominal codon [0, 0, 0]
1200    */
1201  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1202    String.valueOf(seq.getCharAt(0)), 0);
1203  1 toAdd.put(seq, firstPeptide);
1204    }
1205    }
1206  30 if (sequencesChecked.size() == mappedSequenceCount)
1207    {
1208    // no need to check past first mapped position in all sequences
1209  5 break;
1210    }
1211    }
1212  1913 lastCodon = entry.getKey();
1213    }
1214   
1215    /*
1216    * add any new codons safely after iterating over the map
1217    */
1218  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1219    {
1220  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1221    startCodon.getKey());
1222    }
1223    }
1224   
1225    /**
1226    * Update the aligned protein sequences to match the codon alignments given in
1227    * the map.
1228    *
1229    * @param protein
1230    * @param alignedCodons
1231    * an ordered map of codon positions (columns), with sequence/peptide
1232    * values present in each column
1233    * @param unmappedProtein
1234    * @return
1235    */
 
1236  5 toggle protected static int alignProteinAs(AlignmentI protein,
1237    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1238    List<SequenceI> unmappedProtein)
1239    {
1240    /*
1241    * prefill peptide sequences with gaps
1242    */
1243  5 int alignedWidth = alignedCodons.size();
1244  5 char[] gaps = new char[alignedWidth];
1245  5 Arrays.fill(gaps, protein.getGapCharacter());
1246  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1247  5 for (SequenceI seq : protein.getSequences())
1248    {
1249  31 if (!unmappedProtein.contains(seq))
1250    {
1251  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1252    }
1253    }
1254   
1255    /*
1256    * Traverse the codons left to right (as defined by CodonComparator)
1257    * and insert peptides in each column where the sequence is mapped.
1258    * This gives a peptide 'alignment' where residues are aligned if their
1259    * corresponding codons occupy the same columns in the cdna alignment.
1260    */
1261  5 int column = 0;
1262  5 for (AlignedCodon codon : alignedCodons.keySet())
1263    {
1264  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1265    .get(codon);
1266  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1267    {
1268  10682 char residue = entry.getValue().product.charAt(0);
1269  10682 peptides.get(entry.getKey())[column] = residue;
1270    }
1271  1914 column++;
1272    }
1273   
1274    /*
1275    * and finally set the constructed sequences
1276    */
1277  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1278    {
1279  30 entry.getKey().setSequence(new String(entry.getValue()));
1280    }
1281   
1282  5 return 0;
1283    }
1284   
1285    /**
1286    * Populate the map of aligned codons by traversing the given sequence
1287    * mapping, locating the aligned positions of mapped codons, and adding those
1288    * positions and their translation products to the map.
1289    *
1290    * @param dna
1291    * the aligned sequence we are mapping from
1292    * @param protein
1293    * the sequence to be aligned to the codons
1294    * @param gapChar
1295    * the gap character in the dna sequence
1296    * @param seqMap
1297    * a mapping to a sequence translation
1298    * @param alignedCodons
1299    * the map we are building up
1300    */
 
1301  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1302    char gapChar, Mapping seqMap,
1303    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1304    {
1305  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1306   
1307    /*
1308    * add codon positions, and their peptide translations, to the alignment
1309    * map, while remembering the first codon mapped
1310    */
1311  10716 while (codons.hasNext())
1312    {
1313  10684 try
1314    {
1315  10684 AlignedCodon codon = codons.next();
1316  10684 addCodonToMap(alignedCodons, codon, protein);
1317    } catch (IncompleteCodonException e)
1318    {
1319    // possible incomplete trailing codon - ignore
1320    } catch (NoSuchElementException e)
1321    {
1322    // possibly peptide lacking STOP
1323    }
1324    }
1325    }
1326   
1327    /**
1328    * Helper method to add a codon-to-peptide entry to the aligned codons map
1329    *
1330    * @param alignedCodons
1331    * @param codon
1332    * @param protein
1333    */
 
1334  10690 toggle protected static void addCodonToMap(
1335    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1336    AlignedCodon codon, SequenceI protein)
1337    {
1338  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1339  10690 if (seqProduct == null)
1340    {
1341  1914 seqProduct = new HashMap<>();
1342  1914 alignedCodons.put(codon, seqProduct);
1343    }
1344  10690 seqProduct.put(protein, codon);
1345    }
1346   
1347    /**
1348    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1349    * between at least one pair of sequences in the two alignments. Currently,
1350    * the logic is:
1351    * <ul>
1352    * <li>One alignment must be nucleotide, and the other protein</li>
1353    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1354    * <li>Mappable means the nucleotide translation matches the protein
1355    * sequence</li>
1356    * <li>The translation may ignore start and stop codons if present in the
1357    * nucleotide</li>
1358    * </ul>
1359    *
1360    * @param al1
1361    * @param al2
1362    * @return
1363    */
 
1364  28 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1365    {
1366  28 if (al1 == null || al2 == null)
1367    {
1368  3 return false;
1369    }
1370   
1371    /*
1372    * Require one nucleotide and one protein
1373    */
1374  25 if (al1.isNucleotide() == al2.isNucleotide())
1375    {
1376  14 return false;
1377    }
1378  11 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1379  11 AlignmentI protein = dna == al1 ? al2 : al1;
1380  11 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1381  11 for (SequenceI dnaSeq : dna.getSequences())
1382    {
1383  12 for (SequenceI proteinSeq : protein.getSequences())
1384    {
1385  12 if (isMappable(dnaSeq, proteinSeq, mappings))
1386    {
1387  2 return true;
1388    }
1389    }
1390    }
1391  9 return false;
1392    }
1393   
1394    /**
1395    * Returns true if the dna sequence is mapped, or could be mapped, to the
1396    * protein sequence.
1397    *
1398    * @param dnaSeq
1399    * @param proteinSeq
1400    * @param mappings
1401    * @return
1402    */
 
1403  12 toggle protected static boolean isMappable(SequenceI dnaSeq,
1404    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1405    {
1406  12 if (dnaSeq == null || proteinSeq == null)
1407    {
1408  0 return false;
1409    }
1410   
1411  12 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1412    : dnaSeq.getDatasetSequence();
1413  12 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1414    ? proteinSeq
1415    : proteinSeq.getDatasetSequence();
1416   
1417  12 for (AlignedCodonFrame mapping : mappings)
1418    {
1419  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1420    {
1421    /*
1422    * already mapped
1423    */
1424  0 return true;
1425    }
1426    }
1427   
1428    /*
1429    * Just try to make a mapping (it is not yet stored), test whether
1430    * successful.
1431    */
1432  12 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1433    }
1434   
1435    /**
1436    * Finds any reference annotations associated with the sequences in
1437    * sequenceScope, that are not already added to the alignment, and adds them
1438    * to the 'candidates' map. Also populates a lookup table of annotation
1439    * labels, keyed by calcId, for use in constructing tooltips or the like.
1440    *
1441    * @param sequenceScope
1442    * the sequences to scan for reference annotations
1443    * @param labelForCalcId
1444    * (optional) map to populate with label for calcId
1445    * @param candidates
1446    * map to populate with annotations for sequence
1447    * @param al
1448    * the alignment to check for presence of annotations
1449    */
 
1450  103 toggle public static void findAddableReferenceAnnotations(
1451    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1452    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1453    AlignmentI al)
1454    {
1455  103 if (sequenceScope == null)
1456    {
1457  1 return;
1458    }
1459   
1460    /*
1461    * For each sequence in scope, make a list of any annotations on the
1462    * underlying dataset sequence which are not already on the alignment.
1463    *
1464    * Add to a map of { alignmentSequence, <List of annotations to add> }
1465    */
1466  102 for (SequenceI seq : sequenceScope)
1467    {
1468  100 SequenceI dataset = seq.getDatasetSequence();
1469  100 if (dataset == null)
1470    {
1471  0 continue;
1472    }
1473  100 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1474  100 if (datasetAnnotations == null)
1475    {
1476  35 continue;
1477    }
1478  65 final List<AlignmentAnnotation> result = new ArrayList<>();
1479  65 for (AlignmentAnnotation dsann : datasetAnnotations)
1480    {
1481    /*
1482    * Find matching annotations on the alignment. If none is found, then
1483    * add this annotation to the list of 'addable' annotations for this
1484    * sequence.
1485    */
1486  221 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1487    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1488  221 boolean found = false;
1489  221 if (matchedAlignmentAnnotations != null)
1490    {
1491  218 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1492    {
1493  176 if (dsann.description.equals(matched.description))
1494    {
1495  74 found = true;
1496  74 break;
1497    }
1498    }
1499    }
1500  221 if (!found)
1501    {
1502  147 result.add(dsann);
1503  147 if (labelForCalcId != null)
1504    {
1505  14 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1506    }
1507    }
1508    }
1509    /*
1510    * Save any addable annotations for this sequence
1511    */
1512  65 if (!result.isEmpty())
1513    {
1514  61 candidates.put(seq, result);
1515    }
1516    }
1517    }
1518   
1519    /**
1520    * Adds annotations to the top of the alignment annotations, in the same order
1521    * as their related sequences. If you already have an annotation and want to
1522    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1523    *
1524    * @param annotations
1525    * the annotations to add
1526    * @param alignment
1527    * the alignment to add them to
1528    * @param selectionGroup
1529    * current selection group - may be null, if provided then any added
1530    * annotation will be trimmed to just those columns in the selection
1531    * group
1532    */
 
1533  57 toggle public static void addReferenceAnnotations(
1534    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1535    final AlignmentI alignment, final SequenceGroup selectionGroup)
1536    {
1537  57 for (SequenceI seq : annotations.keySet())
1538    {
1539  57 for (AlignmentAnnotation ann : annotations.get(seq))
1540    {
1541  141 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1542    }
1543    }
1544    }
1545   
1546   
1547    /**
1548    * Add all reference annnotations to everything in the alignment.
1549    * @param al
1550    */
 
1551  7 toggle public static void addAllReferenceAnnotations(AlignmentI al) {
1552  7 SortedMap<String, String> tipEntries = new TreeMap<>();
1553  7 Map<SequenceI, List<AlignmentAnnotation>> candidates = new LinkedHashMap<>();
1554   
1555  7 AlignmentUtils.findAddableReferenceAnnotations(al.getSequences(),
1556    tipEntries, candidates, al);
1557  7 AlignmentUtils.addReferenceAnnotations(candidates, al, null);
1558    }
1559   
1560   
 
1561  4 toggle public static boolean isSSAnnotationPresent(
1562    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1563    {
1564   
1565  4 for (SequenceI seq : annotations.keySet())
1566    {
1567  3 if (isSecondaryStructurePresent(
1568    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1569    {
1570  2 return true;
1571    }
1572    }
1573  2 return false;
1574    }
1575   
1576    /**
1577    * Make a copy of a reference annotation {@code ann} and add it to an
1578    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1579    * the extent of {@code selectionGroup}
1580    *
1581    * @param alignment
1582    * @param seq
1583    * @param ann
1584    * @param selectionGroup
1585    * current selection group - may be null, if provided then any added
1586    * annotation will be trimmed to just those columns in the selection
1587    * group
1588    * @return annotation added to {@code seq and {@code alignment}
1589    */
 
1590  145 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1591    final AlignmentI alignment, final SequenceI seq,
1592    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1593    {
1594  145 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1595  145 int startRes = 0;
1596  145 int endRes = ann.annotations.length;
1597  145 if (selectionGroup != null)
1598    {
1599  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1600    seq.findPosition(selectionGroup.getStartRes())));
1601  2 endRes = -1 + Math.min(seq.getEnd(),
1602    seq.findPosition(selectionGroup.getEndRes()));
1603   
1604    }
1605  145 copyAnn.restrict(startRes, endRes + 0);
1606   
1607    /*
1608    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1609    * original annotation is already on the sequence.
1610    */
1611  145 if (!seq.hasAnnotation(ann))
1612    {
1613  145 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1614  145 if (cm != null)
1615    {
1616  37 seq.addContactListFor(copyAnn, cm);
1617    }
1618  145 seq.addAlignmentAnnotation(copyAnn);
1619    }
1620    // adjust for gaps
1621  145 copyAnn.adjustForAlignment();
1622    // add to the alignment and set visible
1623  145 alignment.addAnnotation(copyAnn);
1624  145 copyAnn.visible = true;
1625   
1626  145 return copyAnn;
1627    }
1628   
1629    /**
1630    * Set visibility of alignment annotations of specified types (labels), for
1631    * specified sequences. This supports controls like "Show all secondary
1632    * structure", "Hide all Temp factor", etc.
1633    *
1634    * @al the alignment to scan for annotations
1635    * @param types
1636    * the types (labels) of annotations to be updated
1637    * @param forSequences
1638    * if not null, only annotations linked to one of these sequences are
1639    * in scope for update; if null, acts on all sequence annotations
1640    * @param anyType
1641    * if this flag is true, 'types' is ignored (label not checked)
1642    * @param doShow
1643    * if true, set visibility on, else set off
1644    */
 
1645  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1646    Collection<String> types, List<SequenceI> forSequences,
1647    boolean anyType, boolean doShow)
1648    {
1649  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1650  5 if (anns != null)
1651    {
1652  5 for (AlignmentAnnotation aa : anns)
1653    {
1654  30 if (anyType || types.contains(aa.label))
1655    {
1656  21 if ((aa.sequenceRef != null) && (forSequences == null
1657    || forSequences.contains(aa.sequenceRef)))
1658    {
1659  11 aa.visible = doShow;
1660    }
1661    }
1662    }
1663    }
1664    }
1665   
1666    /**
1667    * Shows or hides auto calculated annotations for a sequence group.
1668    *
1669    * @param al
1670    * The alignment object with the annotations.
1671    * @param type
1672    * The type of annotation to show or hide.
1673    * @param selectedGroup
1674    * The sequence group for which the annotations should be shown or
1675    * hidden.
1676    * @param anyType
1677    * If true, all types of annotations will be shown/hidden.
1678    * @param doShow
1679    * If true, the annotations will be shown; if false, annotations will
1680    * be hidden.
1681    */
 
1682  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1683    AlignmentI al, String type, SequenceGroup selectedGroup,
1684    boolean anyType, boolean doShow)
1685    {
1686    // Get all alignment annotations
1687  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1688   
1689  0 if (anns != null)
1690    {
1691  0 for (AlignmentAnnotation aa : anns)
1692    {
1693    // Check if anyType is true or if the annotation's label contains the
1694    // specified type (currently for secondary structure consensus)
1695  0 if ((anyType && aa.label
1696    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1697    || aa.label.startsWith(type))
1698    {
1699    // If the annotation's group reference is not null and matches the
1700    // selected group, update its visibility.
1701  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1702    {
1703  0 aa.visible = doShow;
1704    }
1705    }
1706    }
1707    }
1708    }
1709   
 
1710  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1711    AlignmentI al, int graphType)
1712    {
1713  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1714  0 if (anns != null)
1715    {
1716  0 for (AlignmentAnnotation aa : anns)
1717    {
1718  0 if (aa.sequenceRef != null && aa.graph == graphType)
1719  0 return aa;
1720    }
1721    }
1722  0 return null;
1723    }
1724   
1725    /**
1726    * Returns true if either sequence has a cross-reference to the other
1727    *
1728    * @param seq1
1729    * @param seq2
1730    * @return
1731    */
 
1732  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1733    {
1734    // Note: moved here from class CrossRef as the latter class has dependencies
1735    // not availability to the applet's classpath
1736  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1737    }
1738   
1739    /**
1740    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1741    * that sequence name is structured as Source|AccessionId.
1742    *
1743    * @param seq1
1744    * @param seq2
1745    * @return
1746    */
 
1747  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1748    {
1749  108 if (seq1 == null || seq2 == null)
1750    {
1751  8 return false;
1752    }
1753  100 String name = seq2.getName();
1754  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1755  100 if (xrefs != null)
1756    {
1757  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1758    {
1759  24 DBRefEntry xref = xrefs.get(ix);
1760  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1761    // case-insensitive test, consistent with DBRefEntry.equalRef()
1762  24 if (xrefName.equalsIgnoreCase(name))
1763    {
1764  12 return true;
1765    }
1766    }
1767    }
1768  88 return false;
1769    }
1770   
1771    /**
1772    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1773    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1774    * added to the original alignment's dataset, which is shared by the new
1775    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1776    * added to the alignment dataset.
1777    *
1778    * @param dna
1779    * aligned nucleotide (dna or cds) sequences
1780    * @param dataset
1781    * the alignment dataset the sequences belong to
1782    * @param products
1783    * (optional) to restrict results to CDS that map to specified
1784    * protein products
1785    * @return an alignment whose sequences are the cds-only parts of the dna
1786    * sequences (or null if no mappings are found)
1787    */
 
1788  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1789    AlignmentI dataset, SequenceI[] products)
1790    {
1791  6 if (dataset == null || dataset.getDataset() != null)
1792    {
1793  0 throw new IllegalArgumentException(
1794    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1795    }
1796  6 List<SequenceI> foundSeqs = new ArrayList<>();
1797  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1798  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1799  6 HashSet<SequenceI> productSeqs = null;
1800  6 if (products != null)
1801    {
1802  3 productSeqs = new HashSet<>();
1803  3 for (SequenceI seq : products)
1804    {
1805  24 productSeqs.add(seq.getDatasetSequence() == null ? seq
1806    : seq.getDatasetSequence());
1807    }
1808    }
1809   
1810    /*
1811    * Construct CDS sequences from mappings on the alignment dataset.
1812    * The logic is:
1813    * - find the protein product(s) mapped to from each dna sequence
1814    * - if the mapping covers the whole dna sequence (give or take start/stop
1815    * codon), take the dna as the CDS sequence
1816    * - else search dataset mappings for a suitable dna sequence, i.e. one
1817    * whose whole sequence is mapped to the protein
1818    * - if no sequence found, construct one from the dna sequence and mapping
1819    * (and add it to dataset so it is found if this is repeated)
1820    */
1821  6 for (SequenceI dnaSeq : dna)
1822    {
1823  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1824    : dnaSeq.getDatasetSequence();
1825   
1826  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1827    .findMappingsForSequence(dnaSeq, mappings);
1828  52 for (AlignedCodonFrame mapping : seqMappings)
1829    {
1830  42 List<Mapping> mappingsFromSequence = mapping
1831    .getMappingsFromSequence(dnaSeq);
1832   
1833  42 for (Mapping aMapping : mappingsFromSequence)
1834    {
1835  44 MapList mapList = aMapping.getMap();
1836  44 if (mapList.getFromRatio() == 1)
1837    {
1838    /*
1839    * not a dna-to-protein mapping (likely dna-to-cds)
1840    */
1841  11 continue;
1842    }
1843   
1844    /*
1845    * skip if mapping is not to one of the target set of proteins
1846    */
1847  33 SequenceI proteinProduct = aMapping.getTo();
1848  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1849    {
1850  2 continue;
1851    }
1852   
1853    /*
1854    * try to locate the CDS from the dataset mappings;
1855    * guard against duplicate results (for the case that protein has
1856    * dbrefs to both dna and cds sequences)
1857    */
1858  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1859    seqMappings, aMapping);
1860  31 if (cdsSeq != null)
1861    {
1862  11 if (!foundSeqs.contains(cdsSeq))
1863    {
1864  11 foundSeqs.add(cdsSeq);
1865  11 SequenceI derivedSequence = cdsSeq.deriveSequence();
1866  11 cdsSeqs.add(derivedSequence);
1867  11 if (!dataset.getSequences().contains(cdsSeq))
1868    {
1869  0 dataset.addSequence(cdsSeq);
1870    }
1871    }
1872  11 continue;
1873    }
1874   
1875    /*
1876    * didn't find mapped CDS sequence - construct it and add
1877    * its dataset sequence to the dataset
1878    */
1879  20 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1880    dataset).deriveSequence();
1881    // cdsSeq has a name constructed as CDS|<dbref>
1882    // <dbref> will be either the accession for the coding sequence,
1883    // marked in the /via/ dbref to the protein product accession
1884    // or it will be the original nucleotide accession.
1885  20 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1886   
1887  20 cdsSeqs.add(cdsSeq);
1888   
1889    /*
1890    * build the mapping from CDS to protein
1891    */
1892  20 List<int[]> cdsRange = Collections
1893    .singletonList(new int[]
1894    { cdsSeq.getStart(),
1895    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1896  20 MapList cdsToProteinMap = new MapList(cdsRange,
1897    mapList.getToRanges(), mapList.getFromRatio(),
1898    mapList.getToRatio());
1899   
1900  20 if (!dataset.getSequences().contains(cdsSeqDss))
1901    {
1902    /*
1903    * if this sequence is a newly created one, add it to the dataset
1904    * and made a CDS to protein mapping (if sequence already exists,
1905    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1906    */
1907  20 dataset.addSequence(cdsSeqDss);
1908  20 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1909  20 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1910    cdsToProteinMap);
1911   
1912    /*
1913    * guard against duplicating the mapping if repeating this action
1914    */
1915  20 if (!mappings.contains(cdsToProteinMapping))
1916    {
1917  20 mappings.add(cdsToProteinMapping);
1918    }
1919    }
1920   
1921  20 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1922    proteinProduct, aMapping);
1923    /*
1924    * add another mapping from original 'from' range to CDS
1925    */
1926  20 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1927  20 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1928    cdsRange, 1, 1);
1929  20 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1930    dnaToCdsMap);
1931  20 if (!mappings.contains(dnaToCdsMapping))
1932    {
1933  20 mappings.add(dnaToCdsMapping);
1934    }
1935   
1936    /*
1937    * transfer dna chromosomal loci (if known) to the CDS
1938    * sequence (via the mapping)
1939    */
1940  20 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1941  20 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1942   
1943    /*
1944    * add DBRef with mapping from protein to CDS
1945    * (this enables Get Cross-References from protein alignment)
1946    * This is tricky because we can't have two DBRefs with the
1947    * same source and accession, so need a different accession for
1948    * the CDS from the dna sequence
1949    */
1950   
1951    // specific use case:
1952    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1953    // ENSG02, ENSG03, with transcripts and products similarly named.
1954    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1955   
1956    // JBPNote: ?? can't actually create an example that demonstrates we
1957    // need to
1958    // synthesize an xref.
1959   
1960  20 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1961  33 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1962    {
1963  13 DBRefEntry primRef = primrefs.get(ip);
1964    /*
1965    * create a cross-reference from CDS to the source sequence's
1966    * primary reference and vice versa
1967    */
1968  13 String source = primRef.getSource();
1969  13 String version = primRef.getVersion();
1970  13 DBRefEntry cdsCrossRef = new DBRefEntry(source,
1971    source + ":" + version, primRef.getAccessionId());
1972  13 cdsCrossRef
1973    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1974  13 cdsSeqDss.addDBRef(cdsCrossRef);
1975   
1976  13 dnaSeq.addDBRef(new DBRefEntry(source, version,
1977    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1978    // problem here is that the cross-reference is synthesized -
1979    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1980    // 'CDS|emblcdsacc'
1981    // assuming cds version same as dna ?!?
1982   
1983  13 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1984    cdsSeq.getName());
1985    //
1986  13 proteinToCdsRef.setMap(
1987    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
1988  13 proteinProduct.addDBRef(proteinToCdsRef);
1989    }
1990    /*
1991    * transfer any features on dna that overlap the CDS
1992    */
1993  20 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1994    SequenceOntologyI.CDS);
1995    }
1996    }
1997    }
1998   
1999  6 AlignmentI cds = new Alignment(
2000    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
2001  6 cds.setDataset(dataset);
2002   
2003  6 return cds;
2004    }
2005   
2006    /**
2007    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
2008    * toSeq, mediated by the given mapping between the sequences
2009    *
2010    * @param fromSeq
2011    * @param targetToFrom
2012    * Map
2013    * @param targetSeq
2014    */
 
2015  23 toggle protected static void transferGeneLoci(SequenceI fromSeq,
2016    MapList targetToFrom, SequenceI targetSeq)
2017    {
2018  23 if (targetSeq.getGeneLoci() != null)
2019    {
2020    // already have - don't override
2021  1 return;
2022    }
2023  22 GeneLociI fromLoci = fromSeq.getGeneLoci();
2024  22 if (fromLoci == null)
2025    {
2026  10 return;
2027    }
2028   
2029  12 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2030   
2031  12 if (newMap != null)
2032    {
2033  12 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2034    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2035    }
2036    }
2037   
2038    /**
2039    * A helper method that finds a CDS sequence in the alignment dataset that is
2040    * mapped to the given protein sequence, and either is, or has a mapping from,
2041    * the given dna sequence.
2042    *
2043    * @param mappings
2044    * set of all mappings on the dataset
2045    * @param dnaSeq
2046    * a dna (or cds) sequence we are searching from
2047    * @param seqMappings
2048    * the set of mappings involving dnaSeq
2049    * @param aMapping
2050    * a transcript-to-peptide mapping
2051    * @return
2052    */
 
2053  38 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2054    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2055    Mapping aMapping)
2056    {
2057    /*
2058    * TODO a better dna-cds-protein mapping data representation to allow easy
2059    * navigation; until then this clunky looping around lists of mappings
2060    */
2061  38 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2062    : dnaSeq.getDatasetSequence();
2063  38 SequenceI proteinProduct = aMapping.getTo();
2064   
2065    /*
2066    * is this mapping from the whole dna sequence (i.e. CDS)?
2067    * allowing for possible stop codon on dna but not peptide
2068    */
2069  38 int mappedFromLength = MappingUtils
2070    .getLength(aMapping.getMap().getFromRanges());
2071  38 int dnaLength = seqDss.getLength();
2072  38 if (mappedFromLength == dnaLength
2073    || mappedFromLength == dnaLength - CODON_LENGTH)
2074    {
2075    /*
2076    * if sequence has CDS features, this is a transcript with no UTR
2077    * - do not take this as the CDS sequence! (JAL-2789)
2078    */
2079  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2080    .isEmpty())
2081    {
2082  1 return seqDss;
2083    }
2084    }
2085   
2086    /*
2087    * looks like we found the dna-to-protein mapping; search for the
2088    * corresponding cds-to-protein mapping
2089    */
2090  37 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2091    .findMappingsForSequence(proteinProduct, mappings);
2092  37 for (AlignedCodonFrame acf : mappingsToPeptide)
2093    {
2094  52 for (SequenceToSequenceMapping map : acf.getMappings())
2095    {
2096  276 Mapping mapping = map.getMapping();
2097  276 if (mapping != aMapping
2098    && mapping.getMap().getFromRatio() == CODON_LENGTH
2099    && proteinProduct == mapping.getTo()
2100    && seqDss != map.getFromSeq())
2101    {
2102  15 mappedFromLength = MappingUtils
2103    .getLength(mapping.getMap().getFromRanges());
2104  15 if (mappedFromLength == map.getFromSeq().getLength())
2105    {
2106    /*
2107    * found a 3:1 mapping to the protein product which covers
2108    * the whole dna sequence i.e. is from CDS; finally check the CDS
2109    * is mapped from the given dna start sequence
2110    */
2111  15 SequenceI cdsSeq = map.getFromSeq();
2112    // todo this test is weak if seqMappings contains multiple mappings;
2113    // we get away with it if transcript:cds relationship is 1:1
2114  15 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2115    .findMappingsForSequence(cdsSeq, seqMappings);
2116  15 if (!dnaToCdsMaps.isEmpty())
2117    {
2118  13 return cdsSeq;
2119    }
2120    }
2121    }
2122    }
2123    }
2124  24 return null;
2125    }
2126   
2127    /**
2128    * Helper method that makes a CDS sequence as defined by the mappings from the
2129    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2130    * forward or reverse strand).
2131    *
2132    * @param seq
2133    * @param mapping
2134    * @param dataset
2135    * - existing dataset. We check for sequences that look like the CDS
2136    * we are about to construct, if one exists already, then we will
2137    * just return that one.
2138    * @return CDS sequence (as a dataset sequence)
2139    */
 
2140  20 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2141    AlignmentI dataset)
2142    {
2143    /*
2144    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2145    * if set (e.g. EMBL protein_id), else sequence name appended
2146    */
2147  20 String mapFromId = mapping.getMappedFromId();
2148  20 final String seqId = "CDS|"
2149  20 + (mapFromId != null ? mapFromId : seq.getName());
2150   
2151  20 SequenceI newSeq = null;
2152   
2153    /*
2154    * construct CDS sequence by splicing mapped from ranges
2155    */
2156  20 char[] seqChars = seq.getSequence();
2157  20 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2158  20 int cdsWidth = MappingUtils.getLength(fromRanges);
2159  20 char[] newSeqChars = new char[cdsWidth];
2160   
2161  20 int newPos = 0;
2162  20 for (int[] range : fromRanges)
2163    {
2164  32 if (range[0] <= range[1])
2165    {
2166    // forward strand mapping - just copy the range
2167  32 int length = range[1] - range[0] + 1;
2168  32 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2169    length);
2170  32 newPos += length;
2171    }
2172    else
2173    {
2174    // reverse strand mapping - copy and complement one by one
2175  0 for (int i = range[0]; i >= range[1]; i--)
2176    {
2177  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2178    }
2179    }
2180   
2181  32 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2182    }
2183   
2184  20 if (dataset != null)
2185    {
2186  20 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2187  20 if (matches != null)
2188    {
2189  20 boolean matched = false;
2190  20 for (SequenceI mtch : matches)
2191    {
2192  3 if (mtch.getStart() != newSeq.getStart())
2193    {
2194  0 continue;
2195    }
2196  3 if (mtch.getEnd() != newSeq.getEnd())
2197    {
2198  0 continue;
2199    }
2200  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2201    {
2202  3 continue;
2203    }
2204  0 if (!matched)
2205    {
2206  0 matched = true;
2207  0 newSeq = mtch;
2208    }
2209    else
2210    {
2211  0 Console.error(
2212    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2213    + mtch.toString());
2214    }
2215    }
2216    }
2217    }
2218    // newSeq.setDescription(mapFromId);
2219   
2220  20 return newSeq;
2221    }
2222   
2223    /**
2224    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2225    * the given mapping.
2226    *
2227    * @param cdsSeq
2228    * @param contig
2229    * @param proteinProduct
2230    * @param mapping
2231    * @return list of DBRefEntrys added
2232    */
 
2233  20 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2234    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2235    {
2236   
2237    // gather direct refs from contig congruent with mapping
2238  20 List<DBRefEntry> direct = new ArrayList<>();
2239  20 HashSet<String> directSources = new HashSet<>();
2240   
2241  20 List<DBRefEntry> refs = contig.getDBRefs();
2242  20 if (refs != null)
2243    {
2244  292 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2245    {
2246  279 DBRefEntry dbr = refs.get(ib);
2247  279 MapList map;
2248  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2249    {
2250    // check if map is the CDS mapping
2251  24 if (mapping.getMap().equals(map))
2252    {
2253  24 direct.add(dbr);
2254  24 directSources.add(dbr.getSource());
2255    }
2256    }
2257    }
2258    }
2259  20 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2260    proteinProduct.getDBRefs(),
2261    directSources.toArray(new String[0]));
2262  20 List<DBRefEntry> propagated = new ArrayList<>();
2263   
2264    // and generate appropriate mappings
2265  44 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2266    {
2267  24 DBRefEntry cdsref = direct.get(ic);
2268  24 Mapping m = cdsref.getMap();
2269    // clone maplist and mapping
2270  24 MapList cdsposmap = new MapList(
2271    Arrays.asList(new int[][]
2272    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2273    m.getMap().getToRanges(), 3, 1);
2274  24 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2275   
2276    // create dbref
2277  24 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2278    cdsref.getVersion(), cdsref.getAccessionId(),
2279    new Mapping(cdsmap.getTo(), cdsposmap));
2280   
2281    // and see if we can map to the protein product for this mapping.
2282    // onSource is the filtered set of accessions on protein that we are
2283    // tranferring, so we assume accession is the same.
2284  24 if (cdsmap.getTo() == null && onSource != null)
2285    {
2286  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2287    cdsref.getAccessionId());
2288  2 if (sourceRefs != null)
2289    {
2290  2 for (DBRefEntry srcref : sourceRefs)
2291    {
2292  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2293    {
2294    // we have found a complementary dbref on the protein product, so
2295    // update mapping's getTo
2296  2 newref.getMap().setTo(proteinProduct);
2297    }
2298    }
2299    }
2300    }
2301  24 cdsSeq.addDBRef(newref);
2302  24 propagated.add(newref);
2303    }
2304  20 return propagated;
2305    }
2306   
2307    /**
2308    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2309    * feature start/end ranges, optionally omitting specified feature types.
2310    * Returns the number of features copied.
2311    *
2312    * @param fromSeq
2313    * @param toSeq
2314    * @param mapping
2315    * the mapping from 'fromSeq' to 'toSeq'
2316    * @param select
2317    * if not null, only features of this type are copied (including
2318    * subtypes in the Sequence Ontology)
2319    * @param omitting
2320    */
 
2321  23 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2322    MapList mapping, String select, String... omitting)
2323    {
2324  23 SequenceI copyTo = toSeq;
2325  43 while (copyTo.getDatasetSequence() != null)
2326    {
2327  20 copyTo = copyTo.getDatasetSequence();
2328    }
2329  23 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2330    {
2331  0 return 0; // shared dataset sequence
2332    }
2333   
2334    /*
2335    * get features, optionally restricted by an ontology term
2336    */
2337  23 List<SequenceFeature> sfs = select == null
2338    ? fromSeq.getFeatures().getPositionalFeatures()
2339    : fromSeq.getFeatures().getFeaturesByOntology(select);
2340   
2341  23 int count = 0;
2342  23 for (SequenceFeature sf : sfs)
2343    {
2344  9610 String type = sf.getType();
2345  9610 boolean omit = false;
2346  9610 for (String toOmit : omitting)
2347    {
2348  9603 if (type.equals(toOmit))
2349    {
2350  134 omit = true;
2351    }
2352    }
2353  9610 if (omit)
2354    {
2355  134 continue;
2356    }
2357   
2358    /*
2359    * locate the mapped range - null if either start or end is
2360    * not mapped (no partial overlaps are calculated)
2361    */
2362  9476 int start = sf.getBegin();
2363  9476 int end = sf.getEnd();
2364  9476 int[] mappedTo = mapping.locateInTo(start, end);
2365    /*
2366    * if whole exon range doesn't map, try interpreting it
2367    * as 5' or 3' exon overlapping the CDS range
2368    */
2369  9476 if (mappedTo == null)
2370    {
2371  4447 mappedTo = mapping.locateInTo(end, end);
2372  4447 if (mappedTo != null)
2373    {
2374    /*
2375    * end of exon is in CDS range - 5' overlap
2376    * to a range from the start of the peptide
2377    */
2378  0 mappedTo[0] = 1;
2379    }
2380    }
2381  9476 if (mappedTo == null)
2382    {
2383  4447 mappedTo = mapping.locateInTo(start, start);
2384  4447 if (mappedTo != null)
2385    {
2386    /*
2387    * start of exon is in CDS range - 3' overlap
2388    * to a range up to the end of the peptide
2389    */
2390  0 mappedTo[1] = toSeq.getLength();
2391    }
2392    }
2393  9476 if (mappedTo != null)
2394    {
2395  5029 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2396  5029 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2397  5029 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2398    sf.getFeatureGroup(), sf.getScore());
2399  5029 copyTo.addSequenceFeature(copy);
2400  5029 count++;
2401    }
2402    }
2403  23 return count;
2404    }
2405   
2406    /**
2407    * Returns a mapping from dna to protein by inspecting sequence features of
2408    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2409    * length is 3 times the peptide length (optionally after dropping a trailing
2410    * stop codon). This method does not check whether the CDS nucleotide sequence
2411    * translates to the peptide sequence.
2412    *
2413    * @param dnaSeq
2414    * @param proteinSeq
2415    * @return
2416    */
 
2417  22 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2418    SequenceI proteinSeq)
2419    {
2420  22 List<int[]> ranges = findCdsPositions(dnaSeq);
2421  22 int mappedDnaLength = MappingUtils.getLength(ranges);
2422   
2423    /*
2424    * if not a whole number of codons, truncate mapping
2425    */
2426  22 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2427  22 if (codonRemainder > 0)
2428    {
2429  2 mappedDnaLength -= codonRemainder;
2430  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2431    }
2432   
2433  22 int proteinLength = proteinSeq.getLength();
2434  22 int proteinStart = proteinSeq.getStart();
2435  22 int proteinEnd = proteinSeq.getEnd();
2436   
2437    /*
2438    * incomplete start codon may mean X at start of peptide
2439    * we ignore both for mapping purposes
2440    */
2441  22 if (proteinSeq.getCharAt(0) == 'X')
2442    {
2443    // todo JAL-2022 support startPhase > 0
2444  1 proteinStart++;
2445  1 proteinLength--;
2446    }
2447  22 List<int[]> proteinRange = new ArrayList<>();
2448   
2449    /*
2450    * dna length should map to protein (or protein plus stop codon)
2451    */
2452  22 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2453  22 if (codesForResidues == (proteinLength + 1))
2454    {
2455    // assuming extra codon is for STOP and not in peptide
2456    // todo: check trailing codon is indeed a STOP codon
2457  2 codesForResidues--;
2458  2 mappedDnaLength -= CODON_LENGTH;
2459  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2460    }
2461   
2462  22 if (codesForResidues == proteinLength)
2463    {
2464  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2465  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2466    }
2467  18 return null;
2468    }
2469   
2470    /**
2471    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2472    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2473    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2474    * position order, so this method is only valid for linear CDS in the same
2475    * sense as the protein product.
2476    *
2477    * @param dnaSeq
2478    * @return
2479    */
 
2480  24 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2481    {
2482  24 List<int[]> result = new ArrayList<>();
2483   
2484  24 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2485    .getFeaturesByOntology(SequenceOntologyI.CDS);
2486  24 if (sfs.isEmpty())
2487    {
2488  16 return result;
2489    }
2490  8 SequenceFeatures.sortFeatures(sfs, true);
2491   
2492  8 for (SequenceFeature sf : sfs)
2493    {
2494  16 int phase = 0;
2495  16 try
2496    {
2497  16 String s = sf.getPhase();
2498  16 if (s != null)
2499    {
2500  2 phase = Integer.parseInt(s);
2501    }
2502    } catch (NumberFormatException e)
2503    {
2504    // leave as zero
2505    }
2506    /*
2507    * phase > 0 on first codon means 5' incomplete - skip to the start
2508    * of the next codon; example ENST00000496384
2509    */
2510  16 int begin = sf.getBegin();
2511  16 int end = sf.getEnd();
2512  16 if (result.isEmpty() && phase > 0)
2513    {
2514  2 begin += phase;
2515  2 if (begin > end)
2516    {
2517    // shouldn't happen!
2518  0 System.err
2519    .println("Error: start phase extends beyond start CDS in "
2520    + dnaSeq.getName());
2521    }
2522    }
2523  16 result.add(new int[] { begin, end });
2524    }
2525   
2526    /*
2527    * Finally sort ranges by start position. This avoids a dependency on
2528    * keeping features in order on the sequence (if they are in order anyway,
2529    * the sort will have almost no work to do). The implicit assumption is CDS
2530    * ranges are assembled in order. Other cases should not use this method,
2531    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2532    */
2533  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2534  8 return result;
2535    }
2536   
2537    /**
2538    * Makes an alignment with a copy of the given sequences, adding in any
2539    * non-redundant sequences which are mapped to by the cross-referenced
2540    * sequences.
2541    *
2542    * @param seqs
2543    * @param xrefs
2544    * @param dataset
2545    * the alignment dataset shared by the new copy
2546    * @return
2547    */
 
2548  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2549    SequenceI[] xrefs, AlignmentI dataset)
2550    {
2551  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2552  0 copy.setDataset(dataset);
2553  0 boolean isProtein = !copy.isNucleotide();
2554  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2555  0 if (xrefs != null)
2556    {
2557    // BH 2019.01.25 recoded to remove iterators
2558   
2559  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2560    {
2561  0 SequenceI xref = xrefs[ix];
2562  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2563  0 if (dbrefs != null)
2564    {
2565  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2566    {
2567  0 DBRefEntry dbref = dbrefs.get(ir);
2568  0 Mapping map = dbref.getMap();
2569  0 SequenceI mto;
2570  0 if (map == null || (mto = map.getTo()) == null
2571    || mto.isProtein() != isProtein)
2572    {
2573  0 continue;
2574    }
2575  0 SequenceI mappedTo = mto;
2576  0 SequenceI match = matcher.findIdMatch(mappedTo);
2577  0 if (match == null)
2578    {
2579  0 matcher.add(mappedTo);
2580  0 copy.addSequence(mappedTo);
2581    }
2582    }
2583    }
2584    }
2585    }
2586  0 return copy;
2587    }
2588   
2589    /**
2590    * Try to align sequences in 'unaligned' to match the alignment of their
2591    * mapped regions in 'aligned'. For example, could use this to align CDS
2592    * sequences which are mapped to their parent cDNA sequences.
2593    *
2594    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2595    * dna-to-protein or protein-to-dna use alternative methods.
2596    *
2597    * @param unaligned
2598    * sequences to be aligned
2599    * @param aligned
2600    * holds aligned sequences and their mappings
2601    * @return
2602    */
 
2603  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2604    {
2605    /*
2606    * easy case - aligning a copy of aligned sequences
2607    */
2608  4 if (alignAsSameSequences(unaligned, aligned))
2609    {
2610  0 return unaligned.getHeight();
2611    }
2612   
2613    /*
2614    * fancy case - aligning via mappings between sequences
2615    */
2616  4 List<SequenceI> unmapped = new ArrayList<>();
2617  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2618    unaligned, aligned, unmapped);
2619  4 int width = columnMap.size();
2620  4 char gap = unaligned.getGapCharacter();
2621  4 int realignedCount = 0;
2622    // TODO: verify this loop scales sensibly for very wide/high alignments
2623   
2624  4 for (SequenceI seq : unaligned.getSequences())
2625    {
2626  26 if (!unmapped.contains(seq))
2627    {
2628  26 char[] newSeq = new char[width];
2629  26 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2630    // Integer iteration below
2631  26 int newCol = 0;
2632  26 int lastCol = 0;
2633   
2634    /*
2635    * traverse the map to find columns populated
2636    * by our sequence
2637    */
2638  26 for (Integer column : columnMap.keySet())
2639    {
2640  58976 Character c = columnMap.get(column).get(seq);
2641  58976 if (c != null)
2642    {
2643    /*
2644    * sequence has a character at this position
2645    *
2646    */
2647  31986 newSeq[newCol] = c;
2648  31986 lastCol = newCol;
2649    }
2650  58976 newCol++;
2651    }
2652   
2653    /*
2654    * trim trailing gaps
2655    */
2656  26 if (lastCol < width)
2657    {
2658  26 char[] tmp = new char[lastCol + 1];
2659  26 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2660  26 newSeq = tmp;
2661    }
2662    // TODO: optimise SequenceI to avoid char[]->String->char[]
2663  26 seq.setSequence(String.valueOf(newSeq));
2664  26 realignedCount++;
2665    }
2666    }
2667  4 return realignedCount;
2668    }
2669   
2670    /**
2671    * If unaligned and aligned sequences share the same dataset sequences, then
2672    * simply copies the aligned sequences to the unaligned sequences and returns
2673    * true; else returns false
2674    *
2675    * @param unaligned
2676    * - sequences to be aligned based on aligned
2677    * @param aligned
2678    * - 'guide' alignment containing sequences derived from same dataset
2679    * as unaligned
2680    * @return
2681    */
 
2682  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2683    AlignmentI aligned)
2684    {
2685  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2686    {
2687  0 return false; // should only pass alignments with datasets here
2688    }
2689   
2690    // map from dataset sequence to alignment sequence(s)
2691  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2692  8 for (SequenceI seq : aligned.getSequences())
2693    {
2694  59 SequenceI ds = seq.getDatasetSequence();
2695  59 if (alignedDatasets.get(ds) == null)
2696    {
2697  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2698    }
2699  59 alignedDatasets.get(ds).add(seq);
2700    }
2701   
2702    /*
2703    * first pass - check whether all sequences to be aligned share a
2704    * dataset sequence with an aligned sequence; also note the leftmost
2705    * ungapped column from which to copy
2706    */
2707  8 int leftmost = Integer.MAX_VALUE;
2708  8 for (SequenceI seq : unaligned.getSequences())
2709    {
2710  14 final SequenceI ds = seq.getDatasetSequence();
2711  14 if (!alignedDatasets.containsKey(ds))
2712    {
2713  5 return false;
2714    }
2715  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2716  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2717  9 leftmost = Math.min(leftmost, startCol);
2718    }
2719   
2720    /*
2721    * second pass - copy aligned sequences;
2722    * heuristic rule: pair off sequences in order for the case where
2723    * more than one shares the same dataset sequence
2724    */
2725  3 final char gapCharacter = aligned.getGapCharacter();
2726  3 for (SequenceI seq : unaligned.getSequences())
2727    {
2728  7 List<SequenceI> alignedSequences = alignedDatasets
2729    .get(seq.getDatasetSequence());
2730  7 if (alignedSequences.isEmpty())
2731    {
2732    /*
2733    * defensive check - shouldn't happen! (JAL-3536)
2734    */
2735  0 continue;
2736    }
2737  7 SequenceI alignedSeq = alignedSequences.get(0);
2738   
2739    /*
2740    * gap fill for leading (5') UTR if any
2741    */
2742    // TODO this copies intron columns - wrong!
2743  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2744  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2745  7 char[] seqchars = new char[endCol - leftmost + 1];
2746  7 Arrays.fill(seqchars, gapCharacter);
2747  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2748  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2749    toCopy.length);
2750  7 seq.setSequence(String.valueOf(seqchars));
2751  7 if (alignedSequences.size() > 0)
2752    {
2753    // pop off aligned sequences (except the last one)
2754  7 alignedSequences.remove(0);
2755    }
2756    }
2757   
2758    /*
2759    * finally remove gapped columns (e.g. introns)
2760    */
2761  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2762    unaligned.getWidth() - 1, unaligned);
2763   
2764  3 return true;
2765    }
2766   
2767    /**
2768    * Returns a map whose key is alignment column number (base 1), and whose
2769    * values are a map of sequence characters in that column.
2770    *
2771    * @param unaligned
2772    * @param aligned
2773    * @param unmapped
2774    * @return
2775    */
 
2776  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2777    AlignmentI unaligned, AlignmentI aligned,
2778    List<SequenceI> unmapped)
2779    {
2780    /*
2781    * Map will hold, for each aligned column position, a map of
2782    * {unalignedSequence, characterPerSequence} at that position.
2783    * TreeMap keeps the entries in ascending column order.
2784    */
2785  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2786   
2787    /*
2788    * record any sequences that have no mapping so can't be realigned
2789    */
2790  4 unmapped.addAll(unaligned.getSequences());
2791   
2792  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2793   
2794  4 for (SequenceI seq : unaligned.getSequences())
2795    {
2796  26 for (AlignedCodonFrame mapping : mappings)
2797    {
2798  510 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2799  510 if (fromSeq != null)
2800    {
2801  26 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2802  26 if (addMappedPositions(seq, fromSeq, seqMap, map))
2803    {
2804  26 unmapped.remove(seq);
2805    }
2806    }
2807    }
2808    }
2809  4 return map;
2810    }
2811   
2812    /**
2813    * Helper method that adds to a map the mapped column positions of a sequence.
2814    * <br>
2815    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2816    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2817    * sequence.
2818    *
2819    * @param seq
2820    * the sequence whose column positions we are recording
2821    * @param fromSeq
2822    * a sequence that is mapped to the first sequence
2823    * @param seqMap
2824    * the mapping from 'fromSeq' to 'seq'
2825    * @param map
2826    * a map to add the column positions (in fromSeq) of the mapped
2827    * positions of seq
2828    * @return
2829    */
 
2830  28 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2831    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2832    {
2833  28 if (seqMap == null)
2834    {
2835  0 return false;
2836    }
2837   
2838    /*
2839    * invert mapping if it is from unaligned to aligned sequence
2840    */
2841  28 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2842    {
2843  0 seqMap = new Mapping(seq.getDatasetSequence(),
2844    seqMap.getMap().getInverse());
2845    }
2846   
2847  28 int toStart = seq.getStart();
2848   
2849    /*
2850    * traverse [start, end, start, end...] ranges in fromSeq
2851    */
2852  28 for (int[] fromRange : seqMap.getMap().getFromRanges())
2853    {
2854  62 for (int i = 0; i < fromRange.length - 1; i += 2)
2855    {
2856  31 boolean forward = fromRange[i + 1] >= fromRange[i];
2857   
2858    /*
2859    * find the range mapped to (sequence positions base 1)
2860    */
2861  31 int[] range = seqMap.locateMappedRange(fromRange[i],
2862    fromRange[i + 1]);
2863  31 if (range == null)
2864    {
2865  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2866    + " from " + fromSeq.getName());
2867  0 return false;
2868    }
2869  31 int fromCol = fromSeq.findIndex(fromRange[i]);
2870  31 int mappedCharPos = range[0];
2871   
2872    /*
2873    * walk over the 'from' aligned sequence in forward or reverse
2874    * direction; when a non-gap is found, record the column position
2875    * of the next character of the mapped-to sequence; stop when all
2876    * the characters of the range have been counted
2877    */
2878  2794274 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2879    && fromCol >= 0)
2880    {
2881  2794243 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2882    {
2883    /*
2884    * mapped from sequence has a character in this column
2885    * record the column position for the mapped to character
2886    */
2887  31998 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2888  31998 if (seqsMap == null)
2889    {
2890  5398 seqsMap = new HashMap<>();
2891  5398 map.put(fromCol, seqsMap);
2892    }
2893  31998 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2894  31998 mappedCharPos++;
2895    }
2896  2794243 fromCol += (forward ? 1 : -1);
2897    }
2898    }
2899    }
2900  28 return true;
2901    }
2902   
2903    // strictly temporary hack until proper criteria for aligning protein to cds
2904    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2905  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2906    {
2907  4 for (SequenceI seq : alignment.getSequences())
2908    {
2909  88 String name = seq.getName();
2910  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2911    {
2912  0 return false;
2913    }
2914    }
2915  4 return true;
2916    }
2917   
2918    /**
2919    * This method filters and returns the secondary structure annotations
2920    * in an array of annotations
2921    *
2922    * @param annotations
2923    * @return array of secondary structure annotations
2924    */
 
2925  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnots(
2926    AlignmentAnnotation[] annotations)
2927    {
2928  0 List<AlignmentAnnotation> ssAnnotations = new ArrayList<>();
2929  0 if (annotations == null || annotations.length == 0)
2930    {
2931  0 return null;
2932    }
2933   
2934  0 synchronized (annotations)
2935    {
2936  0 for (AlignmentAnnotation aa : annotations)
2937    {
2938  0 if (aa == null)
2939    {
2940  0 continue;
2941    }
2942   
2943  0 if (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2944    .containsKey(aa.label))
2945    {
2946  0 ssAnnotations.add(aa);
2947    }
2948    }
2949    }
2950  0 return ssAnnotations;
2951   
2952    }
2953   
 
2954  19 toggle public static boolean isSecondaryStructurePresent(
2955    AlignmentAnnotation[] annotations)
2956    {
2957  19 boolean ssPresent = false;
2958  19 if (annotations == null || annotations.length == 0)
2959    {
2960  8 return false;
2961    }
2962   
2963  11 synchronized (annotations)
2964    {
2965  11 for (AlignmentAnnotation aa : annotations)
2966    {
2967  12 if (aa == null)
2968    {
2969  0 continue;
2970    }
2971    // TODO: @RENIA decide what exactly we should be testing for - not
2972    // strings/labels
2973  12 if (// aa.isValidStruc() || aa.hasIcons ||
2974  12 (aa.label != null && Constants.SECONDARY_STRUCTURE_LABELS
2975    .containsKey(aa.label)))
2976    {
2977  9 ssPresent = true;
2978  9 break;
2979    }
2980    }
2981    }
2982  11 return ssPresent;
2983   
2984    }
2985   
 
2986  4 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
2987    {
2988   
2989  4 if (symbol == Constants.COIL)
2990    {
2991  1 return Color.gray;
2992    }
2993  3 if (symbol == Constants.SHEET)
2994    {
2995  1 return Color.green;
2996    }
2997  2 if (symbol == Constants.HELIX)
2998    {
2999  1 return Color.red;
3000    }
3001   
3002  1 return Color.white;
3003    }
3004   
 
3005  255567 toggle public static char findSSAnnotationForGivenSeqposition(
3006    AlignmentAnnotation aa, int seqPosition)
3007    {
3008  255571 char ss = '*';
3009   
3010  255579 if (aa != null)
3011    {
3012  255615 if (aa.getAnnotationForPosition(seqPosition) != null)
3013    {
3014  116194 Annotation a = aa.getAnnotationForPosition(seqPosition);
3015  116186 ss = a.secondaryStructure;
3016   
3017    // There is no representation for coil and it can be either ' ' or null.
3018  116195 if (ss == ' ' || ss == '-')
3019    {
3020  15368 ss = Constants.COIL;
3021    }
3022    }
3023    else
3024    {
3025  139420 ss = Constants.COIL;
3026    }
3027    }
3028   
3029  255562 return ss;
3030    }
3031   
 
3032  5168 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
3033    AlignmentAnnotation[] annotations)
3034    {
3035   
3036  5168 List<String> ssSources = new ArrayList<>();
3037  5167 Set<String> addedSources = new HashSet<>(); // to keep track of added
3038    // sources
3039   
3040  5168 if (annotations == null)
3041    {
3042  12 return ssSources;
3043    }
3044   
3045  5156 for (AlignmentAnnotation aa : annotations)
3046    {
3047   
3048  27050 String ssSource = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(aa);
3049   
3050  27054 if (ssSource != null && !addedSources.contains(ssSource))
3051    {
3052  532 ssSources.add(ssSource);
3053  532 addedSources.add(ssSource);
3054    }
3055   
3056    }
3057  5156 Collections.sort(ssSources);
3058   
3059  5156 return ssSources;
3060   
3061    }
3062   
3063   
3064    /**
3065    * Returns secondary structure annotations from the given sequence for a specified source.
3066    * @param seq the sequence
3067    * @param ssSource the source label ("PDB", "JPred")
3068    * @return list of matching annotations
3069    */
 
3070  17934650 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3071    SequenceI seq, String ssSource)
3072    {
3073   
3074  17942689 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3075   
3076    // Iterate through all known secondary structure labels
3077  17968903 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3078    {
3079   
3080  35804171 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3081  35254252 if (aa != null)
3082    {
3083    // If source is "ALL", include all annotations for this label
3084  373466 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3085    {
3086  160101 ssAnnots.addAll(Arrays.asList(aa));
3087  160101 continue;
3088    }
3089   
3090  213392 for (AlignmentAnnotation annot : aa)
3091    {
3092  259240 String ssSourceForAnnot = AlignmentAnnotationUtils.extractSSSourceFromAnnotationDescription(
3093    annot);
3094    // Add annotations from the matching source
3095  259041 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot)
3096    && annot.isForDisplay())
3097    {
3098  158737 ssAnnots.add(annot);
3099    }
3100    }
3101    }
3102    }
3103  17794619 if (ssAnnots.size() > 0)
3104    {
3105  277668 return ssAnnots;
3106    }
3107   
3108  17666040 return null;
3109   
3110    }
3111   
 
3112  48 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3113    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3114    {
3115   
3116  48 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3117  48 if (alignAnnotList == null || alignAnnotList.length == 0)
3118    {
3119  0 return ssAlignmentAnnotationForSequences;
3120    }
3121   
3122  48 for (AlignmentAnnotation aa : alignAnnotList)
3123    {
3124  251 if (aa.sequenceRef == null)
3125    {
3126  32 continue;
3127    }
3128   
3129  219 if (isSecondaryStructureFrom(selectedSSSource, aa))
3130    {
3131  78 ssAlignmentAnnotationForSequences
3132    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3133    k -> new ArrayList<>())
3134    .add(aa);
3135    }
3136    }
3137   
3138  48 return ssAlignmentAnnotationForSequences;
3139   
3140    }
3141   
3142    /**
3143    * param alignAnnotation
3144    * @param selectedSSSource
3145    * @return any alignment annotation that matches the given source
3146    */
 
3147  0 toggle public static List<AlignmentAnnotation> getSecondaryStructureAnnotionFor(List<AlignmentAnnotation> alignAnnotation, String selectedSSSource)
3148    {
3149  0 ArrayList<AlignmentAnnotation> annForSource = new ArrayList();
3150  0 for (AlignmentAnnotation alan:alignAnnotation)
3151    {
3152  0 if (isSecondaryStructureFrom(selectedSSSource, alan)) {
3153  0 annForSource.add(alan);
3154    }
3155    }
3156  0 return annForSource;
3157    }
3158   
3159    /**
3160    *
3161    * @param selectedSSSource
3162    * @param aa
3163    * @return true if aa is from a provider or all providers as specified by
3164    * selectedSSSource
3165    */
 
3166  219 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3167    AlignmentAnnotation aa)
3168    {
3169   
3170  219 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3171    {
3172   
3173  376 if (label.equals(aa.label))
3174    {
3175   
3176  170 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3177    {
3178  38 return true;
3179    }
3180  132 String ssSource = AlignmentAnnotationUtils
3181    .extractSSSourceFromAnnotationDescription(aa);
3182  132 if (ssSource != null && ssSource.equals(selectedSSSource))
3183    {
3184  40 return true;
3185    }
3186    }
3187    }
3188  141 return false;
3189    }
3190   
3191    // Method to get the key for a given provider value
 
3192  0 toggle public static String getSecondaryStructureProviderKey(
3193    String providerValue)
3194    {
3195  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS
3196    .entrySet())
3197    {
3198  0 if (entry.getValue().equals(providerValue))
3199    {
3200  0 return entry.getKey(); // Return the key (abbreviation) for the matching
3201    // provider value
3202    }
3203    }
3204  0 return null; // Return null if no match is found
3205    }
3206   
 
3207  0 toggle public static String reduceLabelLength(String label)
3208    {
3209    // Split the input by " | "
3210  0 StringBuilder reducedLabel = new StringBuilder();
3211  0 for (String part:label.split(" \\| ")) {
3212  0 String nextLabel=part;
3213  0 for (Entry<String,String> provider:Constants.STRUCTURE_PROVIDERS.entrySet()) {
3214  0 if (part.equals(provider.getValue())) {
3215  0 nextLabel = provider.getKey();
3216    }
3217    }
3218  0 if (reducedLabel.length()>0)
3219    {
3220  0 reducedLabel.append(" | ");
3221    }
3222  0 reducedLabel.append(nextLabel);
3223    }
3224  0 return reducedLabel.toString();
3225    }
3226   
 
3227  542 toggle public static HashMap<String, Color> assignColorsForSecondaryStructureProviders(
3228    List<String> labels)
3229    {
3230    // Loop through each unique label and add it to the map with a colour.
3231    // for (String label : uniqueLabels)
3232   
3233  542 HashMap<String, Color> secondaryStructureProviderColorMap = new HashMap<String, Color>();
3234  542 for (String label : labels)
3235    {
3236    // Generate or retrieve a colour for the label.
3237  506 String name = label.toUpperCase(Locale.ROOT).trim();
3238  506 secondaryStructureProviderColorMap.put(name,
3239    ColorUtils.getColourFromNameAndScheme(name, "NONE"));
3240    }
3241   
3242  542 return secondaryStructureProviderColorMap;
3243    }
3244   
 
3245  6 toggle public static int computeMaxShifts(SequenceI[] seqs, ShiftList inserts)
3246    {
3247   
3248  6 int oldwidth = 0;
3249  6 int p = 0, lastP = 0, lastIns = 0;
3250  6 for (SequenceI seq : seqs)
3251    {
3252  140 char[] sqs = seq.getSequence();
3253    // System.out.println(seq.getSequenceAsString());
3254  140 if (oldwidth < sqs.length)
3255    {
3256  13 oldwidth = sqs.length;
3257    }
3258  140 p = 0;
3259  140 lastP = 0;
3260  140 lastIns = 0;
3261  140 int modelOffset = 0; // shift between index and insertions encountered
3262  140 do
3263    {
3264  53877 if (sqs[p] >= 'a' && sqs[p] <= 'z')
3265    {
3266  2015 if (lastIns == 0)
3267    {
3268  589 lastP = (modelOffset + p) * 2;
3269    }
3270  2015 lastIns++;
3271    }
3272    else
3273    {
3274  51862 if (lastIns > 0)
3275    {
3276  586 inserts.extendShift(lastP, lastIns);
3277  586 modelOffset -= lastIns;
3278    }
3279  51862 lastIns = 0;
3280    }
3281  53877 } while (++p < sqs.length);
3282  140 if (lastIns > 0)
3283    {
3284  3 inserts.extendShift(lastP, lastIns);
3285    }
3286    }
3287  6 return oldwidth;
3288    }
3289   
3290    /**
3291    * quick and dirty lowercase - to - insert expansion
3292    *
3293    * @param seqs
3294    * - these will be direcly modified
3295    * @return message indicating how many gaps were inserted.
3296    */
 
3297  4 toggle public static String a3mToMSA(SequenceI[] seqs)
3298    {
3299  4 String response = "";
3300  4 ShiftList inserts = new ShiftList();
3301  4 int oldwidth = computeMaxShifts(seqs, inserts);
3302   
3303  4 int newwidth = insertShifts(seqs,inserts);
3304  4 if (oldwidth != newwidth)
3305    {
3306  4 response = "Added " + (newwidth - oldwidth) + " inserts.";
3307    }
3308  4 return response;
3309    }
3310   
 
3311  5 toggle public static int insertShifts(SequenceI[] seqs, ShiftList inserts)
3312    {
3313  5 int newwidth=0;
3314  5 for (SequenceI seq : seqs)
3315    {
3316  80 boolean changed=false;
3317  80 int posShift = 0, totalShift = 0;
3318  80 for (int[] shift : inserts.getShifts())
3319    {
3320    // locate this shift in the totalShift'ed sequence
3321  5912 posShift = (shift[0] / 2);
3322  5912 int len = shift[1];
3323   
3324    // account for any insertions in this sequence
3325  5912 int p = posShift + totalShift;
3326  7021 while (len > 0 && p < seq.getLength()
3327    && (seq.getCharAt(p) >= 'a' && seq.getCharAt(p) <= 'z'))
3328    {
3329  1109 p++;
3330  1109 len--;
3331    }
3332    // and mark any remaining after the insert
3333  5912 if (len > 0)
3334    {
3335  5670 changed=true;
3336  5670 seq.doInsert(p, len, '-');
3337    }
3338  5912 totalShift += shift[1]; // update shifted positions in this sequence
3339    }
3340  80 if (changed)
3341    {
3342  78 seq.sequenceChanged();
3343    }
3344  80 newwidth = (seq.getLength() > newwidth) ? seq.getLength() : newwidth;
3345    }
3346  5 return newwidth;
3347    }
3348    }