Clover icon

Coverage Report

  1. Project Clover database Mon Nov 18 2024 09:38:20 GMT
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
13% of files have more coverage

Code metrics

492
981
64
2
3,402
2,092
377
0.38
15.33
32
5.89

Classes

Class Line # Actions
AlignmentUtils 86 975 371
0.785292278.5%
AlignmentUtils.DnaVariant 102 6 6
0.00%
 

Contributing tests

This file is covered by 188 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.awt.Color;
24    import java.util.ArrayList;
25    import java.util.Arrays;
26    import java.util.Collection;
27    import java.util.Collections;
28    import java.util.HashMap;
29    import java.util.HashSet;
30    import java.util.Iterator;
31    import java.util.LinkedHashMap;
32    import java.util.List;
33    import java.util.Locale;
34    import java.util.Map;
35    import java.util.Map.Entry;
36    import java.util.NoSuchElementException;
37    import java.util.Set;
38    import java.util.SortedMap;
39    import java.util.TreeMap;
40    import java.util.Vector;
41    import java.util.stream.Collectors;
42   
43    import org.jcolorbrewer.ColorBrewer;
44   
45    import jalview.api.AlignCalcWorkerI;
46    import jalview.bin.Console;
47    import jalview.commands.RemoveGapColCommand;
48    import jalview.datamodel.AlignedCodon;
49    import jalview.datamodel.AlignedCodonFrame;
50    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
51    import jalview.datamodel.Alignment;
52    import jalview.datamodel.AlignmentAnnotation;
53    import jalview.datamodel.AlignmentI;
54    import jalview.datamodel.Annotation;
55    import jalview.datamodel.ContactMatrixI;
56    import jalview.datamodel.DBRefEntry;
57    import jalview.datamodel.GeneLociI;
58    import jalview.datamodel.IncompleteCodonException;
59    import jalview.datamodel.Mapping;
60    import jalview.datamodel.PDBEntry;
61    import jalview.datamodel.SeqCigar;
62    import jalview.datamodel.Sequence;
63    import jalview.datamodel.SequenceFeature;
64    import jalview.datamodel.SequenceGroup;
65    import jalview.datamodel.SequenceI;
66    import jalview.datamodel.features.SequenceFeatures;
67    import jalview.gui.AlignmentPanel;
68    import jalview.io.gff.SequenceOntologyI;
69    import jalview.schemes.ResidueProperties;
70    import jalview.util.Comparison;
71    import jalview.util.Constants;
72    import jalview.util.DBRefUtils;
73    import jalview.util.IntRangeComparator;
74    import jalview.util.MapList;
75    import jalview.util.MappingUtils;
76    import jalview.util.MessageManager;
77    import jalview.workers.SecondaryStructureConsensusThread;
78   
79    /**
80    * grab bag of useful alignment manipulation operations Expect these to be
81    * refactored elsewhere at some point.
82    *
83    * @author jimp
84    *
85    */
 
86    public class AlignmentUtils
87    {
88    private static final int CODON_LENGTH = 3;
89   
90    private static final String SEQUENCE_VARIANT = "sequence_variant:";
91   
92    /*
93    * the 'id' attribute is provided for variant features fetched from
94    * Ensembl using its REST service with JSON format
95    */
96    public static final String VARIANT_ID = "id";
97   
98    /**
99    * A data model to hold the 'normal' base value at a position, and an optional
100    * sequence variant feature
101    */
 
102    static final class DnaVariant
103    {
104    final String base;
105   
106    SequenceFeature variant;
107   
 
108  0 toggle DnaVariant(String nuc)
109    {
110  0 base = nuc;
111  0 variant = null;
112    }
113   
 
114  0 toggle DnaVariant(String nuc, SequenceFeature var)
115    {
116  0 base = nuc;
117  0 variant = var;
118    }
119   
 
120  0 toggle public String getSource()
121    {
122  0 return variant == null ? null : variant.getFeatureGroup();
123    }
124   
125    /**
126    * toString for aid in the debugger only
127    */
 
128  0 toggle @Override
129    public String toString()
130    {
131  0 return base + ":" + (variant == null ? "" : variant.getDescription());
132    }
133    }
134   
135    /**
136    * given an existing alignment, create a new alignment including all, or up to
137    * flankSize additional symbols from each sequence's dataset sequence
138    *
139    * @param core
140    * @param flankSize
141    * @return AlignmentI
142    */
 
143  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
144    {
145  27 List<SequenceI> sq = new ArrayList<>();
146  27 int maxoffset = 0;
147  27 for (SequenceI s : core.getSequences())
148    {
149  131 SequenceI newSeq = s.deriveSequence();
150  131 final int newSeqStart = newSeq.getStart() - 1;
151  131 if (newSeqStart > maxoffset
152    && newSeq.getDatasetSequence().getStart() < s.getStart())
153    {
154  131 maxoffset = newSeqStart;
155    }
156  131 sq.add(newSeq);
157    }
158  27 if (flankSize > -1)
159    {
160  25 maxoffset = Math.min(maxoffset, flankSize);
161    }
162   
163    /*
164    * now add offset left and right to create an expanded alignment
165    */
166  27 for (SequenceI s : sq)
167    {
168  131 SequenceI ds = s;
169  262 while (ds.getDatasetSequence() != null)
170    {
171  131 ds = ds.getDatasetSequence();
172    }
173  131 int s_end = s.findPosition(s.getStart() + s.getLength());
174    // find available flanking residues for sequence
175  131 int ustream_ds = s.getStart() - ds.getStart();
176  131 int dstream_ds = ds.getEnd() - s_end;
177   
178    // build new flanked sequence
179   
180    // compute gap padding to start of flanking sequence
181  131 int offset = maxoffset - ustream_ds;
182   
183    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
184  131 if (flankSize >= 0)
185    {
186  125 if (flankSize < ustream_ds)
187    {
188    // take up to flankSize residues
189  40 offset = maxoffset - flankSize;
190  40 ustream_ds = flankSize;
191    }
192  125 if (flankSize <= dstream_ds)
193    {
194  116 dstream_ds = flankSize - 1;
195    }
196    }
197    // TODO use Character.toLowerCase to avoid creating String objects?
198  131 char[] upstream = new String(ds
199    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
200    .toLowerCase(Locale.ROOT).toCharArray();
201  131 char[] downstream = new String(
202    ds.getSequence(s_end - 1, s_end + dstream_ds))
203    .toLowerCase(Locale.ROOT).toCharArray();
204  131 char[] coreseq = s.getSequence();
205  131 char[] nseq = new char[offset + upstream.length + downstream.length
206    + coreseq.length];
207  131 char c = core.getGapCharacter();
208   
209  131 int p = 0;
210  461 for (; p < offset; p++)
211    {
212  330 nseq[p] = c;
213    }
214   
215  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
216  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
217    coreseq.length);
218  131 System.arraycopy(downstream, 0, nseq,
219    p + coreseq.length + upstream.length, downstream.length);
220  131 s.setSequence(new String(nseq));
221  131 s.setStart(s.getStart() - ustream_ds);
222  131 s.setEnd(s_end + downstream.length);
223    }
224  27 AlignmentI newAl = new jalview.datamodel.Alignment(
225    sq.toArray(new SequenceI[0]));
226  27 for (SequenceI s : sq)
227    {
228  131 if (s.getAnnotation() != null)
229    {
230  1 for (AlignmentAnnotation aa : s.getAnnotation())
231    {
232  1 aa.adjustForAlignment(); // JAL-1712 fix
233  1 newAl.addAnnotation(aa);
234    }
235    }
236    }
237  27 newAl.setDataset(core.getDataset());
238  27 return newAl;
239    }
240   
241    /**
242    * Returns the index (zero-based position) of a sequence in an alignment, or
243    * -1 if not found.
244    *
245    * @param al
246    * @param seq
247    * @return
248    */
 
249  56066 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
250    {
251  56066 int result = -1;
252  56066 int pos = 0;
253  56066 for (SequenceI alSeq : al.getSequences())
254    {
255  126110737 if (alSeq == seq)
256    {
257  56066 result = pos;
258  56066 break;
259    }
260  126054671 pos++;
261    }
262  56066 return result;
263    }
264   
265    /**
266    * Returns a map of lists of sequences in the alignment, keyed by sequence
267    * name. For use in mapping between different alignment views of the same
268    * sequences.
269    *
270    * @see jalview.datamodel.AlignmentI#getSequencesByName()
271    */
 
272  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
273    AlignmentI al)
274    {
275  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
276  1 for (SequenceI seq : al.getSequences())
277    {
278  3 String name = seq.getName();
279  3 if (name != null)
280    {
281  3 List<SequenceI> seqs = theMap.get(name);
282  3 if (seqs == null)
283    {
284  2 seqs = new ArrayList<>();
285  2 theMap.put(name, seqs);
286    }
287  3 seqs.add(seq);
288    }
289    }
290  1 return theMap;
291    }
292   
293    /**
294    * Build mapping of protein to cDNA alignment. Mappings are made between
295    * sequences where the cDNA translates to the protein sequence. Any new
296    * mappings are added to the protein alignment. Returns true if any mappings
297    * either already exist or were added, else false.
298    *
299    * @param proteinAlignment
300    * @param cdnaAlignment
301    * @return
302    */
 
303  4 toggle public static boolean mapProteinAlignmentToCdna(
304    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
305    {
306  4 if (proteinAlignment == null || cdnaAlignment == null)
307    {
308  0 return false;
309    }
310   
311  4 Set<SequenceI> mappedDna = new HashSet<>();
312  4 Set<SequenceI> mappedProtein = new HashSet<>();
313   
314    /*
315    * First pass - map sequences where cross-references exist. This include
316    * 1-to-many mappings to support, for example, variant cDNA.
317    */
318  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
319    cdnaAlignment, mappedDna, mappedProtein, true);
320   
321    /*
322    * Second pass - map sequences where no cross-references exist. This only
323    * does 1-to-1 mappings and assumes corresponding sequences are in the same
324    * order in the alignments.
325    */
326  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
327    mappedDna, mappedProtein, false);
328  4 return mappingPerformed;
329    }
330   
331    /**
332    * Make mappings between compatible sequences (where the cDNA translation
333    * matches the protein).
334    *
335    * @param proteinAlignment
336    * @param cdnaAlignment
337    * @param mappedDna
338    * a set of mapped DNA sequences (to add to)
339    * @param mappedProtein
340    * a set of mapped Protein sequences (to add to)
341    * @param xrefsOnly
342    * if true, only map sequences where xrefs exist
343    * @return
344    */
 
345  8 toggle protected static boolean mapProteinToCdna(
346    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
347    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
348    boolean xrefsOnly)
349    {
350  8 boolean mappingExistsOrAdded = false;
351  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
352  8 for (SequenceI aaSeq : thisSeqs)
353    {
354  22 boolean proteinMapped = false;
355  22 AlignedCodonFrame acf = new AlignedCodonFrame();
356   
357  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
358    {
359    /*
360    * Always try to map if sequences have xref to each other; this supports
361    * variant cDNA or alternative splicing for a protein sequence.
362    *
363    * If no xrefs, try to map progressively, assuming that alignments have
364    * mappable sequences in corresponding order. These are not
365    * many-to-many, as that would risk mixing species with similar cDNA
366    * sequences.
367    */
368  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
369    {
370  39 continue;
371    }
372   
373    /*
374    * Don't map non-xrefd sequences more than once each. This heuristic
375    * allows us to pair up similar sequences in ordered alignments.
376    */
377  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
378    || mappedDna.contains(cdnaSeq)))
379    {
380  29 continue;
381    }
382  18 if (mappingExists(proteinAlignment.getCodonFrames(),
383    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
384    {
385  0 mappingExistsOrAdded = true;
386    }
387    else
388    {
389  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
390  18 if (map != null)
391    {
392  12 acf.addMap(cdnaSeq, aaSeq, map);
393  12 mappingExistsOrAdded = true;
394  12 proteinMapped = true;
395  12 mappedDna.add(cdnaSeq);
396  12 mappedProtein.add(aaSeq);
397    }
398    }
399    }
400  22 if (proteinMapped)
401    {
402  11 proteinAlignment.addCodonFrame(acf);
403    }
404    }
405  8 return mappingExistsOrAdded;
406    }
407   
408    /**
409    * Answers true if the mappings include one between the given (dataset)
410    * sequences.
411    */
 
412  30 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
413    SequenceI aaSeq, SequenceI cdnaSeq)
414    {
415  30 if (mappings != null)
416    {
417  30 for (AlignedCodonFrame acf : mappings)
418    {
419  20 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
420    {
421  0 return true;
422    }
423    }
424    }
425  30 return false;
426    }
427   
428    /**
429    * Builds a mapping (if possible) of a cDNA to a protein sequence.
430    * <ul>
431    * <li>first checks if the cdna translates exactly to the protein
432    * sequence</li>
433    * <li>else checks for translation after removing a STOP codon</li>
434    * <li>else checks for translation after removing a START codon</li>
435    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
436    * </ul>
437    * Returns null if no mapping is determined.
438    *
439    * @param proteinSeq
440    * the aligned protein sequence
441    * @param cdnaSeq
442    * the aligned cdna sequence
443    * @return
444    */
 
445  30 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
446    SequenceI cdnaSeq)
447    {
448    /*
449    * Here we handle either dataset sequence set (desktop) or absent (applet).
450    * Use only the char[] form of the sequence to avoid creating possibly large
451    * String objects.
452    */
453  30 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
454  30 char[] aaSeqChars = proteinDataset != null
455    ? proteinDataset.getSequence()
456    : proteinSeq.getSequence();
457  30 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
458  30 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
459    : cdnaSeq.getSequence();
460  30 if (aaSeqChars == null || cdnaSeqChars == null)
461    {
462  0 return null;
463    }
464   
465    /*
466    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
467    */
468  30 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
469  30 int cdnaLength = cdnaSeqChars.length;
470  30 int cdnaStart = cdnaSeq.getStart();
471  30 int cdnaEnd = cdnaSeq.getEnd();
472  30 final int proteinStart = proteinSeq.getStart();
473  30 final int proteinEnd = proteinSeq.getEnd();
474   
475    /*
476    * If lengths don't match, try ignoring stop codon (if present)
477    */
478  30 if (cdnaLength != mappedLength && cdnaLength > 2)
479    {
480  14 String lastCodon = String.valueOf(cdnaSeqChars,
481    cdnaLength - CODON_LENGTH, CODON_LENGTH)
482    .toUpperCase(Locale.ROOT);
483  14 for (String stop : ResidueProperties.STOP_CODONS)
484    {
485  41 if (lastCodon.equals(stop))
486    {
487  2 cdnaEnd -= CODON_LENGTH;
488  2 cdnaLength -= CODON_LENGTH;
489  2 break;
490    }
491    }
492    }
493   
494    /*
495    * If lengths still don't match, try ignoring start codon.
496    */
497  30 int startOffset = 0;
498  30 if (cdnaLength != mappedLength && cdnaLength > 2
499    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH)
500    .toUpperCase(Locale.ROOT)
501    .equals(ResidueProperties.START))
502    {
503  5 startOffset += CODON_LENGTH;
504  5 cdnaStart += CODON_LENGTH;
505  5 cdnaLength -= CODON_LENGTH;
506    }
507   
508  30 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
509    {
510    /*
511    * protein is translation of dna (+/- start/stop codons)
512    */
513  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
514    new int[]
515    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
516  15 return map;
517    }
518   
519    /*
520    * translation failed - try mapping CDS annotated regions of dna
521    */
522  15 return mapCdsToProtein(cdnaSeq, proteinSeq);
523    }
524   
525    /**
526    * Test whether the given cdna sequence, starting at the given offset,
527    * translates to the given amino acid sequence, using the standard translation
528    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
529    *
530    * @param cdnaSeqChars
531    * @param cdnaStart
532    * @param aaSeqChars
533    * @return
534    */
 
535  50 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
536    char[] aaSeqChars)
537    {
538  50 if (cdnaSeqChars == null || aaSeqChars == null)
539    {
540  3 return false;
541    }
542   
543  47 int aaPos = 0;
544  47 int dnaPos = cdnaStart;
545  160 for (; dnaPos < cdnaSeqChars.length - 2
546    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
547    {
548  129 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
549  129 final String translated = ResidueProperties.codonTranslate(codon);
550   
551    /*
552    * allow * in protein to match untranslatable in dna
553    */
554  129 final char aaRes = aaSeqChars[aaPos];
555  129 if ((translated == null || ResidueProperties.STOP.equals(translated))
556    && aaRes == '*')
557    {
558  4 continue;
559    }
560  125 if (translated == null || !(aaRes == translated.charAt(0)))
561    {
562    // debug
563    // jalview.bin.Console.outPrintln(("Mismatch at " + i + "/" + aaResidue
564    // + ": "
565    // + codon + "(" + translated + ") != " + aaRes));
566  16 return false;
567    }
568    }
569   
570    /*
571    * check we matched all of the protein sequence
572    */
573  31 if (aaPos != aaSeqChars.length)
574    {
575  2 return false;
576    }
577   
578    /*
579    * check we matched all of the dna except
580    * for optional trailing STOP codon
581    */
582  29 if (dnaPos == cdnaSeqChars.length)
583    {
584  17 return true;
585    }
586  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
587    {
588  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
589  11 if (ResidueProperties.STOP
590    .equals(ResidueProperties.codonTranslate(codon)))
591    {
592  9 return true;
593    }
594    }
595  3 return false;
596    }
597   
598    /**
599    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
600    * currently assumes that we are aligning cDNA to match protein.
601    *
602    * @param seq
603    * the sequence to be realigned
604    * @param al
605    * the alignment whose sequence alignment is to be 'copied'
606    * @param gap
607    * character string represent a gap in the realigned sequence
608    * @param preserveUnmappedGaps
609    * @param preserveMappedGaps
610    * @return true if the sequence was realigned, false if it could not be
611    */
 
612  0 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
613    String gap, boolean preserveMappedGaps,
614    boolean preserveUnmappedGaps)
615    {
616    /*
617    * Get any mappings from the source alignment to the target (dataset)
618    * sequence.
619    */
620    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
621    // all mappings. Would it help to constrain this?
622  0 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
623  0 if (mappings == null || mappings.isEmpty())
624    {
625  0 return false;
626    }
627   
628    /*
629    * Locate the aligned source sequence whose dataset sequence is mapped. We
630    * just take the first match here (as we can't align like more than one
631    * sequence).
632    */
633  0 SequenceI alignFrom = null;
634  0 AlignedCodonFrame mapping = null;
635  0 for (AlignedCodonFrame mp : mappings)
636    {
637  0 alignFrom = mp.findAlignedSequence(seq, al);
638  0 if (alignFrom != null)
639    {
640  0 mapping = mp;
641  0 break;
642    }
643    }
644   
645  0 if (alignFrom == null)
646    {
647  0 return false;
648    }
649  0 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
650    preserveMappedGaps, preserveUnmappedGaps);
651  0 return true;
652    }
653   
654    /**
655    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
656    * match residues and codons. Flags control whether existing gaps in unmapped
657    * (intron) and mapped (exon) regions are preserved or not. Gaps between
658    * intron and exon are only retained if both flags are set.
659    *
660    * @param alignTo
661    * @param alignFrom
662    * @param mapping
663    * @param myGap
664    * @param sourceGap
665    * @param preserveUnmappedGaps
666    * @param preserveMappedGaps
667    */
 
668  15 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
669    AlignedCodonFrame mapping, String myGap, char sourceGap,
670    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
671    {
672    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
673   
674    // aligned and dataset sequence positions, all base zero
675  15 int thisSeqPos = 0;
676  15 int sourceDsPos = 0;
677   
678  15 int basesWritten = 0;
679  15 char myGapChar = myGap.charAt(0);
680  15 int ratio = myGap.length();
681   
682  15 int fromOffset = alignFrom.getStart() - 1;
683  15 int toOffset = alignTo.getStart() - 1;
684  15 int sourceGapMappedLength = 0;
685  15 boolean inExon = false;
686  15 final int toLength = alignTo.getLength();
687  15 final int fromLength = alignFrom.getLength();
688  15 StringBuilder thisAligned = new StringBuilder(2 * toLength);
689   
690    /*
691    * Traverse the 'model' aligned sequence
692    */
693  109 for (int i = 0; i < fromLength; i++)
694    {
695  94 char sourceChar = alignFrom.getCharAt(i);
696  94 if (sourceChar == sourceGap)
697    {
698  44 sourceGapMappedLength += ratio;
699  44 continue;
700    }
701   
702    /*
703    * Found a non-gap character. Locate its mapped region if any.
704    */
705  50 sourceDsPos++;
706    // Note mapping positions are base 1, our sequence positions base 0
707  50 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
708    sourceDsPos + fromOffset);
709  50 if (mappedPos == null)
710    {
711    /*
712    * unmapped position; treat like a gap
713    */
714  14 sourceGapMappedLength += ratio;
715    // jalview.bin.Console.errPrintln("Can't align: no codon mapping to
716    // residue "
717    // + sourceDsPos + "(" + sourceChar + ")");
718    // return;
719  14 continue;
720    }
721   
722  36 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
723  36 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
724  36 StringBuilder trailingCopiedGap = new StringBuilder();
725   
726    /*
727    * Copy dna sequence up to and including this codon. Optionally, include
728    * gaps before the codon starts (in introns) and/or after the codon starts
729    * (in exons).
730    *
731    * Note this only works for 'linear' splicing, not reverse or interleaved.
732    * But then 'align dna as protein' doesn't make much sense otherwise.
733    */
734  36 int intronLength = 0;
735  278 while (basesWritten + toOffset < mappedCodonEnd
736    && thisSeqPos < toLength)
737    {
738  242 final char c = alignTo.getCharAt(thisSeqPos++);
739  242 if (c != myGapChar)
740    {
741  142 basesWritten++;
742  142 int sourcePosition = basesWritten + toOffset;
743  142 if (sourcePosition < mappedCodonStart)
744    {
745    /*
746    * Found an unmapped (intron) base. First add in any preceding gaps
747    * (if wanted).
748    */
749  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
750    {
751  17 thisAligned.append(trailingCopiedGap.toString());
752  17 intronLength += trailingCopiedGap.length();
753  17 trailingCopiedGap = new StringBuilder();
754    }
755  48 intronLength++;
756  48 inExon = false;
757    }
758    else
759    {
760  94 final boolean startOfCodon = sourcePosition == mappedCodonStart;
761  94 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
762    preserveUnmappedGaps, sourceGapMappedLength, inExon,
763    trailingCopiedGap.length(), intronLength, startOfCodon);
764  179 for (int k = 0; k < gapsToAdd; k++)
765    {
766  85 thisAligned.append(myGapChar);
767    }
768  94 sourceGapMappedLength = 0;
769  94 inExon = true;
770    }
771  142 thisAligned.append(c);
772  142 trailingCopiedGap = new StringBuilder();
773    }
774    else
775    {
776  100 if (inExon && preserveMappedGaps)
777    {
778  32 trailingCopiedGap.append(myGapChar);
779    }
780  68 else if (!inExon && preserveUnmappedGaps)
781    {
782  27 trailingCopiedGap.append(myGapChar);
783    }
784    }
785    }
786    }
787   
788    /*
789    * At end of model aligned sequence. Copy any remaining target sequence, optionally
790    * including (intron) gaps.
791    */
792  57 while (thisSeqPos < toLength)
793    {
794  42 final char c = alignTo.getCharAt(thisSeqPos++);
795  42 if (c != myGapChar || preserveUnmappedGaps)
796    {
797  34 thisAligned.append(c);
798    }
799  42 sourceGapMappedLength--;
800    }
801   
802    /*
803    * finally add gaps to pad for any trailing source gaps or
804    * unmapped characters
805    */
806  15 if (preserveUnmappedGaps)
807    {
808  20 while (sourceGapMappedLength > 0)
809    {
810  12 thisAligned.append(myGapChar);
811  12 sourceGapMappedLength--;
812    }
813    }
814   
815    /*
816    * All done aligning, set the aligned sequence.
817    */
818  15 alignTo.setSequence(new String(thisAligned));
819    }
820   
821    /**
822    * Helper method to work out how many gaps to insert when realigning.
823    *
824    * @param preserveMappedGaps
825    * @param preserveUnmappedGaps
826    * @param sourceGapMappedLength
827    * @param inExon
828    * @param trailingCopiedGap
829    * @param intronLength
830    * @param startOfCodon
831    * @return
832    */
 
833  94 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
834    boolean preserveUnmappedGaps, int sourceGapMappedLength,
835    boolean inExon, int trailingGapLength, int intronLength,
836    final boolean startOfCodon)
837    {
838  94 int gapsToAdd = 0;
839  94 if (startOfCodon)
840    {
841    /*
842    * Reached start of codon. Ignore trailing gaps in intron unless we are
843    * preserving gaps in both exon and intron. Ignore them anyway if the
844    * protein alignment introduces a gap at least as large as the intronic
845    * region.
846    */
847  36 if (inExon && !preserveMappedGaps)
848    {
849  4 trailingGapLength = 0;
850    }
851  36 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
852    {
853  15 trailingGapLength = 0;
854    }
855  36 if (inExon)
856    {
857  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
858    }
859    else
860    {
861  22 if (intronLength + trailingGapLength <= sourceGapMappedLength)
862    {
863  16 gapsToAdd = sourceGapMappedLength - intronLength;
864    }
865    else
866    {
867  6 gapsToAdd = Math.min(
868    intronLength + trailingGapLength - sourceGapMappedLength,
869    trailingGapLength);
870    }
871    }
872    }
873    else
874    {
875    /*
876    * second or third base of codon; check for any gaps in dna
877    */
878  58 if (!preserveMappedGaps)
879    {
880  32 trailingGapLength = 0;
881    }
882  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
883    }
884  94 return gapsToAdd;
885    }
886   
887    /**
888    * Realigns the given protein to match the alignment of the dna, using codon
889    * mappings to translate aligned codon positions to protein residues.
890    *
891    * @param protein
892    * the alignment whose sequences are realigned by this method
893    * @param dna
894    * the dna alignment whose alignment we are 'copying'
895    * @return the number of sequences that were realigned
896    */
 
897  3 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
898    {
899  3 if (protein.isNucleotide() || !dna.isNucleotide())
900    {
901  0 jalview.bin.Console
902    .errPrintln("Wrong alignment type in alignProteinAsDna");
903  0 return 0;
904    }
905  3 List<SequenceI> unmappedProtein = new ArrayList<>();
906  3 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
907    protein, dna, unmappedProtein);
908  3 return alignProteinAs(protein, alignedCodons, unmappedProtein);
909    }
910   
911    /**
912    * Realigns the given dna to match the alignment of the protein, using codon
913    * mappings to translate aligned peptide positions to codons.
914    *
915    * Always produces a padded CDS alignment.
916    *
917    * @param dna
918    * the alignment whose sequences are realigned by this method
919    * @param protein
920    * the protein alignment whose alignment we are 'copying'
921    * @return the number of sequences that were realigned
922    */
 
923  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
924    {
925  4 if (protein.isNucleotide() || !dna.isNucleotide())
926    {
927  0 jalview.bin.Console
928    .errPrintln("Wrong alignment type in alignProteinAsDna");
929  0 return 0;
930    }
931    // todo: implement this
932  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
933  4 int alignedCount = 0;
934  4 int width = 0; // alignment width for padding CDS
935  4 for (SequenceI dnaSeq : dna.getSequences())
936    {
937  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
938    dna.getGapCharacter()))
939    {
940  5 alignedCount++;
941    }
942  5 width = Math.max(dnaSeq.getLength(), width);
943    }
944  4 int oldwidth;
945  4 int diff;
946  4 for (SequenceI dnaSeq : dna.getSequences())
947    {
948  5 oldwidth = dnaSeq.getLength();
949  5 diff = width - oldwidth;
950  5 if (diff > 0)
951    {
952  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
953    }
954    }
955  4 return alignedCount;
956    }
957   
958    /**
959    * Helper method to align (if possible) the dna sequence to match the
960    * alignment of a mapped protein sequence. This is currently limited to
961    * handling coding sequence only.
962    *
963    * @param cdsSeq
964    * @param protein
965    * @param mappings
966    * @param gapChar
967    * @return
968    */
 
969  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
970    AlignmentI protein, List<AlignedCodonFrame> mappings,
971    char gapChar)
972    {
973  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
974  5 if (cdsDss == null)
975    {
976  0 System.err
977    .println("alignCdsSequenceAsProtein needs aligned sequence!");
978  0 return false;
979    }
980   
981  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
982    .findMappingsForSequence(cdsSeq, mappings);
983  5 for (AlignedCodonFrame mapping : dnaMappings)
984    {
985  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
986  5 if (peptide != null)
987    {
988  5 final int peptideLength = peptide.getLength();
989  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
990  5 if (map != null)
991    {
992  5 MapList mapList = map.getMap();
993  5 if (map.getTo() == peptide.getDatasetSequence())
994    {
995  5 mapList = mapList.getInverse();
996    }
997  5 final int cdsLength = cdsDss.getLength();
998  5 int mappedFromLength = MappingUtils
999    .getLength(mapList.getFromRanges());
1000  5 int mappedToLength = MappingUtils
1001    .getLength(mapList.getToRanges());
1002  5 boolean addStopCodon = (cdsLength == mappedFromLength
1003    * CODON_LENGTH + CODON_LENGTH)
1004    || (peptide.getDatasetSequence()
1005    .getLength() == mappedFromLength - 1);
1006  5 if (cdsLength != mappedToLength && !addStopCodon)
1007    {
1008  0 jalview.bin.Console.errPrintln(String.format(
1009    "Can't align cds as protein (length mismatch %d/%d): %s",
1010    cdsLength, mappedToLength, cdsSeq.getName()));
1011    }
1012   
1013    /*
1014    * pre-fill the aligned cds sequence with gaps
1015    */
1016  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
1017  5 + (addStopCodon ? CODON_LENGTH : 0)];
1018  5 Arrays.fill(alignedCds, gapChar);
1019   
1020    /*
1021    * walk over the aligned peptide sequence and insert mapped
1022    * codons for residues in the aligned cds sequence
1023    */
1024  5 int copiedBases = 0;
1025  5 int cdsStart = cdsDss.getStart();
1026  5 int proteinPos = peptide.getStart() - 1;
1027  5 int cdsCol = 0;
1028   
1029  33 for (int col = 0; col < peptideLength; col++)
1030    {
1031  28 char residue = peptide.getCharAt(col);
1032   
1033  28 if (Comparison.isGap(residue))
1034    {
1035  13 cdsCol += CODON_LENGTH;
1036    }
1037    else
1038    {
1039  15 proteinPos++;
1040  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1041  15 if (codon == null)
1042    {
1043    // e.g. incomplete start codon, X in peptide
1044  0 cdsCol += CODON_LENGTH;
1045    }
1046    else
1047    {
1048  60 for (int j = codon[0]; j <= codon[1]; j++)
1049    {
1050  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1051  45 alignedCds[cdsCol++] = mappedBase;
1052  45 copiedBases++;
1053    }
1054    }
1055    }
1056    }
1057   
1058    /*
1059    * append stop codon if not mapped from protein,
1060    * closing it up to the end of the mapped sequence
1061    */
1062  5 if (copiedBases == cdsLength - CODON_LENGTH)
1063    {
1064  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1065    {
1066  0 if (!Comparison.isGap(alignedCds[i]))
1067    {
1068  0 cdsCol = i + 1; // gap just after end of sequence
1069  0 break;
1070    }
1071    }
1072  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1073    {
1074  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1075    }
1076    }
1077  5 cdsSeq.setSequence(new String(alignedCds));
1078  5 return true;
1079    }
1080    }
1081    }
1082  0 return false;
1083    }
1084   
1085    /**
1086    * Builds a map whose key is an aligned codon position (3 alignment column
1087    * numbers base 0), and whose value is a map from protein sequence to each
1088    * protein's peptide residue for that codon. The map generates an ordering of
1089    * the codons, and allows us to read off the peptides at each position in
1090    * order to assemble 'aligned' protein sequences.
1091    *
1092    * @param protein
1093    * the protein alignment
1094    * @param dna
1095    * the coding dna alignment
1096    * @param unmappedProtein
1097    * any unmapped proteins are added to this list
1098    * @return
1099    */
 
1100  3 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1101    AlignmentI protein, AlignmentI dna,
1102    List<SequenceI> unmappedProtein)
1103    {
1104    /*
1105    * maintain a list of any proteins with no mappings - these will be
1106    * rendered 'as is' in the protein alignment as we can't align them
1107    */
1108  3 unmappedProtein.addAll(protein.getSequences());
1109   
1110  3 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1111   
1112    /*
1113    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1114    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1115    * comparator keeps the codon positions ordered.
1116    */
1117  3 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1118    new CodonComparator());
1119   
1120  3 for (SequenceI dnaSeq : dna.getSequences())
1121    {
1122  8 for (AlignedCodonFrame mapping : mappings)
1123    {
1124  10 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1125  10 if (prot != null)
1126    {
1127  10 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1128  10 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1129    alignedCodons);
1130  10 unmappedProtein.remove(prot);
1131    }
1132    }
1133    }
1134   
1135    /*
1136    * Finally add any unmapped peptide start residues (e.g. for incomplete
1137    * codons) as if at the codon position before the second residue
1138    */
1139    // TODO resolve JAL-2022 so this fudge can be removed
1140  3 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1141  3 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1142   
1143  3 return alignedCodons;
1144    }
1145   
1146    /**
1147    * Scans for any protein mapped from position 2 (meaning unmapped start
1148    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1149    * preceding position in the alignment
1150    *
1151    * @param alignedCodons
1152    * the codon-to-peptide map
1153    * @param mappedSequenceCount
1154    * the number of distinct sequences in the map
1155    */
 
1156  3 toggle protected static void addUnmappedPeptideStarts(
1157    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1158    int mappedSequenceCount)
1159    {
1160    // TODO delete this ugly hack once JAL-2022 is resolved
1161    // i.e. we can model startPhase > 0 (incomplete start codon)
1162   
1163  3 List<SequenceI> sequencesChecked = new ArrayList<>();
1164  3 AlignedCodon lastCodon = null;
1165  3 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1166   
1167  3 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1168    .entrySet())
1169    {
1170  21 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1171    .entrySet())
1172    {
1173  25 SequenceI seq = sequenceCodon.getKey();
1174  25 if (sequencesChecked.contains(seq))
1175    {
1176  17 continue;
1177    }
1178  8 sequencesChecked.add(seq);
1179  8 AlignedCodon codon = sequenceCodon.getValue();
1180  8 if (codon.peptideCol > 1)
1181    {
1182  0 jalview.bin.Console.errPrintln(
1183    "Problem mapping protein with >1 unmapped start positions: "
1184    + seq.getName());
1185    }
1186  8 else if (codon.peptideCol == 1)
1187    {
1188    /*
1189    * first position (peptideCol == 0) was unmapped - add it
1190    */
1191  2 if (lastCodon != null)
1192    {
1193  1 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1194    lastCodon.pos2, lastCodon.pos3,
1195    String.valueOf(seq.getCharAt(0)), 0);
1196  1 toAdd.put(seq, firstPeptide);
1197    }
1198    else
1199    {
1200    /*
1201    * unmapped residue at start of alignment (no prior column) -
1202    * 'insert' at nominal codon [0, 0, 0]
1203    */
1204  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1205    String.valueOf(seq.getCharAt(0)), 0);
1206  1 toAdd.put(seq, firstPeptide);
1207    }
1208    }
1209  8 if (sequencesChecked.size() == mappedSequenceCount)
1210    {
1211    // no need to check past first mapped position in all sequences
1212  3 break;
1213    }
1214    }
1215  21 lastCodon = entry.getKey();
1216    }
1217   
1218    /*
1219    * add any new codons safely after iterating over the map
1220    */
1221  3 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1222    {
1223  2 addCodonToMap(alignedCodons, startCodon.getValue(),
1224    startCodon.getKey());
1225    }
1226    }
1227   
1228    /**
1229    * Update the aligned protein sequences to match the codon alignments given in
1230    * the map.
1231    *
1232    * @param protein
1233    * @param alignedCodons
1234    * an ordered map of codon positions (columns), with sequence/peptide
1235    * values present in each column
1236    * @param unmappedProtein
1237    * @return
1238    */
 
1239  3 toggle protected static int alignProteinAs(AlignmentI protein,
1240    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1241    List<SequenceI> unmappedProtein)
1242    {
1243    /*
1244    * prefill peptide sequences with gaps
1245    */
1246  3 int alignedWidth = alignedCodons.size();
1247  3 char[] gaps = new char[alignedWidth];
1248  3 Arrays.fill(gaps, protein.getGapCharacter());
1249  3 Map<SequenceI, char[]> peptides = new HashMap<>();
1250  3 for (SequenceI seq : protein.getSequences())
1251    {
1252  9 if (!unmappedProtein.contains(seq))
1253    {
1254  8 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1255    }
1256    }
1257   
1258    /*
1259    * Traverse the codons left to right (as defined by CodonComparator)
1260    * and insert peptides in each column where the sequence is mapped.
1261    * This gives a peptide 'alignment' where residues are aligned if their
1262    * corresponding codons occupy the same columns in the cdna alignment.
1263    */
1264  3 int column = 0;
1265  3 for (AlignedCodon codon : alignedCodons.keySet())
1266    {
1267  22 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1268    .get(codon);
1269  22 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1270    {
1271  28 char residue = entry.getValue().product.charAt(0);
1272  28 peptides.get(entry.getKey())[column] = residue;
1273    }
1274  22 column++;
1275    }
1276   
1277    /*
1278    * and finally set the constructed sequences
1279    */
1280  3 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1281    {
1282  8 entry.getKey().setSequence(new String(entry.getValue()));
1283    }
1284   
1285  3 return 0;
1286    }
1287   
1288    /**
1289    * Populate the map of aligned codons by traversing the given sequence
1290    * mapping, locating the aligned positions of mapped codons, and adding those
1291    * positions and their translation products to the map.
1292    *
1293    * @param dna
1294    * the aligned sequence we are mapping from
1295    * @param protein
1296    * the sequence to be aligned to the codons
1297    * @param gapChar
1298    * the gap character in the dna sequence
1299    * @param seqMap
1300    * a mapping to a sequence translation
1301    * @param alignedCodons
1302    * the map we are building up
1303    */
 
1304  10 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1305    char gapChar, Mapping seqMap,
1306    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1307    {
1308  10 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1309   
1310    /*
1311    * add codon positions, and their peptide translations, to the alignment
1312    * map, while remembering the first codon mapped
1313    */
1314  44 while (codons.hasNext())
1315    {
1316  34 try
1317    {
1318  34 AlignedCodon codon = codons.next();
1319  34 addCodonToMap(alignedCodons, codon, protein);
1320    } catch (IncompleteCodonException e)
1321    {
1322    // possible incomplete trailing codon - ignore
1323    } catch (NoSuchElementException e)
1324    {
1325    // possibly peptide lacking STOP
1326    }
1327    }
1328    }
1329   
1330    /**
1331    * Helper method to add a codon-to-peptide entry to the aligned codons map
1332    *
1333    * @param alignedCodons
1334    * @param codon
1335    * @param protein
1336    */
 
1337  36 toggle protected static void addCodonToMap(
1338    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1339    AlignedCodon codon, SequenceI protein)
1340    {
1341  36 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1342  36 if (seqProduct == null)
1343    {
1344  22 seqProduct = new HashMap<>();
1345  22 alignedCodons.put(codon, seqProduct);
1346    }
1347  36 seqProduct.put(protein, codon);
1348    }
1349   
1350    /**
1351    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1352    * between at least one pair of sequences in the two alignments. Currently,
1353    * the logic is:
1354    * <ul>
1355    * <li>One alignment must be nucleotide, and the other protein</li>
1356    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1357    * <li>Mappable means the nucleotide translation matches the protein
1358    * sequence</li>
1359    * <li>The translation may ignore start and stop codons if present in the
1360    * nucleotide</li>
1361    * </ul>
1362    *
1363    * @param al1
1364    * @param al2
1365    * @return
1366    */
 
1367  25 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1368    {
1369  25 if (al1 == null || al2 == null || al1 == al2)
1370    {
1371  5 return false;
1372    }
1373   
1374    /*
1375    * Require one nucleotide and one protein
1376    */
1377  20 if (al1.isNucleotide() == al2.isNucleotide())
1378    {
1379    // or if protein - check if alternate coding
1380  10 if (al1.isNucleotide())
1381    {
1382  5 return false;
1383    }
1384  5 return check3diPeptideMapping(al1,al2);
1385    }
1386  10 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1387  10 AlignmentI protein = dna == al1 ? al2 : al1;
1388  10 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1389  10 for (SequenceI dnaSeq : dna.getSequences())
1390    {
1391  11 for (SequenceI proteinSeq : protein.getSequences())
1392    {
1393  11 if (isMappable(dnaSeq, proteinSeq, mappings))
1394    {
1395  2 return true;
1396    }
1397    }
1398    }
1399  8 return false;
1400    }
 
1401  5 toggle public static boolean check3diPeptideMapping(AlignmentI al1, AlignmentI al2)
1402    {
1403  5 if (al1.getHeight()!=al2.getHeight())
1404  1 { return false;
1405    }
1406  4 int mappable=0;
1407  4 for (SequenceI al1seq: al1.getSequences())
1408    {
1409  4 boolean foundMappable = false;
1410  4 for (SequenceI al2seq:al2.getSequences())
1411    {
1412  4 if (canBuild3diMapping(al1seq,al2seq))
1413    {
1414  2 foundMappable = true;
1415  2 break;
1416    }
1417    }
1418  4 if (foundMappable)
1419    {
1420  2 mappable++;
1421    }
1422    }
1423  4 if (mappable>0 && mappable >= (al1.getHeight()*9/10))
1424    {
1425    // TODO allow optional if mappable > a few
1426  2 return true;
1427    }
1428  2 return false;
1429    }
1430   
1431    /**
1432    * exact name, start-end, and identical length non-gap sequences
1433    * @param al1seq
1434    * @param al2seq
1435    * @return
1436    */
 
1437  16 toggle public static boolean canBuild3diMapping(SequenceI al1seq,
1438    SequenceI al2seq)
1439    {
1440  16 if (!al1seq.getDisplayId(true)
1441    .equalsIgnoreCase(al2seq.getDisplayId(true))) {
1442  8 return false;
1443    }
1444  8 String s1 = AlignSeq
1445    .extractGaps(jalview.util.Comparison.GapChars,
1446    al1seq.getSequenceAsString());
1447  8 String s2 = AlignSeq.extractGaps(
1448    jalview.util.Comparison.GapChars,
1449    al2seq.getSequenceAsString());
1450  8 return s1.length()==s2.length();
1451    }
1452   
 
1453  3 toggle public static boolean map3diPeptideToProteinAligment(
1454    AlignmentI proteinAlignment, AlignmentI tdiAlignment)
1455    {
1456  3 if (proteinAlignment==null || tdiAlignment==null)
1457    {
1458  0 return false;
1459    }
1460  3 Set<SequenceI> mappedDna = new HashSet<>();
1461  3 Set<SequenceI> mappedProtein = new HashSet<>();
1462   
1463    /*
1464    * First pass - map sequences where cross-references exist. This include
1465    * 1-to-many mappings to support, for example, variant cDNA.
1466    */
1467  3 boolean mappingPerformed = mapProteinToTdiAlignment(proteinAlignment,
1468    tdiAlignment, mappedDna, mappedProtein);
1469  3 return mappingPerformed;
1470   
1471   
1472    }
1473   
1474   
1475    /**
1476    * Make mappings between compatible sequences (ids are identical, length of seqs are identical).
1477    *
1478    * @param proteinAlignment
1479    * @param tdiAlignment
1480    * @param mappedTdiSeq
1481    * a set of mapped DNA sequences (to add to)
1482    * @param mappedProtein
1483    * a set of mapped Protein sequences (to add to)
1484    * @param xrefsOnly
1485    * if true, only map sequences where xrefs exist
1486    * @return
1487    */
 
1488  3 toggle protected static boolean mapProteinToTdiAlignment(
1489    final AlignmentI proteinAlignment, final AlignmentI tdiAlignment,
1490    Set<SequenceI> mappedTdiSeq, Set<SequenceI> mappedProtein)
1491    {
1492  3 boolean mappingExistsOrAdded = false;
1493  3 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
1494  3 for (SequenceI aaSeq : thisSeqs)
1495    {
1496  6 aaSeq = aaSeq.getDatasetSequence()!=null ? aaSeq.getDatasetSequence():null;
1497  6 boolean proteinMapped = false;
1498  6 AlignedCodonFrame acf = new AlignedCodonFrame();
1499   
1500  6 for (SequenceI cdnaSeq : tdiAlignment.getSequences())
1501    {
1502    // resolve dataset sequences
1503  12 cdnaSeq = cdnaSeq.getDatasetSequence()!=null ? cdnaSeq.getDatasetSequence():null;
1504   
1505  12 if (mappingExists(proteinAlignment.getCodonFrames(),
1506    aaSeq, cdnaSeq))
1507    {
1508  0 mappingExistsOrAdded = true;
1509    }
1510    else
1511    {
1512   
1513  12 if (canBuild3diMapping(aaSeq, cdnaSeq))
1514    {
1515  6 MapList map = new MapList(new int[] { aaSeq.getStart(),aaSeq.getEnd()},new int[] { cdnaSeq.getStart(),cdnaSeq.getEnd()},1,1);
1516  6 acf.addMap(cdnaSeq, aaSeq, map);
1517  6 mappingExistsOrAdded = true;
1518  6 proteinMapped = true;
1519  6 mappedTdiSeq.add(cdnaSeq);
1520  6 mappedProtein.add(aaSeq);
1521    }
1522    }
1523    }
1524  6 if (proteinMapped)
1525    {
1526  6 proteinAlignment.addCodonFrame(acf);
1527    }
1528    }
1529  3 return mappingExistsOrAdded;
1530    }
1531   
1532   
1533    /**
1534    * Returns true if the dna sequence is mapped, or could be mapped, to the
1535    * protein sequence.
1536    *
1537    * @param dnaSeq
1538    * @param proteinSeq
1539    * @param mappings
1540    * @return
1541    */
 
1542  11 toggle protected static boolean isMappable(SequenceI dnaSeq,
1543    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1544    {
1545  11 if (dnaSeq == null || proteinSeq == null)
1546    {
1547  0 return false;
1548    }
1549   
1550  11 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1551    : dnaSeq.getDatasetSequence();
1552  11 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1553    ? proteinSeq
1554    : proteinSeq.getDatasetSequence();
1555   
1556  11 for (AlignedCodonFrame mapping : mappings)
1557    {
1558  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1559    {
1560    /*
1561    * already mapped
1562    */
1563  0 return true;
1564    }
1565    }
1566   
1567    /*
1568    * Just try to make a mapping (it is not yet stored), test whether
1569    * successful.
1570    */
1571  11 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1572    }
1573   
1574    /**
1575    * Finds any reference annotations associated with the sequences in
1576    * sequenceScope, that are not already added to the alignment, and adds them
1577    * to the 'candidates' map. Also populates a lookup table of annotation
1578    * labels, keyed by calcId, for use in constructing tooltips or the like.
1579    *
1580    * @param sequenceScope
1581    * the sequences to scan for reference annotations
1582    * @param labelForCalcId
1583    * (optional) map to populate with label for calcId
1584    * @param candidates
1585    * map to populate with annotations for sequence
1586    * @param al
1587    * the alignment to check for presence of annotations
1588    */
 
1589  51 toggle public static void findAddableReferenceAnnotations(
1590    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1591    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1592    AlignmentI al)
1593    {
1594  51 if (sequenceScope == null)
1595    {
1596  1 return;
1597    }
1598   
1599    /*
1600    * For each sequence in scope, make a list of any annotations on the
1601    * underlying dataset sequence which are not already on the alignment.
1602    *
1603    * Add to a map of { alignmentSequence, <List of annotations to add> }
1604    */
1605  50 for (SequenceI seq : sequenceScope)
1606    {
1607  47 SequenceI dataset = seq.getDatasetSequence();
1608  47 if (dataset == null)
1609    {
1610  0 continue;
1611    }
1612  47 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1613  47 if (datasetAnnotations == null)
1614    {
1615  33 continue;
1616    }
1617  14 final List<AlignmentAnnotation> result = new ArrayList<>();
1618  14 for (AlignmentAnnotation dsann : datasetAnnotations)
1619    {
1620    /*
1621    * Find matching annotations on the alignment. If none is found, then
1622    * add this annotation to the list of 'addable' annotations for this
1623    * sequence.
1624    */
1625  26 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1626    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1627  26 boolean found = false;
1628  26 if (matchedAlignmentAnnotations != null)
1629    {
1630  23 for (AlignmentAnnotation matched : matchedAlignmentAnnotations)
1631    {
1632  5 if (dsann.description.equals(matched.description))
1633    {
1634  4 found = true;
1635  4 break;
1636    }
1637    }
1638    }
1639  26 if (!found)
1640    {
1641  22 result.add(dsann);
1642  22 if (labelForCalcId != null)
1643    {
1644  10 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1645    }
1646    }
1647    }
1648    /*
1649    * Save any addable annotations for this sequence
1650    */
1651  14 if (!result.isEmpty())
1652    {
1653  12 candidates.put(seq, result);
1654    }
1655    }
1656    }
1657   
1658    /**
1659    * Adds annotations to the top of the alignment annotations, in the same order
1660    * as their related sequences. If you already have an annotation and want to
1661    * add it to a sequence in an alignment use {@code addReferenceAnnotationTo}
1662    *
1663    * @param annotations
1664    * the annotations to add
1665    * @param alignment
1666    * the alignment to add them to
1667    * @param selectionGroup
1668    * current selection group - may be null, if provided then any added
1669    * annotation will be trimmed to just those columns in the selection
1670    * group
1671    */
 
1672  7 toggle public static void addReferenceAnnotations(
1673    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1674    final AlignmentI alignment, final SequenceGroup selectionGroup)
1675    {
1676  7 for (SequenceI seq : annotations.keySet())
1677    {
1678  8 for (AlignmentAnnotation ann : annotations.get(seq))
1679    {
1680  16 addReferenceAnnotationTo(alignment, seq, ann, selectionGroup);
1681    }
1682    }
1683    }
1684   
 
1685  0 toggle public static boolean isSSAnnotationPresent(
1686    Map<SequenceI, List<AlignmentAnnotation>> annotations)
1687    {
1688   
1689  0 for (SequenceI seq : annotations.keySet())
1690    {
1691  0 if (isSecondaryStructurePresent(
1692    annotations.get(seq).toArray(new AlignmentAnnotation[0])))
1693    {
1694  0 return true;
1695    }
1696    }
1697  0 return false;
1698    }
1699   
1700    /**
1701    * Make a copy of a reference annotation {@code ann} and add it to an
1702    * alignment sequence {@code seq} in {@code alignment}, optionally limited to
1703    * the extent of {@code selectionGroup}
1704    *
1705    * @param alignment
1706    * @param seq
1707    * @param ann
1708    * @param selectionGroup
1709    * current selection group - may be null, if provided then any added
1710    * annotation will be trimmed to just those columns in the selection
1711    * group
1712    * @return annotation added to {@code seq and {@code alignment}
1713    */
 
1714  19 toggle public static AlignmentAnnotation addReferenceAnnotationTo(
1715    final AlignmentI alignment, final SequenceI seq,
1716    final AlignmentAnnotation ann, final SequenceGroup selectionGroup)
1717    {
1718  19 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1719  19 int startRes = 0;
1720  19 int endRes = ann.annotations.length;
1721  19 if (selectionGroup != null)
1722    {
1723  2 startRes = -1 + Math.min(seq.getEnd(), Math.max(seq.getStart(),
1724    seq.findPosition(selectionGroup.getStartRes())));
1725  2 endRes = -1 + Math.min(seq.getEnd(),
1726    seq.findPosition(selectionGroup.getEndRes()));
1727   
1728    }
1729  19 copyAnn.restrict(startRes, endRes + 0);
1730   
1731    /*
1732    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1733    * original annotation is already on the sequence.
1734    */
1735  19 if (!seq.hasAnnotation(ann))
1736    {
1737  19 ContactMatrixI cm = seq.getDatasetSequence().getContactMatrixFor(ann);
1738  19 if (cm != null)
1739    {
1740  7 seq.addContactListFor(copyAnn, cm);
1741    }
1742  19 seq.addAlignmentAnnotation(copyAnn);
1743    }
1744    // adjust for gaps
1745  19 copyAnn.adjustForAlignment();
1746    // add to the alignment and set visible
1747  19 alignment.addAnnotation(copyAnn);
1748  19 copyAnn.visible = true;
1749   
1750  19 return copyAnn;
1751    }
1752   
1753    /**
1754    * Set visibility of alignment annotations of specified types (labels), for
1755    * specified sequences. This supports controls like "Show all secondary
1756    * structure", "Hide all Temp factor", etc.
1757    *
1758    * @al the alignment to scan for annotations
1759    * @param types
1760    * the types (labels) of annotations to be updated
1761    * @param forSequences
1762    * if not null, only annotations linked to one of these sequences are
1763    * in scope for update; if null, acts on all sequence annotations
1764    * @param anyType
1765    * if this flag is true, 'types' is ignored (label not checked)
1766    * @param doShow
1767    * if true, set visibility on, else set off
1768    */
 
1769  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1770    Collection<String> types, List<SequenceI> forSequences,
1771    boolean anyType, boolean doShow)
1772    {
1773  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1774  5 if (anns != null)
1775    {
1776  5 for (AlignmentAnnotation aa : anns)
1777    {
1778  30 if (anyType || types.contains(aa.label))
1779    {
1780  21 if ((aa.sequenceRef != null) && (forSequences == null
1781    || forSequences.contains(aa.sequenceRef)))
1782    {
1783  11 aa.visible = doShow;
1784    }
1785    }
1786    }
1787    }
1788    }
1789   
1790    /**
1791    * Shows or hides auto calculated annotations for a sequence group.
1792    *
1793    * @param al
1794    * The alignment object with the annotations.
1795    * @param type
1796    * The type of annotation to show or hide.
1797    * @param selectedGroup
1798    * The sequence group for which the annotations should be shown or
1799    * hidden.
1800    * @param anyType
1801    * If true, all types of annotations will be shown/hidden.
1802    * @param doShow
1803    * If true, the annotations will be shown; if false, annotations will
1804    * be hidden.
1805    */
 
1806  0 toggle public static void showOrHideAutoCalculatedAnnotationsForGroup(
1807    AlignmentI al, String type, SequenceGroup selectedGroup,
1808    boolean anyType, boolean doShow)
1809    {
1810    // Get all alignment annotations
1811  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1812   
1813  0 if (anns != null)
1814    {
1815  0 for (AlignmentAnnotation aa : anns)
1816    {
1817    // Check if anyType is true or if the annotation's label contains the
1818    // specified type (currently for secondary structure consensus)
1819  0 if ((anyType && aa.label
1820    .startsWith(Constants.SECONDARY_STRUCTURE_CONSENSUS_LABEL))
1821    || aa.label.startsWith(type))
1822    {
1823    // If the annotation's group reference is not null and matches the
1824    // selected group, update its visibility.
1825  0 if (aa.groupRef != null && selectedGroup == aa.groupRef)
1826    {
1827  0 aa.visible = doShow;
1828    }
1829    }
1830    }
1831    }
1832    }
1833   
 
1834  0 toggle public static AlignmentAnnotation getFirstSequenceAnnotationOfType(
1835    AlignmentI al, int graphType)
1836    {
1837  0 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1838  0 if (anns != null)
1839    {
1840  0 for (AlignmentAnnotation aa : anns)
1841    {
1842  0 if (aa.sequenceRef != null && aa.graph == graphType)
1843  0 return aa;
1844    }
1845    }
1846  0 return null;
1847    }
1848   
1849    /**
1850    * Returns true if either sequence has a cross-reference to the other
1851    *
1852    * @param seq1
1853    * @param seq2
1854    * @return
1855    */
 
1856  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1857    {
1858    // Note: moved here from class CrossRef as the latter class has dependencies
1859    // not availability to the applet's classpath
1860  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1861    }
1862   
1863    /**
1864    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1865    * that sequence name is structured as Source|AccessionId.
1866    *
1867    * @param seq1
1868    * @param seq2
1869    * @return
1870    */
 
1871  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1872    {
1873  108 if (seq1 == null || seq2 == null)
1874    {
1875  8 return false;
1876    }
1877  100 String name = seq2.getName();
1878  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1879  100 if (xrefs != null)
1880    {
1881  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1882    {
1883  24 DBRefEntry xref = xrefs.get(ix);
1884  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1885    // case-insensitive test, consistent with DBRefEntry.equalRef()
1886  24 if (xrefName.equalsIgnoreCase(name))
1887    {
1888  12 return true;
1889    }
1890    }
1891    }
1892  88 return false;
1893    }
1894   
1895    /**
1896    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1897    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1898    * added to the original alignment's dataset, which is shared by the new
1899    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1900    * added to the alignment dataset.
1901    *
1902    * @param dna
1903    * aligned nucleotide (dna or cds) sequences
1904    * @param dataset
1905    * the alignment dataset the sequences belong to
1906    * @param products
1907    * (optional) to restrict results to CDS that map to specified
1908    * protein products
1909    * @return an alignment whose sequences are the cds-only parts of the dna
1910    * sequences (or null if no mappings are found)
1911    */
 
1912  4 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1913    AlignmentI dataset, SequenceI[] products)
1914    {
1915  4 if (dataset == null || dataset.getDataset() != null)
1916    {
1917  0 throw new IllegalArgumentException(
1918    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1919    }
1920  4 List<SequenceI> foundSeqs = new ArrayList<>();
1921  4 List<SequenceI> cdsSeqs = new ArrayList<>();
1922  4 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1923  4 HashSet<SequenceI> productSeqs = null;
1924  4 if (products != null)
1925    {
1926  1 productSeqs = new HashSet<>();
1927  1 for (SequenceI seq : products)
1928    {
1929  2 productSeqs.add(seq.getDatasetSequence() == null ? seq
1930    : seq.getDatasetSequence());
1931    }
1932    }
1933   
1934    /*
1935    * Construct CDS sequences from mappings on the alignment dataset.
1936    * The logic is:
1937    * - find the protein product(s) mapped to from each dna sequence
1938    * - if the mapping covers the whole dna sequence (give or take start/stop
1939    * codon), take the dna as the CDS sequence
1940    * - else search dataset mappings for a suitable dna sequence, i.e. one
1941    * whose whole sequence is mapped to the protein
1942    * - if no sequence found, construct one from the dna sequence and mapping
1943    * (and add it to dataset so it is found if this is repeated)
1944    */
1945  4 for (SequenceI dnaSeq : dna)
1946    {
1947  8 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1948    : dnaSeq.getDatasetSequence();
1949   
1950  8 List<AlignedCodonFrame> seqMappings = MappingUtils
1951    .findMappingsForSequence(dnaSeq, mappings);
1952  8 for (AlignedCodonFrame mapping : seqMappings)
1953    {
1954  9 List<Mapping> mappingsFromSequence = mapping
1955    .getMappingsFromSequence(dnaSeq);
1956   
1957  9 for (Mapping aMapping : mappingsFromSequence)
1958    {
1959  11 MapList mapList = aMapping.getMap();
1960  11 if (mapList.getFromRatio() == 1)
1961    {
1962    /*
1963    * not a dna-to-protein mapping (likely dna-to-cds)
1964    */
1965  0 continue;
1966    }
1967   
1968    /*
1969    * skip if mapping is not to one of the target set of proteins
1970    */
1971  11 SequenceI proteinProduct = aMapping.getTo();
1972  11 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1973    {
1974  2 continue;
1975    }
1976   
1977    /*
1978    * try to locate the CDS from the dataset mappings;
1979    * guard against duplicate results (for the case that protein has
1980    * dbrefs to both dna and cds sequences)
1981    */
1982  9 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1983    seqMappings, aMapping);
1984  9 if (cdsSeq != null)
1985    {
1986  0 if (!foundSeqs.contains(cdsSeq))
1987    {
1988  0 foundSeqs.add(cdsSeq);
1989  0 SequenceI derivedSequence = cdsSeq.deriveSequence();
1990  0 cdsSeqs.add(derivedSequence);
1991  0 if (!dataset.getSequences().contains(cdsSeq))
1992    {
1993  0 dataset.addSequence(cdsSeq);
1994    }
1995    }
1996  0 continue;
1997    }
1998   
1999    /*
2000    * didn't find mapped CDS sequence - construct it and add
2001    * its dataset sequence to the dataset
2002    */
2003  9 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
2004    dataset).deriveSequence();
2005    // cdsSeq has a name constructed as CDS|<dbref>
2006    // <dbref> will be either the accession for the coding sequence,
2007    // marked in the /via/ dbref to the protein product accession
2008    // or it will be the original nucleotide accession.
2009  9 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
2010   
2011  9 cdsSeqs.add(cdsSeq);
2012   
2013    /*
2014    * build the mapping from CDS to protein
2015    */
2016  9 List<int[]> cdsRange = Collections
2017    .singletonList(new int[]
2018    { cdsSeq.getStart(),
2019    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
2020  9 MapList cdsToProteinMap = new MapList(cdsRange,
2021    mapList.getToRanges(), mapList.getFromRatio(),
2022    mapList.getToRatio());
2023   
2024  9 if (!dataset.getSequences().contains(cdsSeqDss))
2025    {
2026    /*
2027    * if this sequence is a newly created one, add it to the dataset
2028    * and made a CDS to protein mapping (if sequence already exists,
2029    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
2030    */
2031  9 dataset.addSequence(cdsSeqDss);
2032  9 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
2033  9 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
2034    cdsToProteinMap);
2035   
2036    /*
2037    * guard against duplicating the mapping if repeating this action
2038    */
2039  9 if (!mappings.contains(cdsToProteinMapping))
2040    {
2041  9 mappings.add(cdsToProteinMapping);
2042    }
2043    }
2044   
2045  9 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
2046    proteinProduct, aMapping);
2047    /*
2048    * add another mapping from original 'from' range to CDS
2049    */
2050  9 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
2051  9 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
2052    cdsRange, 1, 1);
2053  9 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
2054    dnaToCdsMap);
2055  9 if (!mappings.contains(dnaToCdsMapping))
2056    {
2057  9 mappings.add(dnaToCdsMapping);
2058    }
2059   
2060    /*
2061    * transfer dna chromosomal loci (if known) to the CDS
2062    * sequence (via the mapping)
2063    */
2064  9 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
2065  9 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
2066   
2067    /*
2068    * add DBRef with mapping from protein to CDS
2069    * (this enables Get Cross-References from protein alignment)
2070    * This is tricky because we can't have two DBRefs with the
2071    * same source and accession, so need a different accession for
2072    * the CDS from the dna sequence
2073    */
2074   
2075    // specific use case:
2076    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
2077    // ENSG02, ENSG03, with transcripts and products similarly named.
2078    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
2079   
2080    // JBPNote: ?? can't actually create an example that demonstrates we
2081    // need to
2082    // synthesize an xref.
2083   
2084  9 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
2085  11 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
2086    {
2087  2 DBRefEntry primRef = primrefs.get(ip);
2088    /*
2089    * create a cross-reference from CDS to the source sequence's
2090    * primary reference and vice versa
2091    */
2092  2 String source = primRef.getSource();
2093  2 String version = primRef.getVersion();
2094  2 DBRefEntry cdsCrossRef = new DBRefEntry(source,
2095    source + ":" + version, primRef.getAccessionId());
2096  2 cdsCrossRef
2097    .setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
2098  2 cdsSeqDss.addDBRef(cdsCrossRef);
2099   
2100  2 dnaSeq.addDBRef(new DBRefEntry(source, version,
2101    cdsSeq.getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
2102    // problem here is that the cross-reference is synthesized -
2103    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
2104    // 'CDS|emblcdsacc'
2105    // assuming cds version same as dna ?!?
2106   
2107  2 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
2108    cdsSeq.getName());
2109    //
2110  2 proteinToCdsRef.setMap(
2111    new Mapping(cdsSeqDss, cdsToProteinMap.getInverse()));
2112  2 proteinProduct.addDBRef(proteinToCdsRef);
2113    }
2114    /*
2115    * transfer any features on dna that overlap the CDS
2116    */
2117  9 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
2118    SequenceOntologyI.CDS);
2119    }
2120    }
2121    }
2122   
2123  4 AlignmentI cds = new Alignment(
2124    cdsSeqs.toArray(new SequenceI[cdsSeqs.size()]));
2125  4 cds.setDataset(dataset);
2126   
2127  4 return cds;
2128    }
2129   
2130    /**
2131    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
2132    * toSeq, mediated by the given mapping between the sequences
2133    *
2134    * @param fromSeq
2135    * @param targetToFrom
2136    * Map
2137    * @param targetSeq
2138    */
 
2139  12 toggle protected static void transferGeneLoci(SequenceI fromSeq,
2140    MapList targetToFrom, SequenceI targetSeq)
2141    {
2142  12 if (targetSeq.getGeneLoci() != null)
2143    {
2144    // already have - don't override
2145  1 return;
2146    }
2147  11 GeneLociI fromLoci = fromSeq.getGeneLoci();
2148  11 if (fromLoci == null)
2149    {
2150  10 return;
2151    }
2152   
2153  1 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
2154   
2155  1 if (newMap != null)
2156    {
2157  1 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
2158    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
2159    }
2160    }
2161   
2162    /**
2163    * A helper method that finds a CDS sequence in the alignment dataset that is
2164    * mapped to the given protein sequence, and either is, or has a mapping from,
2165    * the given dna sequence.
2166    *
2167    * @param mappings
2168    * set of all mappings on the dataset
2169    * @param dnaSeq
2170    * a dna (or cds) sequence we are searching from
2171    * @param seqMappings
2172    * the set of mappings involving dnaSeq
2173    * @param aMapping
2174    * a transcript-to-peptide mapping
2175    * @return
2176    */
 
2177  16 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
2178    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
2179    Mapping aMapping)
2180    {
2181    /*
2182    * TODO a better dna-cds-protein mapping data representation to allow easy
2183    * navigation; until then this clunky looping around lists of mappings
2184    */
2185  16 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
2186    : dnaSeq.getDatasetSequence();
2187  16 SequenceI proteinProduct = aMapping.getTo();
2188   
2189    /*
2190    * is this mapping from the whole dna sequence (i.e. CDS)?
2191    * allowing for possible stop codon on dna but not peptide
2192    */
2193  16 int mappedFromLength = MappingUtils
2194    .getLength(aMapping.getMap().getFromRanges());
2195  16 int dnaLength = seqDss.getLength();
2196  16 if (mappedFromLength == dnaLength
2197    || mappedFromLength == dnaLength - CODON_LENGTH)
2198    {
2199    /*
2200    * if sequence has CDS features, this is a transcript with no UTR
2201    * - do not take this as the CDS sequence! (JAL-2789)
2202    */
2203  4 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
2204    .isEmpty())
2205    {
2206  1 return seqDss;
2207    }
2208    }
2209   
2210    /*
2211    * looks like we found the dna-to-protein mapping; search for the
2212    * corresponding cds-to-protein mapping
2213    */
2214  15 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
2215    .findMappingsForSequence(proteinProduct, mappings);
2216  15 for (AlignedCodonFrame acf : mappingsToPeptide)
2217    {
2218  19 for (SequenceToSequenceMapping map : acf.getMappings())
2219    {
2220  23 Mapping mapping = map.getMapping();
2221  23 if (mapping != aMapping
2222    && mapping.getMap().getFromRatio() == CODON_LENGTH
2223    && proteinProduct == mapping.getTo()
2224    && seqDss != map.getFromSeq())
2225    {
2226  4 mappedFromLength = MappingUtils
2227    .getLength(mapping.getMap().getFromRanges());
2228  4 if (mappedFromLength == map.getFromSeq().getLength())
2229    {
2230    /*
2231    * found a 3:1 mapping to the protein product which covers
2232    * the whole dna sequence i.e. is from CDS; finally check the CDS
2233    * is mapped from the given dna start sequence
2234    */
2235  4 SequenceI cdsSeq = map.getFromSeq();
2236    // todo this test is weak if seqMappings contains multiple mappings;
2237    // we get away with it if transcript:cds relationship is 1:1
2238  4 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
2239    .findMappingsForSequence(cdsSeq, seqMappings);
2240  4 if (!dnaToCdsMaps.isEmpty())
2241    {
2242  2 return cdsSeq;
2243    }
2244    }
2245    }
2246    }
2247    }
2248  13 return null;
2249    }
2250   
2251    /**
2252    * Helper method that makes a CDS sequence as defined by the mappings from the
2253    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
2254    * forward or reverse strand).
2255    *
2256    * @param seq
2257    * @param mapping
2258    * @param dataset
2259    * - existing dataset. We check for sequences that look like the CDS
2260    * we are about to construct, if one exists already, then we will
2261    * just return that one.
2262    * @return CDS sequence (as a dataset sequence)
2263    */
 
2264  9 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
2265    AlignmentI dataset)
2266    {
2267    /*
2268    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
2269    * if set (e.g. EMBL protein_id), else sequence name appended
2270    */
2271  9 String mapFromId = mapping.getMappedFromId();
2272  9 final String seqId = "CDS|"
2273  9 + (mapFromId != null ? mapFromId : seq.getName());
2274   
2275  9 SequenceI newSeq = null;
2276   
2277    /*
2278    * construct CDS sequence by splicing mapped from ranges
2279    */
2280  9 char[] seqChars = seq.getSequence();
2281  9 List<int[]> fromRanges = mapping.getMap().getFromRanges();
2282  9 int cdsWidth = MappingUtils.getLength(fromRanges);
2283  9 char[] newSeqChars = new char[cdsWidth];
2284   
2285  9 int newPos = 0;
2286  9 for (int[] range : fromRanges)
2287    {
2288  21 if (range[0] <= range[1])
2289    {
2290    // forward strand mapping - just copy the range
2291  21 int length = range[1] - range[0] + 1;
2292  21 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2293    length);
2294  21 newPos += length;
2295    }
2296    else
2297    {
2298    // reverse strand mapping - copy and complement one by one
2299  0 for (int i = range[0]; i >= range[1]; i--)
2300    {
2301  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2302    }
2303    }
2304   
2305  21 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2306    }
2307   
2308  9 if (dataset != null)
2309    {
2310  9 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2311  9 if (matches != null)
2312    {
2313  9 boolean matched = false;
2314  9 for (SequenceI mtch : matches)
2315    {
2316  3 if (mtch.getStart() != newSeq.getStart())
2317    {
2318  0 continue;
2319    }
2320  3 if (mtch.getEnd() != newSeq.getEnd())
2321    {
2322  0 continue;
2323    }
2324  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2325    {
2326  3 continue;
2327    }
2328  0 if (!matched)
2329    {
2330  0 matched = true;
2331  0 newSeq = mtch;
2332    }
2333    else
2334    {
2335  0 Console.error(
2336    "JAL-2154 regression: warning - found (and ignored) a duplicate CDS sequence:"
2337    + mtch.toString());
2338    }
2339    }
2340    }
2341    }
2342    // newSeq.setDescription(mapFromId);
2343   
2344  9 return newSeq;
2345    }
2346   
2347    /**
2348    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2349    * the given mapping.
2350    *
2351    * @param cdsSeq
2352    * @param contig
2353    * @param proteinProduct
2354    * @param mapping
2355    * @return list of DBRefEntrys added
2356    */
 
2357  9 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2358    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2359    {
2360   
2361    // gather direct refs from contig congruent with mapping
2362  9 List<DBRefEntry> direct = new ArrayList<>();
2363  9 HashSet<String> directSources = new HashSet<>();
2364   
2365  9 List<DBRefEntry> refs = contig.getDBRefs();
2366  9 if (refs != null)
2367    {
2368  6 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2369    {
2370  4 DBRefEntry dbr = refs.get(ib);
2371  4 MapList map;
2372  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2373    {
2374    // check if map is the CDS mapping
2375  2 if (mapping.getMap().equals(map))
2376    {
2377  2 direct.add(dbr);
2378  2 directSources.add(dbr.getSource());
2379    }
2380    }
2381    }
2382    }
2383  9 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2384    proteinProduct.getDBRefs(),
2385    directSources.toArray(new String[0]));
2386  9 List<DBRefEntry> propagated = new ArrayList<>();
2387   
2388    // and generate appropriate mappings
2389  11 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2390    {
2391  2 DBRefEntry cdsref = direct.get(ic);
2392  2 Mapping m = cdsref.getMap();
2393    // clone maplist and mapping
2394  2 MapList cdsposmap = new MapList(
2395    Arrays.asList(new int[][]
2396    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2397    m.getMap().getToRanges(), 3, 1);
2398  2 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2399   
2400    // create dbref
2401  2 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2402    cdsref.getVersion(), cdsref.getAccessionId(),
2403    new Mapping(cdsmap.getTo(), cdsposmap));
2404   
2405    // and see if we can map to the protein product for this mapping.
2406    // onSource is the filtered set of accessions on protein that we are
2407    // tranferring, so we assume accession is the same.
2408  2 if (cdsmap.getTo() == null && onSource != null)
2409    {
2410  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2411    cdsref.getAccessionId());
2412  2 if (sourceRefs != null)
2413    {
2414  2 for (DBRefEntry srcref : sourceRefs)
2415    {
2416  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2417    {
2418    // we have found a complementary dbref on the protein product, so
2419    // update mapping's getTo
2420  2 newref.getMap().setTo(proteinProduct);
2421    }
2422    }
2423    }
2424    }
2425  2 cdsSeq.addDBRef(newref);
2426  2 propagated.add(newref);
2427    }
2428  9 return propagated;
2429    }
2430   
2431    /**
2432    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2433    * feature start/end ranges, optionally omitting specified feature types.
2434    * Returns the number of features copied.
2435    *
2436    * @param fromSeq
2437    * @param toSeq
2438    * @param mapping
2439    * the mapping from 'fromSeq' to 'toSeq'
2440    * @param select
2441    * if not null, only features of this type are copied (including
2442    * subtypes in the Sequence Ontology)
2443    * @param omitting
2444    */
 
2445  12 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2446    MapList mapping, String select, String... omitting)
2447    {
2448  12 SequenceI copyTo = toSeq;
2449  21 while (copyTo.getDatasetSequence() != null)
2450    {
2451  9 copyTo = copyTo.getDatasetSequence();
2452    }
2453  12 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2454    {
2455  0 return 0; // shared dataset sequence
2456    }
2457   
2458    /*
2459    * get features, optionally restricted by an ontology term
2460    */
2461  12 List<SequenceFeature> sfs = select == null
2462    ? fromSeq.getFeatures().getPositionalFeatures()
2463    : fromSeq.getFeatures().getFeaturesByOntology(select);
2464   
2465  12 int count = 0;
2466  12 for (SequenceFeature sf : sfs)
2467    {
2468  14 String type = sf.getType();
2469  14 boolean omit = false;
2470  14 for (String toOmit : omitting)
2471    {
2472  7 if (type.equals(toOmit))
2473    {
2474  2 omit = true;
2475    }
2476    }
2477  14 if (omit)
2478    {
2479  2 continue;
2480    }
2481   
2482    /*
2483    * locate the mapped range - null if either start or end is
2484    * not mapped (no partial overlaps are calculated)
2485    */
2486  12 int start = sf.getBegin();
2487  12 int end = sf.getEnd();
2488  12 int[] mappedTo = mapping.locateInTo(start, end);
2489    /*
2490    * if whole exon range doesn't map, try interpreting it
2491    * as 5' or 3' exon overlapping the CDS range
2492    */
2493  12 if (mappedTo == null)
2494    {
2495  3 mappedTo = mapping.locateInTo(end, end);
2496  3 if (mappedTo != null)
2497    {
2498    /*
2499    * end of exon is in CDS range - 5' overlap
2500    * to a range from the start of the peptide
2501    */
2502  0 mappedTo[0] = 1;
2503    }
2504    }
2505  12 if (mappedTo == null)
2506    {
2507  3 mappedTo = mapping.locateInTo(start, start);
2508  3 if (mappedTo != null)
2509    {
2510    /*
2511    * start of exon is in CDS range - 3' overlap
2512    * to a range up to the end of the peptide
2513    */
2514  0 mappedTo[1] = toSeq.getLength();
2515    }
2516    }
2517  12 if (mappedTo != null)
2518    {
2519  9 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2520  9 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2521  9 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2522    sf.getFeatureGroup(), sf.getScore());
2523  9 copyTo.addSequenceFeature(copy);
2524  9 count++;
2525    }
2526    }
2527  12 return count;
2528    }
2529   
2530    /**
2531    * Returns a mapping from dna to protein by inspecting sequence features of
2532    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2533    * length is 3 times the peptide length (optionally after dropping a trailing
2534    * stop codon). This method does not check whether the CDS nucleotide sequence
2535    * translates to the peptide sequence.
2536    *
2537    * @param dnaSeq
2538    * @param proteinSeq
2539    * @return
2540    */
 
2541  21 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2542    SequenceI proteinSeq)
2543    {
2544  21 List<int[]> ranges = findCdsPositions(dnaSeq);
2545  21 int mappedDnaLength = MappingUtils.getLength(ranges);
2546   
2547    /*
2548    * if not a whole number of codons, truncate mapping
2549    */
2550  21 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2551  21 if (codonRemainder > 0)
2552    {
2553  2 mappedDnaLength -= codonRemainder;
2554  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2555    }
2556   
2557  21 int proteinLength = proteinSeq.getLength();
2558  21 int proteinStart = proteinSeq.getStart();
2559  21 int proteinEnd = proteinSeq.getEnd();
2560   
2561    /*
2562    * incomplete start codon may mean X at start of peptide
2563    * we ignore both for mapping purposes
2564    */
2565  21 if (proteinSeq.getCharAt(0) == 'X')
2566    {
2567    // todo JAL-2022 support startPhase > 0
2568  1 proteinStart++;
2569  1 proteinLength--;
2570    }
2571  21 List<int[]> proteinRange = new ArrayList<>();
2572   
2573    /*
2574    * dna length should map to protein (or protein plus stop codon)
2575    */
2576  21 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2577  21 if (codesForResidues == (proteinLength + 1))
2578    {
2579    // assuming extra codon is for STOP and not in peptide
2580    // todo: check trailing codon is indeed a STOP codon
2581  2 codesForResidues--;
2582  2 mappedDnaLength -= CODON_LENGTH;
2583  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2584    }
2585   
2586  21 if (codesForResidues == proteinLength)
2587    {
2588  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2589  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2590    }
2591  17 return null;
2592    }
2593   
2594    /**
2595    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2596    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2597    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2598    * position order, so this method is only valid for linear CDS in the same
2599    * sense as the protein product.
2600    *
2601    * @param dnaSeq
2602    * @return
2603    */
 
2604  23 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2605    {
2606  23 List<int[]> result = new ArrayList<>();
2607   
2608  23 List<SequenceFeature> sfs = dnaSeq.getFeatures()
2609    .getFeaturesByOntology(SequenceOntologyI.CDS);
2610  23 if (sfs.isEmpty())
2611    {
2612  15 return result;
2613    }
2614  8 SequenceFeatures.sortFeatures(sfs, true);
2615   
2616  8 for (SequenceFeature sf : sfs)
2617    {
2618  16 int phase = 0;
2619  16 try
2620    {
2621  16 String s = sf.getPhase();
2622  16 if (s != null)
2623    {
2624  2 phase = Integer.parseInt(s);
2625    }
2626    } catch (NumberFormatException e)
2627    {
2628    // leave as zero
2629    }
2630    /*
2631    * phase > 0 on first codon means 5' incomplete - skip to the start
2632    * of the next codon; example ENST00000496384
2633    */
2634  16 int begin = sf.getBegin();
2635  16 int end = sf.getEnd();
2636  16 if (result.isEmpty() && phase > 0)
2637    {
2638  2 begin += phase;
2639  2 if (begin > end)
2640    {
2641    // shouldn't happen!
2642  0 System.err
2643    .println("Error: start phase extends beyond start CDS in "
2644    + dnaSeq.getName());
2645    }
2646    }
2647  16 result.add(new int[] { begin, end });
2648    }
2649   
2650    /*
2651    * Finally sort ranges by start position. This avoids a dependency on
2652    * keeping features in order on the sequence (if they are in order anyway,
2653    * the sort will have almost no work to do). The implicit assumption is CDS
2654    * ranges are assembled in order. Other cases should not use this method,
2655    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2656    */
2657  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2658  8 return result;
2659    }
2660   
2661    /**
2662    * Makes an alignment with a copy of the given sequences, adding in any
2663    * non-redundant sequences which are mapped to by the cross-referenced
2664    * sequences.
2665    *
2666    * @param seqs
2667    * @param xrefs
2668    * @param dataset
2669    * the alignment dataset shared by the new copy
2670    * @return
2671    */
 
2672  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2673    SequenceI[] xrefs, AlignmentI dataset)
2674    {
2675  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2676  0 copy.setDataset(dataset);
2677  0 boolean isProtein = !copy.isNucleotide();
2678  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2679  0 if (xrefs != null)
2680    {
2681    // BH 2019.01.25 recoded to remove iterators
2682   
2683  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2684    {
2685  0 SequenceI xref = xrefs[ix];
2686  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2687  0 if (dbrefs != null)
2688    {
2689  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2690    {
2691  0 DBRefEntry dbref = dbrefs.get(ir);
2692  0 Mapping map = dbref.getMap();
2693  0 SequenceI mto;
2694  0 if (map == null || (mto = map.getTo()) == null
2695    || mto.isProtein() != isProtein)
2696    {
2697  0 continue;
2698    }
2699  0 SequenceI mappedTo = mto;
2700  0 SequenceI match = matcher.findIdMatch(mappedTo);
2701  0 if (match == null)
2702    {
2703  0 matcher.add(mappedTo);
2704  0 copy.addSequence(mappedTo);
2705    }
2706    }
2707    }
2708    }
2709    }
2710  0 return copy;
2711    }
2712   
2713    /**
2714    * Try to align sequences in 'unaligned' to match the alignment of their
2715    * mapped regions in 'aligned'. For example, could use this to align CDS
2716    * sequences which are mapped to their parent cDNA sequences.
2717    *
2718    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2719    * dna-to-protein or protein-to-dna use alternative methods.
2720    *
2721    * @param unaligned
2722    * sequences to be aligned
2723    * @param aligned
2724    * holds aligned sequences and their mappings
2725    * @return
2726    */
 
2727  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2728    {
2729    /*
2730    * easy case - aligning a copy of aligned sequences
2731    */
2732  4 if (alignAsSameSequences(unaligned, aligned))
2733    {
2734  0 return unaligned.getHeight();
2735    }
2736   
2737    /*
2738    * fancy case - aligning via mappings between sequences
2739    */
2740  4 List<SequenceI> unmapped = new ArrayList<>();
2741  4 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2742    unaligned, aligned, unmapped);
2743  4 int width = columnMap.size();
2744  4 char gap = unaligned.getGapCharacter();
2745  4 int realignedCount = 0;
2746    // TODO: verify this loop scales sensibly for very wide/high alignments
2747   
2748  4 for (SequenceI seq : unaligned.getSequences())
2749    {
2750  8 if (!unmapped.contains(seq))
2751    {
2752  8 char[] newSeq = new char[width];
2753  8 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2754    // Integer iteration below
2755  8 int newCol = 0;
2756  8 int lastCol = 0;
2757   
2758    /*
2759    * traverse the map to find columns populated
2760    * by our sequence
2761    */
2762  8 for (Integer column : columnMap.keySet())
2763    {
2764  84 Character c = columnMap.get(column).get(seq);
2765  84 if (c != null)
2766    {
2767    /*
2768    * sequence has a character at this position
2769    *
2770    */
2771  52 newSeq[newCol] = c;
2772  52 lastCol = newCol;
2773    }
2774  84 newCol++;
2775    }
2776   
2777    /*
2778    * trim trailing gaps
2779    */
2780  8 if (lastCol < width)
2781    {
2782  8 char[] tmp = new char[lastCol + 1];
2783  8 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2784  8 newSeq = tmp;
2785    }
2786    // TODO: optimise SequenceI to avoid char[]->String->char[]
2787  8 seq.setSequence(String.valueOf(newSeq));
2788  8 realignedCount++;
2789    }
2790    }
2791  4 return realignedCount;
2792    }
2793   
2794    /**
2795    * If unaligned and aligned sequences share the same dataset sequences, then
2796    * simply copies the aligned sequences to the unaligned sequences and returns
2797    * true; else returns false
2798    *
2799    * @param unaligned
2800    * - sequences to be aligned based on aligned
2801    * @param aligned
2802    * - 'guide' alignment containing sequences derived from same dataset
2803    * as unaligned
2804    * @return
2805    */
 
2806  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2807    AlignmentI aligned)
2808    {
2809  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2810    {
2811  0 return false; // should only pass alignments with datasets here
2812    }
2813   
2814    // map from dataset sequence to alignment sequence(s)
2815  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2816  8 for (SequenceI seq : aligned.getSequences())
2817    {
2818  19 SequenceI ds = seq.getDatasetSequence();
2819  19 if (alignedDatasets.get(ds) == null)
2820    {
2821  18 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2822    }
2823  19 alignedDatasets.get(ds).add(seq);
2824    }
2825   
2826    /*
2827    * first pass - check whether all sequences to be aligned share a
2828    * dataset sequence with an aligned sequence; also note the leftmost
2829    * ungapped column from which to copy
2830    */
2831  8 int leftmost = Integer.MAX_VALUE;
2832  8 for (SequenceI seq : unaligned.getSequences())
2833    {
2834  14 final SequenceI ds = seq.getDatasetSequence();
2835  14 if (!alignedDatasets.containsKey(ds))
2836    {
2837  5 return false;
2838    }
2839  9 SequenceI alignedSeq = alignedDatasets.get(ds).get(0);
2840  9 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2841  9 leftmost = Math.min(leftmost, startCol);
2842    }
2843   
2844    /*
2845    * second pass - copy aligned sequences;
2846    * heuristic rule: pair off sequences in order for the case where
2847    * more than one shares the same dataset sequence
2848    */
2849  3 final char gapCharacter = aligned.getGapCharacter();
2850  3 for (SequenceI seq : unaligned.getSequences())
2851    {
2852  7 List<SequenceI> alignedSequences = alignedDatasets
2853    .get(seq.getDatasetSequence());
2854  7 if (alignedSequences.isEmpty())
2855    {
2856    /*
2857    * defensive check - shouldn't happen! (JAL-3536)
2858    */
2859  0 continue;
2860    }
2861  7 SequenceI alignedSeq = alignedSequences.get(0);
2862   
2863    /*
2864    * gap fill for leading (5') UTR if any
2865    */
2866    // TODO this copies intron columns - wrong!
2867  7 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2868  7 int endCol = alignedSeq.findIndex(seq.getEnd());
2869  7 char[] seqchars = new char[endCol - leftmost + 1];
2870  7 Arrays.fill(seqchars, gapCharacter);
2871  7 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2872  7 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2873    toCopy.length);
2874  7 seq.setSequence(String.valueOf(seqchars));
2875  7 if (alignedSequences.size() > 0)
2876    {
2877    // pop off aligned sequences (except the last one)
2878  7 alignedSequences.remove(0);
2879    }
2880    }
2881   
2882    /*
2883    * finally remove gapped columns (e.g. introns)
2884    */
2885  3 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2886    unaligned.getWidth() - 1, unaligned);
2887   
2888  3 return true;
2889    }
2890   
2891    /**
2892    * Returns a map whose key is alignment column number (base 1), and whose
2893    * values are a map of sequence characters in that column.
2894    *
2895    * @param unaligned
2896    * @param aligned
2897    * @param unmapped
2898    * @return
2899    */
 
2900  4 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2901    AlignmentI unaligned, AlignmentI aligned,
2902    List<SequenceI> unmapped)
2903    {
2904    /*
2905    * Map will hold, for each aligned column position, a map of
2906    * {unalignedSequence, characterPerSequence} at that position.
2907    * TreeMap keeps the entries in ascending column order.
2908    */
2909  4 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2910   
2911    /*
2912    * record any sequences that have no mapping so can't be realigned
2913    */
2914  4 unmapped.addAll(unaligned.getSequences());
2915   
2916  4 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2917   
2918  4 for (SequenceI seq : unaligned.getSequences())
2919    {
2920  8 for (AlignedCodonFrame mapping : mappings)
2921    {
2922  12 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2923  12 if (fromSeq != null)
2924    {
2925  8 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2926  8 if (addMappedPositions(seq, fromSeq, seqMap, map))
2927    {
2928  8 unmapped.remove(seq);
2929    }
2930    }
2931    }
2932    }
2933  4 return map;
2934    }
2935   
2936    /**
2937    * Helper method that adds to a map the mapped column positions of a sequence.
2938    * <br>
2939    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2940    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2941    * sequence.
2942    *
2943    * @param seq
2944    * the sequence whose column positions we are recording
2945    * @param fromSeq
2946    * a sequence that is mapped to the first sequence
2947    * @param seqMap
2948    * the mapping from 'fromSeq' to 'seq'
2949    * @param map
2950    * a map to add the column positions (in fromSeq) of the mapped
2951    * positions of seq
2952    * @return
2953    */
 
2954  10 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2955    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2956    {
2957  10 if (seqMap == null)
2958    {
2959  0 return false;
2960    }
2961   
2962    /*
2963    * invert mapping if it is from unaligned to aligned sequence
2964    */
2965  10 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2966    {
2967  2 seqMap = new Mapping(seq.getDatasetSequence(),
2968    seqMap.getMap().getInverse());
2969    }
2970   
2971  10 int toStart = seq.getStart();
2972   
2973    /*
2974    * traverse [start, end, start, end...] ranges in fromSeq
2975    */
2976  10 for (int[] fromRange : seqMap.getMap().getFromRanges())
2977    {
2978  26 for (int i = 0; i < fromRange.length - 1; i += 2)
2979    {
2980  13 boolean forward = fromRange[i + 1] >= fromRange[i];
2981   
2982    /*
2983    * find the range mapped to (sequence positions base 1)
2984    */
2985  13 int[] range = seqMap.locateMappedRange(fromRange[i],
2986    fromRange[i + 1]);
2987  13 if (range == null)
2988    {
2989  0 jalview.bin.Console.errPrintln("Error in mapping " + seqMap
2990    + " from " + fromSeq.getName());
2991  0 return false;
2992    }
2993  13 int fromCol = fromSeq.findIndex(fromRange[i]);
2994  13 int mappedCharPos = range[0];
2995   
2996    /*
2997    * walk over the 'from' aligned sequence in forward or reverse
2998    * direction; when a non-gap is found, record the column position
2999    * of the next character of the mapped-to sequence; stop when all
3000    * the characters of the range have been counted
3001    */
3002  96 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
3003    && fromCol >= 0)
3004    {
3005  83 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
3006    {
3007    /*
3008    * mapped from sequence has a character in this column
3009    * record the column position for the mapped to character
3010    */
3011  64 Map<SequenceI, Character> seqsMap = map.get(fromCol);
3012  64 if (seqsMap == null)
3013    {
3014  54 seqsMap = new HashMap<>();
3015  54 map.put(fromCol, seqsMap);
3016    }
3017  64 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
3018  64 mappedCharPos++;
3019    }
3020  83 fromCol += (forward ? 1 : -1);
3021    }
3022    }
3023    }
3024  10 return true;
3025    }
3026   
3027    // strictly temporary hack until proper criteria for aligning protein to cds
3028    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
3029  0 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
3030    {
3031  0 for (SequenceI seq : alignment.getSequences())
3032    {
3033  0 String name = seq.getName();
3034  0 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
3035    {
3036  0 return false;
3037    }
3038    }
3039  0 return true;
3040    }
3041   
 
3042  5 toggle public static boolean isSecondaryStructurePresent(
3043    AlignmentAnnotation[] annotations)
3044    {
3045  5 boolean ssPresent = false;
3046   
3047  5 for (AlignmentAnnotation aa : annotations)
3048    {
3049  5 if (ssPresent)
3050    {
3051  0 break;
3052    }
3053   
3054  5 if (Constants.SECONDARY_STRUCTURE_LABELS.containsKey(aa.label))
3055    {
3056  3 ssPresent = true;
3057  3 break;
3058    }
3059    }
3060   
3061  5 return ssPresent;
3062   
3063    }
3064   
 
3065  0 toggle public static Color getSecondaryStructureAnnotationColour(char symbol)
3066    {
3067   
3068  0 if (symbol == Constants.COIL)
3069    {
3070  0 return Color.gray;
3071    }
3072  0 if (symbol == Constants.SHEET)
3073    {
3074  0 return Color.green;
3075    }
3076  0 if (symbol == Constants.HELIX)
3077    {
3078  0 return Color.red;
3079    }
3080   
3081  0 return Color.white;
3082    }
3083   
 
3084  19832 toggle public static char findSSAnnotationForGivenSeqposition(
3085    AlignmentAnnotation aa, int seqPosition)
3086    {
3087  19832 char ss = '*';
3088   
3089  19832 if (aa != null)
3090    {
3091  19832 if (aa.getAnnotationForPosition(seqPosition) != null)
3092    {
3093  6340 Annotation a = aa.getAnnotationForPosition(seqPosition);
3094  6340 ss = a.secondaryStructure;
3095   
3096    // There is no representation for coil and it can be either ' ' or null.
3097  6340 if (ss == ' ' || ss == '-')
3098    {
3099  0 ss = Constants.COIL;
3100    }
3101    }
3102    else
3103    {
3104  13492 ss = Constants.COIL;
3105    }
3106    }
3107   
3108  19832 return ss;
3109    }
3110   
 
3111  617 toggle public static List<String> extractSSSourceInAlignmentAnnotation(
3112    AlignmentAnnotation[] annotations)
3113    {
3114   
3115  617 List<String> ssSources = new ArrayList<>();
3116  617 Set<String> addedSources = new HashSet<>(); // to keep track of added
3117    // sources
3118   
3119  617 if (annotations == null)
3120    {
3121  0 return ssSources;
3122    }
3123   
3124  617 for (AlignmentAnnotation aa : annotations)
3125    {
3126   
3127  2106 String ssSource = extractSSSourceFromAnnotationDescription(aa);
3128   
3129  2106 if (ssSource != null && !addedSources.contains(ssSource))
3130    {
3131  26 ssSources.add(ssSource);
3132  26 addedSources.add(ssSource);
3133    }
3134   
3135    }
3136  617 Collections.sort(ssSources);
3137   
3138  617 return ssSources;
3139   
3140    }
3141   
 
3142  14605 toggle public static String extractSSSourceFromAnnotationDescription(
3143    AlignmentAnnotation aa)
3144    {
3145   
3146  14605 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3147    {
3148   
3149  16639 if (label.equals(aa.label))
3150    {
3151   
3152  12571 if (aa.getProperty(Constants.SS_PROVIDER_PROPERTY) != null)
3153    {
3154   
3155  0 return aa.getProperty(Constants.SS_PROVIDER_PROPERTY);
3156   
3157    }
3158   
3159    // For JPred
3160  12571 if (Constants.SS_ANNOTATION_FROM_JPRED_LABEL.equals(aa.label))
3161    {
3162   
3163  0 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
3164   
3165    }
3166   
3167    // For input with secondary structure
3168  12571 if (Constants.SS_ANNOTATION_LABEL.equals(aa.label)
3169    && aa.description != null
3170    && Constants.SS_ANNOTATION_LABEL.equals(aa.description))
3171    {
3172   
3173  0 return (Constants.SECONDARY_STRUCTURE_LABELS.get(aa.label));
3174   
3175    }
3176   
3177    // For other sources
3178  12571 if (aa.sequenceRef == null)
3179    {
3180  4 return null;
3181    }
3182  12567 else if (aa.sequenceRef.getDatasetSequence() == null)
3183    {
3184  0 return null;
3185    }
3186  12567 Vector<PDBEntry> pdbEntries = aa.sequenceRef.getDatasetSequence()
3187    .getAllPDBEntries();
3188   
3189    // TODO: this is an incredibly fragile mechanism
3190  12567 for (PDBEntry entry : pdbEntries)
3191    {
3192   
3193  12567 String entryProvider = entry.getProvider();
3194  12567 if (entryProvider == null)
3195    {
3196    // No provider - so this is either an old Jalview project, or not
3197    // retrieved from recognised source
3198  12567 entryProvider = "PDB";
3199    }
3200   
3201    // Should (re)use a standard mechanism for extracting the PDB ID as it
3202    // is written 1QWXTUV:CHAIN
3203    // Trim the string from first occurrence of colon
3204  12567 String entryID = entry.getId();
3205  12567 int index = entryID.indexOf(':');
3206   
3207    // Check if colon exists
3208  12567 if (index != -1)
3209    {
3210   
3211    // Trim the string from first occurrence of colon
3212  0 entryID = entryID.substring(0, index);
3213   
3214    }
3215   
3216    // TODO: shouldn't need to extract from description what the
3217    // originating ID is for this annotation!
3218  12567 if (entryProvider == "PDB" && aa.description.toLowerCase()
3219    .contains("secondary structure for "
3220    + entryID.toLowerCase()))
3221    {
3222   
3223  12567 return entryProvider;
3224   
3225    }
3226   
3227  0 else if (entryProvider != "PDB" && aa.description.toLowerCase()
3228    .contains(entryID.toLowerCase()))
3229    {
3230   
3231  0 return entryProvider;
3232   
3233    }
3234   
3235    }
3236    }
3237    }
3238   
3239  2034 return null;
3240   
3241    }
3242   
3243    // to do set priority for labels
 
3244  287156 toggle public static List<AlignmentAnnotation> getAlignmentAnnotationForSource(
3245    SequenceI seq, String ssSource)
3246    {
3247   
3248  287156 List<AlignmentAnnotation> ssAnnots = new ArrayList<AlignmentAnnotation>();
3249  287156 for (String ssLabel : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3250    {
3251   
3252  574312 AlignmentAnnotation[] aa = seq.getAnnotation(ssLabel);
3253  574312 if (aa != null)
3254    {
3255   
3256  24998 if (Constants.SS_ALL_PROVIDERS.equals(ssSource))
3257    {
3258  12499 ssAnnots.addAll(Arrays.asList(aa));
3259  12499 continue;
3260    }
3261   
3262  12499 for (AlignmentAnnotation annot : aa)
3263    {
3264   
3265  12499 String ssSourceForAnnot = extractSSSourceFromAnnotationDescription(
3266    annot);
3267  12499 if (ssSourceForAnnot != null && ssSource.equals(ssSourceForAnnot))
3268    {
3269  12499 ssAnnots.add(annot);
3270    }
3271    }
3272    }
3273    }
3274  287156 if (ssAnnots.size() > 0)
3275    {
3276  24998 return ssAnnots;
3277    }
3278   
3279  262158 return null;
3280   
3281    }
3282   
 
3283  0 toggle public static Map<SequenceI, ArrayList<AlignmentAnnotation>> getSequenceAssociatedAlignmentAnnotations(
3284    AlignmentAnnotation[] alignAnnotList, String selectedSSSource)
3285    {
3286   
3287  0 Map<SequenceI, ArrayList<AlignmentAnnotation>> ssAlignmentAnnotationForSequences = new HashMap<SequenceI, ArrayList<AlignmentAnnotation>>();
3288  0 if (alignAnnotList == null || alignAnnotList.length == 0)
3289    {
3290  0 return ssAlignmentAnnotationForSequences;
3291    }
3292   
3293  0 for (AlignmentAnnotation aa : alignAnnotList)
3294    {
3295  0 if (aa.sequenceRef == null)
3296    {
3297  0 continue;
3298    }
3299   
3300  0 if (isSecondaryStructureFrom(selectedSSSource, aa))
3301    {
3302  0 ssAlignmentAnnotationForSequences
3303    .computeIfAbsent(aa.sequenceRef.getDatasetSequence(),
3304    k -> new ArrayList<>())
3305    .add(aa);
3306    }
3307    }
3308   
3309  0 return ssAlignmentAnnotationForSequences;
3310   
3311    }
3312   
3313    /**
3314    *
3315    * @param selectedSSSource
3316    * @param aa
3317    * @return true if aa is from a provider or all providers as specified by
3318    * selectedSSSource
3319    */
 
3320  0 toggle public static boolean isSecondaryStructureFrom(String selectedSSSource,
3321    AlignmentAnnotation aa)
3322    {
3323   
3324  0 for (String label : Constants.SECONDARY_STRUCTURE_LABELS.keySet())
3325    {
3326   
3327  0 if (label.equals(aa.label))
3328    {
3329   
3330  0 if (selectedSSSource.equals(Constants.SS_ALL_PROVIDERS))
3331    {
3332  0 return true;
3333    }
3334  0 String ssSource = AlignmentUtils
3335    .extractSSSourceFromAnnotationDescription(aa);
3336  0 if (ssSource != null && ssSource.equals(selectedSSSource))
3337    {
3338  0 return true;
3339    }
3340    }
3341    }
3342  0 return false;
3343    }
3344   
3345    // Method to get the key for a given provider value
 
3346  0 toggle public static String getSecondaryStructureProviderKey(String providerValue) {
3347  0 for (Map.Entry<String, String> entry : Constants.STRUCTURE_PROVIDERS.entrySet()) {
3348  0 if (entry.getValue().equals(providerValue)) {
3349  0 return entry.getKey(); // Return the key (abbreviation) for the matching provider value
3350    }
3351    }
3352  0 return null; // Return null if no match is found
3353    }
3354   
 
3355  0 toggle public static String reduceLabelLength(String label) {
3356    // Split the input by " | "
3357  0 String[] parts = label.split(" \\| ");
3358   
3359    // Map the full names to their abbreviations
3360  0 String reducedLabel = Arrays.stream(parts)
3361    .map(fullName -> Constants.STRUCTURE_PROVIDERS.entrySet().stream()
3362    .filter(entry -> entry.getValue().equals(fullName))
3363    .map(Map.Entry::getKey)
3364    .findFirst()
3365    .orElse(fullName)) // Use fullName if no abbreviation is found
3366    .collect(Collectors.joining(" | "));
3367   
3368  0 return reducedLabel; // Return the reduced label if abbreviations were applied
3369    }
3370   
 
3371  0 toggle public static Color getSecondaryStructureProviderColor(String label) {
3372   
3373    //return Constants.STRUCTURE_PROVIDERS_COLOR.getOrDefault(label, Color.BLACK);
3374  0 Color c = Constants.STRUCTURE_PROVIDERS_COLOR.get(label.trim());
3375  0 if(c==null)
3376  0 c = Color.BLACK;
3377  0 return c;
3378    }
3379   
3380   
 
3381  0 toggle public static void assignSecondaryStructureProviderColor(Map<String, Color> secondaryStructureProviderColorMap,
3382    List<String> labels) {
3383   
3384    // Use a Set to track unique labels
3385  0 Set<String> uniqueLabels = new HashSet<>(labels);
3386   
3387  0 Color[] palette = ColorBrewer.Paired.getColorPalette(uniqueLabels.size());
3388   
3389   
3390  0 List<Color> colorList = new ArrayList<>();
3391  0 Collections.addAll(colorList, palette);
3392  0 Collections.shuffle(colorList);
3393  0 int i = 0;
3394   
3395    // Loop through each unique label and add it to the map with a color.
3396  0 for (String label : uniqueLabels) {
3397    // Generate or retrieve a color for the label.
3398  0 secondaryStructureProviderColorMap.put(label.toUpperCase().trim(), colorList.get(i));
3399  0 i++;
3400    }
3401    }
3402    }