Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.analysis

File AlignmentUtils.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
12% of files have more coverage

Code metrics

388
801
44
2
2,777
1,660
290
0.36
18.2
22
6.59

Classes

Class Line # Actions
AlignmentUtils 70 795 284
0.8408531584.1%
AlignmentUtils.DnaVariant 86 6 6
0.00%
 

Contributing tests

This file is covered by 85 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.commands.RemoveGapColCommand;
24    import jalview.datamodel.AlignedCodon;
25    import jalview.datamodel.AlignedCodonFrame;
26    import jalview.datamodel.AlignedCodonFrame.SequenceToSequenceMapping;
27    import jalview.datamodel.Alignment;
28    import jalview.datamodel.AlignmentAnnotation;
29    import jalview.datamodel.AlignmentI;
30    import jalview.datamodel.DBRefEntry;
31    import jalview.datamodel.GeneLociI;
32    import jalview.datamodel.IncompleteCodonException;
33    import jalview.datamodel.Mapping;
34    import jalview.datamodel.Sequence;
35    import jalview.datamodel.SequenceFeature;
36    import jalview.datamodel.SequenceGroup;
37    import jalview.datamodel.SequenceI;
38    import jalview.datamodel.features.SequenceFeatures;
39    import jalview.io.gff.SequenceOntologyI;
40    import jalview.schemes.ResidueProperties;
41    import jalview.util.Comparison;
42    import jalview.util.DBRefUtils;
43    import jalview.util.IntRangeComparator;
44    import jalview.util.MapList;
45    import jalview.util.MappingUtils;
46   
47    import java.util.ArrayList;
48    import java.util.Arrays;
49    import java.util.Collection;
50    import java.util.Collections;
51    import java.util.HashMap;
52    import java.util.HashSet;
53    import java.util.Iterator;
54    import java.util.LinkedHashMap;
55    import java.util.List;
56    import java.util.Map;
57    import java.util.Map.Entry;
58    import java.util.NoSuchElementException;
59    import java.util.Set;
60    import java.util.SortedMap;
61    import java.util.TreeMap;
62   
63    /**
64    * grab bag of useful alignment manipulation operations Expect these to be
65    * refactored elsewhere at some point.
66    *
67    * @author jimp
68    *
69    */
 
70    public class AlignmentUtils
71    {
72    private static final int CODON_LENGTH = 3;
73   
74    private static final String SEQUENCE_VARIANT = "sequence_variant:";
75   
76    /*
77    * the 'id' attribute is provided for variant features fetched from
78    * Ensembl using its REST service with JSON format
79    */
80    public static final String VARIANT_ID = "id";
81   
82    /**
83    * A data model to hold the 'normal' base value at a position, and an optional
84    * sequence variant feature
85    */
 
86    static final class DnaVariant
87    {
88    final String base;
89   
90    SequenceFeature variant;
91   
 
92  0 toggle DnaVariant(String nuc)
93    {
94  0 base = nuc;
95  0 variant = null;
96    }
97   
 
98  0 toggle DnaVariant(String nuc, SequenceFeature var)
99    {
100  0 base = nuc;
101  0 variant = var;
102    }
103   
 
104  0 toggle public String getSource()
105    {
106  0 return variant == null ? null : variant.getFeatureGroup();
107    }
108   
109    /**
110    * toString for aid in the debugger only
111    */
 
112  0 toggle @Override
113    public String toString()
114    {
115  0 return base + ":" + (variant == null ? "" : variant.getDescription());
116    }
117    }
118   
119    /**
120    * given an existing alignment, create a new alignment including all, or up to
121    * flankSize additional symbols from each sequence's dataset sequence
122    *
123    * @param core
124    * @param flankSize
125    * @return AlignmentI
126    */
 
127  27 toggle public static AlignmentI expandContext(AlignmentI core, int flankSize)
128    {
129  27 List<SequenceI> sq = new ArrayList<>();
130  27 int maxoffset = 0;
131  27 for (SequenceI s : core.getSequences())
132    {
133  131 SequenceI newSeq = s.deriveSequence();
134  131 final int newSeqStart = newSeq.getStart() - 1;
135  131 if (newSeqStart > maxoffset
136    && newSeq.getDatasetSequence().getStart() < s.getStart())
137    {
138  131 maxoffset = newSeqStart;
139    }
140  131 sq.add(newSeq);
141    }
142  27 if (flankSize > -1)
143    {
144  25 maxoffset = Math.min(maxoffset, flankSize);
145    }
146   
147    /*
148    * now add offset left and right to create an expanded alignment
149    */
150  27 for (SequenceI s : sq)
151    {
152  131 SequenceI ds = s;
153  262 while (ds.getDatasetSequence() != null)
154    {
155  131 ds = ds.getDatasetSequence();
156    }
157  131 int s_end = s.findPosition(s.getStart() + s.getLength());
158    // find available flanking residues for sequence
159  131 int ustream_ds = s.getStart() - ds.getStart();
160  131 int dstream_ds = ds.getEnd() - s_end;
161   
162    // build new flanked sequence
163   
164    // compute gap padding to start of flanking sequence
165  131 int offset = maxoffset - ustream_ds;
166   
167    // padding is gapChar x ( maxoffset - min(ustream_ds, flank)
168  131 if (flankSize >= 0)
169    {
170  125 if (flankSize < ustream_ds)
171    {
172    // take up to flankSize residues
173  40 offset = maxoffset - flankSize;
174  40 ustream_ds = flankSize;
175    }
176  125 if (flankSize <= dstream_ds)
177    {
178  116 dstream_ds = flankSize - 1;
179    }
180    }
181    // TODO use Character.toLowerCase to avoid creating String objects?
182  131 char[] upstream = new String(ds
183    .getSequence(s.getStart() - 1 - ustream_ds, s.getStart() - 1))
184    .toLowerCase().toCharArray();
185  131 char[] downstream = new String(
186    ds.getSequence(s_end - 1, s_end + dstream_ds)).toLowerCase()
187    .toCharArray();
188  131 char[] coreseq = s.getSequence();
189  131 char[] nseq = new char[offset + upstream.length + downstream.length
190    + coreseq.length];
191  131 char c = core.getGapCharacter();
192   
193  131 int p = 0;
194  461 for (; p < offset; p++)
195    {
196  330 nseq[p] = c;
197    }
198   
199  131 System.arraycopy(upstream, 0, nseq, p, upstream.length);
200  131 System.arraycopy(coreseq, 0, nseq, p + upstream.length,
201    coreseq.length);
202  131 System.arraycopy(downstream, 0, nseq,
203    p + coreseq.length + upstream.length, downstream.length);
204  131 s.setSequence(new String(nseq));
205  131 s.setStart(s.getStart() - ustream_ds);
206  131 s.setEnd(s_end + downstream.length);
207    }
208  27 AlignmentI newAl = new jalview.datamodel.Alignment(
209    sq.toArray(new SequenceI[0]));
210  27 for (SequenceI s : sq)
211    {
212  131 if (s.getAnnotation() != null)
213    {
214  1 for (AlignmentAnnotation aa : s.getAnnotation())
215    {
216  1 aa.adjustForAlignment(); // JAL-1712 fix
217  1 newAl.addAnnotation(aa);
218    }
219    }
220    }
221  27 newAl.setDataset(core.getDataset());
222  27 return newAl;
223    }
224   
225    /**
226    * Returns the index (zero-based position) of a sequence in an alignment, or
227    * -1 if not found.
228    *
229    * @param al
230    * @param seq
231    * @return
232    */
 
233  57464 toggle public static int getSequenceIndex(AlignmentI al, SequenceI seq)
234    {
235  57464 int result = -1;
236  57464 int pos = 0;
237  57464 for (SequenceI alSeq : al.getSequences())
238    {
239  126202503 if (alSeq == seq)
240    {
241  57462 result = pos;
242  57462 break;
243    }
244  126145041 pos++;
245    }
246  57463 return result;
247    }
248   
249    /**
250    * Returns a map of lists of sequences in the alignment, keyed by sequence
251    * name. For use in mapping between different alignment views of the same
252    * sequences.
253    *
254    * @see jalview.datamodel.AlignmentI#getSequencesByName()
255    */
 
256  1 toggle public static Map<String, List<SequenceI>> getSequencesByName(
257    AlignmentI al)
258    {
259  1 Map<String, List<SequenceI>> theMap = new LinkedHashMap<>();
260  1 for (SequenceI seq : al.getSequences())
261    {
262  3 String name = seq.getName();
263  3 if (name != null)
264    {
265  3 List<SequenceI> seqs = theMap.get(name);
266  3 if (seqs == null)
267    {
268  2 seqs = new ArrayList<>();
269  2 theMap.put(name, seqs);
270    }
271  3 seqs.add(seq);
272    }
273    }
274  1 return theMap;
275    }
276   
277    /**
278    * Build mapping of protein to cDNA alignment. Mappings are made between
279    * sequences where the cDNA translates to the protein sequence. Any new
280    * mappings are added to the protein alignment. Returns true if any mappings
281    * either already exist or were added, else false.
282    *
283    * @param proteinAlignment
284    * @param cdnaAlignment
285    * @return
286    */
 
287  4 toggle public static boolean mapProteinAlignmentToCdna(
288    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment)
289    {
290  4 if (proteinAlignment == null || cdnaAlignment == null)
291    {
292  0 return false;
293    }
294   
295  4 Set<SequenceI> mappedDna = new HashSet<>();
296  4 Set<SequenceI> mappedProtein = new HashSet<>();
297   
298    /*
299    * First pass - map sequences where cross-references exist. This include
300    * 1-to-many mappings to support, for example, variant cDNA.
301    */
302  4 boolean mappingPerformed = mapProteinToCdna(proteinAlignment,
303    cdnaAlignment, mappedDna, mappedProtein, true);
304   
305    /*
306    * Second pass - map sequences where no cross-references exist. This only
307    * does 1-to-1 mappings and assumes corresponding sequences are in the same
308    * order in the alignments.
309    */
310  4 mappingPerformed |= mapProteinToCdna(proteinAlignment, cdnaAlignment,
311    mappedDna, mappedProtein, false);
312  4 return mappingPerformed;
313    }
314   
315    /**
316    * Make mappings between compatible sequences (where the cDNA translation
317    * matches the protein).
318    *
319    * @param proteinAlignment
320    * @param cdnaAlignment
321    * @param mappedDna
322    * a set of mapped DNA sequences (to add to)
323    * @param mappedProtein
324    * a set of mapped Protein sequences (to add to)
325    * @param xrefsOnly
326    * if true, only map sequences where xrefs exist
327    * @return
328    */
 
329  8 toggle protected static boolean mapProteinToCdna(
330    final AlignmentI proteinAlignment, final AlignmentI cdnaAlignment,
331    Set<SequenceI> mappedDna, Set<SequenceI> mappedProtein,
332    boolean xrefsOnly)
333    {
334  8 boolean mappingExistsOrAdded = false;
335  8 List<SequenceI> thisSeqs = proteinAlignment.getSequences();
336  8 for (SequenceI aaSeq : thisSeqs)
337    {
338  22 boolean proteinMapped = false;
339  22 AlignedCodonFrame acf = new AlignedCodonFrame();
340   
341  22 for (SequenceI cdnaSeq : cdnaAlignment.getSequences())
342    {
343    /*
344    * Always try to map if sequences have xref to each other; this supports
345    * variant cDNA or alternative splicing for a protein sequence.
346    *
347    * If no xrefs, try to map progressively, assuming that alignments have
348    * mappable sequences in corresponding order. These are not
349    * many-to-many, as that would risk mixing species with similar cDNA
350    * sequences.
351    */
352  86 if (xrefsOnly && !AlignmentUtils.haveCrossRef(aaSeq, cdnaSeq))
353    {
354  39 continue;
355    }
356   
357    /*
358    * Don't map non-xrefd sequences more than once each. This heuristic
359    * allows us to pair up similar sequences in ordered alignments.
360    */
361  47 if (!xrefsOnly && (mappedProtein.contains(aaSeq)
362    || mappedDna.contains(cdnaSeq)))
363    {
364  29 continue;
365    }
366  18 if (mappingExists(proteinAlignment.getCodonFrames(),
367    aaSeq.getDatasetSequence(), cdnaSeq.getDatasetSequence()))
368    {
369  0 mappingExistsOrAdded = true;
370    }
371    else
372    {
373  18 MapList map = mapCdnaToProtein(aaSeq, cdnaSeq);
374  18 if (map != null)
375    {
376  12 acf.addMap(cdnaSeq, aaSeq, map);
377  12 mappingExistsOrAdded = true;
378  12 proteinMapped = true;
379  12 mappedDna.add(cdnaSeq);
380  12 mappedProtein.add(aaSeq);
381    }
382    }
383    }
384  22 if (proteinMapped)
385    {
386  11 proteinAlignment.addCodonFrame(acf);
387    }
388    }
389  8 return mappingExistsOrAdded;
390    }
391   
392    /**
393    * Answers true if the mappings include one between the given (dataset)
394    * sequences.
395    */
 
396  18 toggle protected static boolean mappingExists(List<AlignedCodonFrame> mappings,
397    SequenceI aaSeq, SequenceI cdnaSeq)
398    {
399  18 if (mappings != null)
400    {
401  18 for (AlignedCodonFrame acf : mappings)
402    {
403  14 if (cdnaSeq == acf.getDnaForAaSeq(aaSeq))
404    {
405  0 return true;
406    }
407    }
408    }
409  18 return false;
410    }
411   
412    /**
413    * Builds a mapping (if possible) of a cDNA to a protein sequence.
414    * <ul>
415    * <li>first checks if the cdna translates exactly to the protein
416    * sequence</li>
417    * <li>else checks for translation after removing a STOP codon</li>
418    * <li>else checks for translation after removing a START codon</li>
419    * <li>if that fails, inspect CDS features on the cDNA sequence</li>
420    * </ul>
421    * Returns null if no mapping is determined.
422    *
423    * @param proteinSeq
424    * the aligned protein sequence
425    * @param cdnaSeq
426    * the aligned cdna sequence
427    * @return
428    */
 
429  21 toggle public static MapList mapCdnaToProtein(SequenceI proteinSeq,
430    SequenceI cdnaSeq)
431    {
432    /*
433    * Here we handle either dataset sequence set (desktop) or absent (applet).
434    * Use only the char[] form of the sequence to avoid creating possibly large
435    * String objects.
436    */
437  21 final SequenceI proteinDataset = proteinSeq.getDatasetSequence();
438  21 char[] aaSeqChars = proteinDataset != null
439    ? proteinDataset.getSequence()
440    : proteinSeq.getSequence();
441  21 final SequenceI cdnaDataset = cdnaSeq.getDatasetSequence();
442  21 char[] cdnaSeqChars = cdnaDataset != null ? cdnaDataset.getSequence()
443    : cdnaSeq.getSequence();
444  21 if (aaSeqChars == null || cdnaSeqChars == null)
445    {
446  0 return null;
447    }
448   
449    /*
450    * cdnaStart/End, proteinStartEnd are base 1 (for dataset sequence mapping)
451    */
452  21 final int mappedLength = CODON_LENGTH * aaSeqChars.length;
453  21 int cdnaLength = cdnaSeqChars.length;
454  21 int cdnaStart = cdnaSeq.getStart();
455  21 int cdnaEnd = cdnaSeq.getEnd();
456  21 final int proteinStart = proteinSeq.getStart();
457  21 final int proteinEnd = proteinSeq.getEnd();
458   
459    /*
460    * If lengths don't match, try ignoring stop codon (if present)
461    */
462  21 if (cdnaLength != mappedLength && cdnaLength > 2)
463    {
464  6 String lastCodon = String.valueOf(cdnaSeqChars,
465    cdnaLength - CODON_LENGTH, CODON_LENGTH).toUpperCase();
466  6 for (String stop : ResidueProperties.STOP_CODONS)
467    {
468  17 if (lastCodon.equals(stop))
469    {
470  2 cdnaEnd -= CODON_LENGTH;
471  2 cdnaLength -= CODON_LENGTH;
472  2 break;
473    }
474    }
475    }
476   
477    /*
478    * If lengths still don't match, try ignoring start codon.
479    */
480  21 int startOffset = 0;
481  21 if (cdnaLength != mappedLength && cdnaLength > 2
482    && String.valueOf(cdnaSeqChars, 0, CODON_LENGTH).toUpperCase()
483    .equals(ResidueProperties.START))
484    {
485  5 startOffset += CODON_LENGTH;
486  5 cdnaStart += CODON_LENGTH;
487  5 cdnaLength -= CODON_LENGTH;
488    }
489   
490  21 if (translatesAs(cdnaSeqChars, startOffset, aaSeqChars))
491    {
492    /*
493    * protein is translation of dna (+/- start/stop codons)
494    */
495  15 MapList map = new MapList(new int[] { cdnaStart, cdnaEnd },
496    new int[]
497    { proteinStart, proteinEnd }, CODON_LENGTH, 1);
498  15 return map;
499    }
500   
501    /*
502    * translation failed - try mapping CDS annotated regions of dna
503    */
504  6 return mapCdsToProtein(cdnaSeq, proteinSeq);
505    }
506   
507    /**
508    * Test whether the given cdna sequence, starting at the given offset,
509    * translates to the given amino acid sequence, using the standard translation
510    * table. Designed to fail fast i.e. as soon as a mismatch position is found.
511    *
512    * @param cdnaSeqChars
513    * @param cdnaStart
514    * @param aaSeqChars
515    * @return
516    */
 
517  41 toggle protected static boolean translatesAs(char[] cdnaSeqChars, int cdnaStart,
518    char[] aaSeqChars)
519    {
520  41 if (cdnaSeqChars == null || aaSeqChars == null)
521    {
522  3 return false;
523    }
524   
525  38 int aaPos = 0;
526  38 int dnaPos = cdnaStart;
527  151 for (; dnaPos < cdnaSeqChars.length - 2
528    && aaPos < aaSeqChars.length; dnaPos += CODON_LENGTH, aaPos++)
529    {
530  120 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
531  120 final String translated = ResidueProperties.codonTranslate(codon);
532   
533    /*
534    * allow * in protein to match untranslatable in dna
535    */
536  120 final char aaRes = aaSeqChars[aaPos];
537  120 if ((translated == null || ResidueProperties.STOP.equals(translated))
538    && aaRes == '*')
539    {
540  4 continue;
541    }
542  116 if (translated == null || !(aaRes == translated.charAt(0)))
543    {
544    // debug
545    // System.out.println(("Mismatch at " + i + "/" + aaResidue + ": "
546    // + codon + "(" + translated + ") != " + aaRes));
547  7 return false;
548    }
549    }
550   
551    /*
552    * check we matched all of the protein sequence
553    */
554  31 if (aaPos != aaSeqChars.length)
555    {
556  2 return false;
557    }
558   
559    /*
560    * check we matched all of the dna except
561    * for optional trailing STOP codon
562    */
563  29 if (dnaPos == cdnaSeqChars.length)
564    {
565  17 return true;
566    }
567  12 if (dnaPos == cdnaSeqChars.length - CODON_LENGTH)
568    {
569  11 String codon = String.valueOf(cdnaSeqChars, dnaPos, CODON_LENGTH);
570  11 if (ResidueProperties.STOP
571    .equals(ResidueProperties.codonTranslate(codon)))
572    {
573  9 return true;
574    }
575    }
576  3 return false;
577    }
578   
579    /**
580    * Align sequence 'seq' to match the alignment of a mapped sequence. Note this
581    * currently assumes that we are aligning cDNA to match protein.
582    *
583    * @param seq
584    * the sequence to be realigned
585    * @param al
586    * the alignment whose sequence alignment is to be 'copied'
587    * @param gap
588    * character string represent a gap in the realigned sequence
589    * @param preserveUnmappedGaps
590    * @param preserveMappedGaps
591    * @return true if the sequence was realigned, false if it could not be
592    */
 
593  8 toggle public static boolean alignSequenceAs(SequenceI seq, AlignmentI al,
594    String gap, boolean preserveMappedGaps,
595    boolean preserveUnmappedGaps)
596    {
597    /*
598    * Get any mappings from the source alignment to the target (dataset)
599    * sequence.
600    */
601    // TODO there may be one AlignedCodonFrame per dataset sequence, or one with
602    // all mappings. Would it help to constrain this?
603  8 List<AlignedCodonFrame> mappings = al.getCodonFrame(seq);
604  8 if (mappings == null || mappings.isEmpty())
605    {
606  0 return false;
607    }
608   
609    /*
610    * Locate the aligned source sequence whose dataset sequence is mapped. We
611    * just take the first match here (as we can't align like more than one
612    * sequence).
613    */
614  8 SequenceI alignFrom = null;
615  8 AlignedCodonFrame mapping = null;
616  8 for (AlignedCodonFrame mp : mappings)
617    {
618  8 alignFrom = mp.findAlignedSequence(seq, al);
619  8 if (alignFrom != null)
620    {
621  4 mapping = mp;
622  4 break;
623    }
624    }
625   
626  8 if (alignFrom == null)
627    {
628  4 return false;
629    }
630  4 alignSequenceAs(seq, alignFrom, mapping, gap, al.getGapCharacter(),
631    preserveMappedGaps, preserveUnmappedGaps);
632  4 return true;
633    }
634   
635    /**
636    * Align sequence 'alignTo' the same way as 'alignFrom', using the mapping to
637    * match residues and codons. Flags control whether existing gaps in unmapped
638    * (intron) and mapped (exon) regions are preserved or not. Gaps between
639    * intron and exon are only retained if both flags are set.
640    *
641    * @param alignTo
642    * @param alignFrom
643    * @param mapping
644    * @param myGap
645    * @param sourceGap
646    * @param preserveUnmappedGaps
647    * @param preserveMappedGaps
648    */
 
649  19 toggle public static void alignSequenceAs(SequenceI alignTo, SequenceI alignFrom,
650    AlignedCodonFrame mapping, String myGap, char sourceGap,
651    boolean preserveMappedGaps, boolean preserveUnmappedGaps)
652    {
653    // TODO generalise to work for Protein-Protein, dna-dna, dna-protein
654   
655    // aligned and dataset sequence positions, all base zero
656  19 int thisSeqPos = 0;
657  19 int sourceDsPos = 0;
658   
659  19 int basesWritten = 0;
660  19 char myGapChar = myGap.charAt(0);
661  19 int ratio = myGap.length();
662   
663  19 int fromOffset = alignFrom.getStart() - 1;
664  19 int toOffset = alignTo.getStart() - 1;
665  19 int sourceGapMappedLength = 0;
666  19 boolean inExon = false;
667  19 final int toLength = alignTo.getLength();
668  19 final int fromLength = alignFrom.getLength();
669  19 StringBuilder thisAligned = new StringBuilder(2 * toLength);
670   
671    /*
672    * Traverse the 'model' aligned sequence
673    */
674  205 for (int i = 0; i < fromLength; i++)
675    {
676  186 char sourceChar = alignFrom.getCharAt(i);
677  186 if (sourceChar == sourceGap)
678    {
679  44 sourceGapMappedLength += ratio;
680  44 continue;
681    }
682   
683    /*
684    * Found a non-gap character. Locate its mapped region if any.
685    */
686  142 sourceDsPos++;
687    // Note mapping positions are base 1, our sequence positions base 0
688  142 int[] mappedPos = mapping.getMappedRegion(alignTo, alignFrom,
689    sourceDsPos + fromOffset);
690  142 if (mappedPos == null)
691    {
692    /*
693    * unmapped position; treat like a gap
694    */
695  94 sourceGapMappedLength += ratio;
696    // System.err.println("Can't align: no codon mapping to residue "
697    // + sourceDsPos + "(" + sourceChar + ")");
698    // return;
699  94 continue;
700    }
701   
702  48 int mappedCodonStart = mappedPos[0]; // position (1...) of codon start
703  48 int mappedCodonEnd = mappedPos[mappedPos.length - 1]; // codon end pos
704  48 StringBuilder trailingCopiedGap = new StringBuilder();
705   
706    /*
707    * Copy dna sequence up to and including this codon. Optionally, include
708    * gaps before the codon starts (in introns) and/or after the codon starts
709    * (in exons).
710    *
711    * Note this only works for 'linear' splicing, not reverse or interleaved.
712    * But then 'align dna as protein' doesn't make much sense otherwise.
713    */
714  48 int intronLength = 0;
715  294 while (basesWritten + toOffset < mappedCodonEnd
716    && thisSeqPos < toLength)
717    {
718  246 final char c = alignTo.getCharAt(thisSeqPos++);
719  246 if (c != myGapChar)
720    {
721  146 basesWritten++;
722  146 int sourcePosition = basesWritten + toOffset;
723  146 if (sourcePosition < mappedCodonStart)
724    {
725    /*
726    * Found an unmapped (intron) base. First add in any preceding gaps
727    * (if wanted).
728    */
729  48 if (preserveUnmappedGaps && trailingCopiedGap.length() > 0)
730    {
731  17 thisAligned.append(trailingCopiedGap.toString());
732  17 intronLength += trailingCopiedGap.length();
733  17 trailingCopiedGap = new StringBuilder();
734    }
735  48 intronLength++;
736  48 inExon = false;
737    }
738    else
739    {
740  98 final boolean startOfCodon = sourcePosition == mappedCodonStart;
741  98 int gapsToAdd = calculateGapsToInsert(preserveMappedGaps,
742    preserveUnmappedGaps, sourceGapMappedLength, inExon,
743    trailingCopiedGap.length(), intronLength, startOfCodon);
744  215 for (int k = 0; k < gapsToAdd; k++)
745    {
746  117 thisAligned.append(myGapChar);
747    }
748  98 sourceGapMappedLength = 0;
749  98 inExon = true;
750    }
751  146 thisAligned.append(c);
752  146 trailingCopiedGap = new StringBuilder();
753    }
754    else
755    {
756  100 if (inExon && preserveMappedGaps)
757    {
758  32 trailingCopiedGap.append(myGapChar);
759    }
760  68 else if (!inExon && preserveUnmappedGaps)
761    {
762  27 trailingCopiedGap.append(myGapChar);
763    }
764    }
765    }
766    }
767   
768    /*
769    * At end of model aligned sequence. Copy any remaining target sequence, optionally
770    * including (intron) gaps.
771    */
772  129 while (thisSeqPos < toLength)
773    {
774  110 final char c = alignTo.getCharAt(thisSeqPos++);
775  110 if (c != myGapChar || preserveUnmappedGaps)
776    {
777  102 thisAligned.append(c);
778    }
779  110 sourceGapMappedLength--;
780    }
781   
782    /*
783    * finally add gaps to pad for any trailing source gaps or
784    * unmapped characters
785    */
786  19 if (preserveUnmappedGaps)
787    {
788  24 while (sourceGapMappedLength > 0)
789    {
790  12 thisAligned.append(myGapChar);
791  12 sourceGapMappedLength--;
792    }
793    }
794   
795    /*
796    * All done aligning, set the aligned sequence.
797    */
798  19 alignTo.setSequence(new String(thisAligned));
799    }
800   
801    /**
802    * Helper method to work out how many gaps to insert when realigning.
803    *
804    * @param preserveMappedGaps
805    * @param preserveUnmappedGaps
806    * @param sourceGapMappedLength
807    * @param inExon
808    * @param trailingCopiedGap
809    * @param intronLength
810    * @param startOfCodon
811    * @return
812    */
 
813  98 toggle protected static int calculateGapsToInsert(boolean preserveMappedGaps,
814    boolean preserveUnmappedGaps, int sourceGapMappedLength,
815    boolean inExon, int trailingGapLength, int intronLength,
816    final boolean startOfCodon)
817    {
818  98 int gapsToAdd = 0;
819  98 if (startOfCodon)
820    {
821    /*
822    * Reached start of codon. Ignore trailing gaps in intron unless we are
823    * preserving gaps in both exon and intron. Ignore them anyway if the
824    * protein alignment introduces a gap at least as large as the intronic
825    * region.
826    */
827  40 if (inExon && !preserveMappedGaps)
828    {
829  4 trailingGapLength = 0;
830    }
831  40 if (!inExon && !(preserveMappedGaps && preserveUnmappedGaps))
832    {
833  19 trailingGapLength = 0;
834    }
835  40 if (inExon)
836    {
837  14 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
838    }
839    else
840    {
841  26 if (intronLength + trailingGapLength <= sourceGapMappedLength)
842    {
843  20 gapsToAdd = sourceGapMappedLength - intronLength;
844    }
845    else
846    {
847  6 gapsToAdd = Math.min(
848    intronLength + trailingGapLength - sourceGapMappedLength,
849    trailingGapLength);
850    }
851    }
852    }
853    else
854    {
855    /*
856    * second or third base of codon; check for any gaps in dna
857    */
858  58 if (!preserveMappedGaps)
859    {
860  32 trailingGapLength = 0;
861    }
862  58 gapsToAdd = Math.max(sourceGapMappedLength, trailingGapLength);
863    }
864  98 return gapsToAdd;
865    }
866   
867    /**
868    * Realigns the given protein to match the alignment of the dna, using codon
869    * mappings to translate aligned codon positions to protein residues.
870    *
871    * @param protein
872    * the alignment whose sequences are realigned by this method
873    * @param dna
874    * the dna alignment whose alignment we are 'copying'
875    * @return the number of sequences that were realigned
876    */
 
877  5 toggle public static int alignProteinAsDna(AlignmentI protein, AlignmentI dna)
878    {
879  5 if (protein.isNucleotide() || !dna.isNucleotide())
880    {
881  0 System.err.println("Wrong alignment type in alignProteinAsDna");
882  0 return 0;
883    }
884  5 List<SequenceI> unmappedProtein = new ArrayList<>();
885  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = buildCodonColumnsMap(
886    protein, dna, unmappedProtein);
887  5 return alignProteinAs(protein, alignedCodons, unmappedProtein);
888    }
889   
890    /**
891    * Realigns the given dna to match the alignment of the protein, using codon
892    * mappings to translate aligned peptide positions to codons.
893    *
894    * Always produces a padded CDS alignment.
895    *
896    * @param dna
897    * the alignment whose sequences are realigned by this method
898    * @param protein
899    * the protein alignment whose alignment we are 'copying'
900    * @return the number of sequences that were realigned
901    */
 
902  4 toggle public static int alignCdsAsProtein(AlignmentI dna, AlignmentI protein)
903    {
904  4 if (protein.isNucleotide() || !dna.isNucleotide())
905    {
906  0 System.err.println("Wrong alignment type in alignProteinAsDna");
907  0 return 0;
908    }
909    // todo: implement this
910  4 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
911  4 int alignedCount = 0;
912  4 int width = 0; // alignment width for padding CDS
913  4 for (SequenceI dnaSeq : dna.getSequences())
914    {
915  5 if (alignCdsSequenceAsProtein(dnaSeq, protein, mappings,
916    dna.getGapCharacter()))
917    {
918  5 alignedCount++;
919    }
920  5 width = Math.max(dnaSeq.getLength(), width);
921    }
922  4 int oldwidth;
923  4 int diff;
924  4 for (SequenceI dnaSeq : dna.getSequences())
925    {
926  5 oldwidth = dnaSeq.getLength();
927  5 diff = width - oldwidth;
928  5 if (diff > 0)
929    {
930  1 dnaSeq.insertCharAt(oldwidth, diff, dna.getGapCharacter());
931    }
932    }
933  4 return alignedCount;
934    }
935   
936    /**
937    * Helper method to align (if possible) the dna sequence to match the
938    * alignment of a mapped protein sequence. This is currently limited to
939    * handling coding sequence only.
940    *
941    * @param cdsSeq
942    * @param protein
943    * @param mappings
944    * @param gapChar
945    * @return
946    */
 
947  5 toggle static boolean alignCdsSequenceAsProtein(SequenceI cdsSeq,
948    AlignmentI protein, List<AlignedCodonFrame> mappings,
949    char gapChar)
950    {
951  5 SequenceI cdsDss = cdsSeq.getDatasetSequence();
952  5 if (cdsDss == null)
953    {
954  0 System.err
955    .println("alignCdsSequenceAsProtein needs aligned sequence!");
956  0 return false;
957    }
958   
959  5 List<AlignedCodonFrame> dnaMappings = MappingUtils
960    .findMappingsForSequence(cdsSeq, mappings);
961  5 for (AlignedCodonFrame mapping : dnaMappings)
962    {
963  5 SequenceI peptide = mapping.findAlignedSequence(cdsSeq, protein);
964  5 if (peptide != null)
965    {
966  5 final int peptideLength = peptide.getLength();
967  5 Mapping map = mapping.getMappingBetween(cdsSeq, peptide);
968  5 if (map != null)
969    {
970  5 MapList mapList = map.getMap();
971  5 if (map.getTo() == peptide.getDatasetSequence())
972    {
973  5 mapList = mapList.getInverse();
974    }
975  5 final int cdsLength = cdsDss.getLength();
976  5 int mappedFromLength = MappingUtils.getLength(mapList
977    .getFromRanges());
978  5 int mappedToLength = MappingUtils
979    .getLength(mapList.getToRanges());
980  5 boolean addStopCodon = (cdsLength == mappedFromLength
981    * CODON_LENGTH + CODON_LENGTH)
982    || (peptide.getDatasetSequence()
983    .getLength() == mappedFromLength - 1);
984  5 if (cdsLength != mappedToLength && !addStopCodon)
985    {
986  0 System.err.println(String.format(
987    "Can't align cds as protein (length mismatch %d/%d): %s",
988    cdsLength, mappedToLength, cdsSeq.getName()));
989    }
990   
991    /*
992    * pre-fill the aligned cds sequence with gaps
993    */
994  5 char[] alignedCds = new char[peptideLength * CODON_LENGTH
995  5 + (addStopCodon ? CODON_LENGTH : 0)];
996  5 Arrays.fill(alignedCds, gapChar);
997   
998    /*
999    * walk over the aligned peptide sequence and insert mapped
1000    * codons for residues in the aligned cds sequence
1001    */
1002  5 int copiedBases = 0;
1003  5 int cdsStart = cdsDss.getStart();
1004  5 int proteinPos = peptide.getStart() - 1;
1005  5 int cdsCol = 0;
1006   
1007  33 for (int col = 0; col < peptideLength; col++)
1008    {
1009  28 char residue = peptide.getCharAt(col);
1010   
1011  28 if (Comparison.isGap(residue))
1012    {
1013  13 cdsCol += CODON_LENGTH;
1014    }
1015    else
1016    {
1017  15 proteinPos++;
1018  15 int[] codon = mapList.locateInTo(proteinPos, proteinPos);
1019  15 if (codon == null)
1020    {
1021    // e.g. incomplete start codon, X in peptide
1022  0 cdsCol += CODON_LENGTH;
1023    }
1024    else
1025    {
1026  60 for (int j = codon[0]; j <= codon[1]; j++)
1027    {
1028  45 char mappedBase = cdsDss.getCharAt(j - cdsStart);
1029  45 alignedCds[cdsCol++] = mappedBase;
1030  45 copiedBases++;
1031    }
1032    }
1033    }
1034    }
1035   
1036    /*
1037    * append stop codon if not mapped from protein,
1038    * closing it up to the end of the mapped sequence
1039    */
1040  5 if (copiedBases == cdsLength - CODON_LENGTH)
1041    {
1042  0 for (int i = alignedCds.length - 1; i >= 0; i--)
1043    {
1044  0 if (!Comparison.isGap(alignedCds[i]))
1045    {
1046  0 cdsCol = i + 1; // gap just after end of sequence
1047  0 break;
1048    }
1049    }
1050  0 for (int i = cdsLength - CODON_LENGTH; i < cdsLength; i++)
1051    {
1052  0 alignedCds[cdsCol++] = cdsDss.getCharAt(i);
1053    }
1054    }
1055  5 cdsSeq.setSequence(new String(alignedCds));
1056  5 return true;
1057    }
1058    }
1059    }
1060  0 return false;
1061    }
1062   
1063    /**
1064    * Builds a map whose key is an aligned codon position (3 alignment column
1065    * numbers base 0), and whose value is a map from protein sequence to each
1066    * protein's peptide residue for that codon. The map generates an ordering of
1067    * the codons, and allows us to read off the peptides at each position in
1068    * order to assemble 'aligned' protein sequences.
1069    *
1070    * @param protein
1071    * the protein alignment
1072    * @param dna
1073    * the coding dna alignment
1074    * @param unmappedProtein
1075    * any unmapped proteins are added to this list
1076    * @return
1077    */
 
1078  5 toggle protected static Map<AlignedCodon, Map<SequenceI, AlignedCodon>> buildCodonColumnsMap(
1079    AlignmentI protein, AlignmentI dna,
1080    List<SequenceI> unmappedProtein)
1081    {
1082    /*
1083    * maintain a list of any proteins with no mappings - these will be
1084    * rendered 'as is' in the protein alignment as we can't align them
1085    */
1086  5 unmappedProtein.addAll(protein.getSequences());
1087   
1088  5 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1089   
1090    /*
1091    * Map will hold, for each aligned codon position e.g. [3, 5, 6], a map of
1092    * {dnaSequence, {proteinSequence, codonProduct}} at that position. The
1093    * comparator keeps the codon positions ordered.
1094    */
1095  5 Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons = new TreeMap<>(
1096    new CodonComparator());
1097   
1098  5 for (SequenceI dnaSeq : dna.getSequences())
1099    {
1100  30 for (AlignedCodonFrame mapping : mappings)
1101    {
1102  274 SequenceI prot = mapping.findAlignedSequence(dnaSeq, protein);
1103  274 if (prot != null)
1104    {
1105  32 Mapping seqMap = mapping.getMappingForSequence(dnaSeq);
1106  32 addCodonPositions(dnaSeq, prot, protein.getGapCharacter(), seqMap,
1107    alignedCodons);
1108  32 unmappedProtein.remove(prot);
1109    }
1110    }
1111    }
1112   
1113    /*
1114    * Finally add any unmapped peptide start residues (e.g. for incomplete
1115    * codons) as if at the codon position before the second residue
1116    */
1117    // TODO resolve JAL-2022 so this fudge can be removed
1118  5 int mappedSequenceCount = protein.getHeight() - unmappedProtein.size();
1119  5 addUnmappedPeptideStarts(alignedCodons, mappedSequenceCount);
1120   
1121  5 return alignedCodons;
1122    }
1123   
1124    /**
1125    * Scans for any protein mapped from position 2 (meaning unmapped start
1126    * position e.g. an incomplete codon), and synthesizes a 'codon' for it at the
1127    * preceding position in the alignment
1128    *
1129    * @param alignedCodons
1130    * the codon-to-peptide map
1131    * @param mappedSequenceCount
1132    * the number of distinct sequences in the map
1133    */
 
1134  5 toggle protected static void addUnmappedPeptideStarts(
1135    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1136    int mappedSequenceCount)
1137    {
1138    // TODO delete this ugly hack once JAL-2022 is resolved
1139    // i.e. we can model startPhase > 0 (incomplete start codon)
1140   
1141  5 List<SequenceI> sequencesChecked = new ArrayList<>();
1142  5 AlignedCodon lastCodon = null;
1143  5 Map<SequenceI, AlignedCodon> toAdd = new HashMap<>();
1144   
1145  5 for (Entry<AlignedCodon, Map<SequenceI, AlignedCodon>> entry : alignedCodons
1146    .entrySet())
1147    {
1148  1913 for (Entry<SequenceI, AlignedCodon> sequenceCodon : entry.getValue()
1149    .entrySet())
1150    {
1151  10661 SequenceI seq = sequenceCodon.getKey();
1152  10661 if (sequencesChecked.contains(seq))
1153    {
1154  10631 continue;
1155    }
1156  30 sequencesChecked.add(seq);
1157  30 AlignedCodon codon = sequenceCodon.getValue();
1158  30 if (codon.peptideCol > 1)
1159    {
1160  0 System.err.println(
1161    "Problem mapping protein with >1 unmapped start positions: "
1162    + seq.getName());
1163    }
1164  30 else if (codon.peptideCol == 1)
1165    {
1166    /*
1167    * first position (peptideCol == 0) was unmapped - add it
1168    */
1169  6 if (lastCodon != null)
1170    {
1171  5 AlignedCodon firstPeptide = new AlignedCodon(lastCodon.pos1,
1172    lastCodon.pos2, lastCodon.pos3,
1173    String.valueOf(seq.getCharAt(0)), 0);
1174  5 toAdd.put(seq, firstPeptide);
1175    }
1176    else
1177    {
1178    /*
1179    * unmapped residue at start of alignment (no prior column) -
1180    * 'insert' at nominal codon [0, 0, 0]
1181    */
1182  1 AlignedCodon firstPeptide = new AlignedCodon(0, 0, 0,
1183    String.valueOf(seq.getCharAt(0)), 0);
1184  1 toAdd.put(seq, firstPeptide);
1185    }
1186    }
1187  30 if (sequencesChecked.size() == mappedSequenceCount)
1188    {
1189    // no need to check past first mapped position in all sequences
1190  5 break;
1191    }
1192    }
1193  1913 lastCodon = entry.getKey();
1194    }
1195   
1196    /*
1197    * add any new codons safely after iterating over the map
1198    */
1199  5 for (Entry<SequenceI, AlignedCodon> startCodon : toAdd.entrySet())
1200    {
1201  6 addCodonToMap(alignedCodons, startCodon.getValue(),
1202    startCodon.getKey());
1203    }
1204    }
1205   
1206    /**
1207    * Update the aligned protein sequences to match the codon alignments given in
1208    * the map.
1209    *
1210    * @param protein
1211    * @param alignedCodons
1212    * an ordered map of codon positions (columns), with sequence/peptide
1213    * values present in each column
1214    * @param unmappedProtein
1215    * @return
1216    */
 
1217  5 toggle protected static int alignProteinAs(AlignmentI protein,
1218    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1219    List<SequenceI> unmappedProtein)
1220    {
1221    /*
1222    * prefill peptide sequences with gaps
1223    */
1224  5 int alignedWidth = alignedCodons.size();
1225  5 char[] gaps = new char[alignedWidth];
1226  5 Arrays.fill(gaps, protein.getGapCharacter());
1227  5 Map<SequenceI, char[]> peptides = new HashMap<>();
1228  5 for (SequenceI seq : protein.getSequences())
1229    {
1230  31 if (!unmappedProtein.contains(seq))
1231    {
1232  30 peptides.put(seq, Arrays.copyOf(gaps, gaps.length));
1233    }
1234    }
1235   
1236    /*
1237    * Traverse the codons left to right (as defined by CodonComparator)
1238    * and insert peptides in each column where the sequence is mapped.
1239    * This gives a peptide 'alignment' where residues are aligned if their
1240    * corresponding codons occupy the same columns in the cdna alignment.
1241    */
1242  5 int column = 0;
1243  5 for (AlignedCodon codon : alignedCodons.keySet())
1244    {
1245  1914 final Map<SequenceI, AlignedCodon> columnResidues = alignedCodons
1246    .get(codon);
1247  1914 for (Entry<SequenceI, AlignedCodon> entry : columnResidues.entrySet())
1248    {
1249  10682 char residue = entry.getValue().product.charAt(0);
1250  10682 peptides.get(entry.getKey())[column] = residue;
1251    }
1252  1914 column++;
1253    }
1254   
1255    /*
1256    * and finally set the constructed sequences
1257    */
1258  5 for (Entry<SequenceI, char[]> entry : peptides.entrySet())
1259    {
1260  30 entry.getKey().setSequence(new String(entry.getValue()));
1261    }
1262   
1263  5 return 0;
1264    }
1265   
1266    /**
1267    * Populate the map of aligned codons by traversing the given sequence
1268    * mapping, locating the aligned positions of mapped codons, and adding those
1269    * positions and their translation products to the map.
1270    *
1271    * @param dna
1272    * the aligned sequence we are mapping from
1273    * @param protein
1274    * the sequence to be aligned to the codons
1275    * @param gapChar
1276    * the gap character in the dna sequence
1277    * @param seqMap
1278    * a mapping to a sequence translation
1279    * @param alignedCodons
1280    * the map we are building up
1281    */
 
1282  32 toggle static void addCodonPositions(SequenceI dna, SequenceI protein,
1283    char gapChar, Mapping seqMap,
1284    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons)
1285    {
1286  32 Iterator<AlignedCodon> codons = seqMap.getCodonIterator(dna, gapChar);
1287   
1288    /*
1289    * add codon positions, and their peptide translations, to the alignment
1290    * map, while remembering the first codon mapped
1291    */
1292  10716 while (codons.hasNext())
1293    {
1294  10684 try
1295    {
1296  10684 AlignedCodon codon = codons.next();
1297  10684 addCodonToMap(alignedCodons, codon, protein);
1298    } catch (IncompleteCodonException e)
1299    {
1300    // possible incomplete trailing codon - ignore
1301    } catch (NoSuchElementException e)
1302    {
1303    // possibly peptide lacking STOP
1304    }
1305    }
1306    }
1307   
1308    /**
1309    * Helper method to add a codon-to-peptide entry to the aligned codons map
1310    *
1311    * @param alignedCodons
1312    * @param codon
1313    * @param protein
1314    */
 
1315  10690 toggle protected static void addCodonToMap(
1316    Map<AlignedCodon, Map<SequenceI, AlignedCodon>> alignedCodons,
1317    AlignedCodon codon, SequenceI protein)
1318    {
1319  10690 Map<SequenceI, AlignedCodon> seqProduct = alignedCodons.get(codon);
1320  10690 if (seqProduct == null)
1321    {
1322  1914 seqProduct = new HashMap<>();
1323  1914 alignedCodons.put(codon, seqProduct);
1324    }
1325  10690 seqProduct.put(protein, codon);
1326    }
1327   
1328    /**
1329    * Returns true if a cDNA/Protein mapping either exists, or could be made,
1330    * between at least one pair of sequences in the two alignments. Currently,
1331    * the logic is:
1332    * <ul>
1333    * <li>One alignment must be nucleotide, and the other protein</li>
1334    * <li>At least one pair of sequences must be already mapped, or mappable</li>
1335    * <li>Mappable means the nucleotide translation matches the protein
1336    * sequence</li>
1337    * <li>The translation may ignore start and stop codons if present in the
1338    * nucleotide</li>
1339    * </ul>
1340    *
1341    * @param al1
1342    * @param al2
1343    * @return
1344    */
 
1345  8 toggle public static boolean isMappable(AlignmentI al1, AlignmentI al2)
1346    {
1347  8 if (al1 == null || al2 == null)
1348    {
1349  3 return false;
1350    }
1351   
1352    /*
1353    * Require one nucleotide and one protein
1354    */
1355  5 if (al1.isNucleotide() == al2.isNucleotide())
1356    {
1357  3 return false;
1358    }
1359  2 AlignmentI dna = al1.isNucleotide() ? al1 : al2;
1360  2 AlignmentI protein = dna == al1 ? al2 : al1;
1361  2 List<AlignedCodonFrame> mappings = protein.getCodonFrames();
1362  2 for (SequenceI dnaSeq : dna.getSequences())
1363    {
1364  2 for (SequenceI proteinSeq : protein.getSequences())
1365    {
1366  2 if (isMappable(dnaSeq, proteinSeq, mappings))
1367    {
1368  2 return true;
1369    }
1370    }
1371    }
1372  0 return false;
1373    }
1374   
1375    /**
1376    * Returns true if the dna sequence is mapped, or could be mapped, to the
1377    * protein sequence.
1378    *
1379    * @param dnaSeq
1380    * @param proteinSeq
1381    * @param mappings
1382    * @return
1383    */
 
1384  2 toggle protected static boolean isMappable(SequenceI dnaSeq,
1385    SequenceI proteinSeq, List<AlignedCodonFrame> mappings)
1386    {
1387  2 if (dnaSeq == null || proteinSeq == null)
1388    {
1389  0 return false;
1390    }
1391   
1392  2 SequenceI dnaDs = dnaSeq.getDatasetSequence() == null ? dnaSeq
1393    : dnaSeq.getDatasetSequence();
1394  2 SequenceI proteinDs = proteinSeq.getDatasetSequence() == null
1395    ? proteinSeq
1396    : proteinSeq.getDatasetSequence();
1397   
1398  2 for (AlignedCodonFrame mapping : mappings)
1399    {
1400  0 if (proteinDs == mapping.getAaForDnaSeq(dnaDs))
1401    {
1402    /*
1403    * already mapped
1404    */
1405  0 return true;
1406    }
1407    }
1408   
1409    /*
1410    * Just try to make a mapping (it is not yet stored), test whether
1411    * successful.
1412    */
1413  2 return mapCdnaToProtein(proteinDs, dnaDs) != null;
1414    }
1415   
1416    /**
1417    * Finds any reference annotations associated with the sequences in
1418    * sequenceScope, that are not already added to the alignment, and adds them
1419    * to the 'candidates' map. Also populates a lookup table of annotation
1420    * labels, keyed by calcId, for use in constructing tooltips or the like.
1421    *
1422    * @param sequenceScope
1423    * the sequences to scan for reference annotations
1424    * @param labelForCalcId
1425    * (optional) map to populate with label for calcId
1426    * @param candidates
1427    * map to populate with annotations for sequence
1428    * @param al
1429    * the alignment to check for presence of annotations
1430    */
 
1431  46 toggle public static void findAddableReferenceAnnotations(
1432    List<SequenceI> sequenceScope, Map<String, String> labelForCalcId,
1433    final Map<SequenceI, List<AlignmentAnnotation>> candidates,
1434    AlignmentI al)
1435    {
1436  46 if (sequenceScope == null)
1437    {
1438  1 return;
1439    }
1440   
1441    /*
1442    * For each sequence in scope, make a list of any annotations on the
1443    * underlying dataset sequence which are not already on the alignment.
1444    *
1445    * Add to a map of { alignmentSequence, <List of annotations to add> }
1446    */
1447  45 for (SequenceI seq : sequenceScope)
1448    {
1449  41 SequenceI dataset = seq.getDatasetSequence();
1450  41 if (dataset == null)
1451    {
1452  0 continue;
1453    }
1454  41 AlignmentAnnotation[] datasetAnnotations = dataset.getAnnotation();
1455  41 if (datasetAnnotations == null)
1456    {
1457  35 continue;
1458    }
1459  6 final List<AlignmentAnnotation> result = new ArrayList<>();
1460  6 for (AlignmentAnnotation dsann : datasetAnnotations)
1461    {
1462    /*
1463    * Find matching annotations on the alignment. If none is found, then
1464    * add this annotation to the list of 'addable' annotations for this
1465    * sequence.
1466    */
1467  9 final Iterable<AlignmentAnnotation> matchedAlignmentAnnotations = al
1468    .findAnnotations(seq, dsann.getCalcId(), dsann.label);
1469  9 if (!matchedAlignmentAnnotations.iterator().hasNext())
1470    {
1471  6 result.add(dsann);
1472  6 if (labelForCalcId != null)
1473    {
1474  6 labelForCalcId.put(dsann.getCalcId(), dsann.label);
1475    }
1476    }
1477    }
1478    /*
1479    * Save any addable annotations for this sequence
1480    */
1481  6 if (!result.isEmpty())
1482    {
1483  4 candidates.put(seq, result);
1484    }
1485    }
1486    }
1487   
1488    /**
1489    * Adds annotations to the top of the alignment annotations, in the same order
1490    * as their related sequences.
1491    *
1492    * @param annotations
1493    * the annotations to add
1494    * @param alignment
1495    * the alignment to add them to
1496    * @param selectionGroup
1497    * current selection group (or null if none)
1498    */
 
1499  0 toggle public static void addReferenceAnnotations(
1500    Map<SequenceI, List<AlignmentAnnotation>> annotations,
1501    final AlignmentI alignment, final SequenceGroup selectionGroup)
1502    {
1503  0 for (SequenceI seq : annotations.keySet())
1504    {
1505  0 for (AlignmentAnnotation ann : annotations.get(seq))
1506    {
1507  0 AlignmentAnnotation copyAnn = new AlignmentAnnotation(ann);
1508  0 int startRes = 0;
1509  0 int endRes = ann.annotations.length;
1510  0 if (selectionGroup != null)
1511    {
1512  0 startRes = selectionGroup.getStartRes();
1513  0 endRes = selectionGroup.getEndRes();
1514    }
1515  0 copyAnn.restrict(startRes, endRes);
1516   
1517    /*
1518    * Add to the sequence (sets copyAnn.datasetSequence), unless the
1519    * original annotation is already on the sequence.
1520    */
1521  0 if (!seq.hasAnnotation(ann))
1522    {
1523  0 seq.addAlignmentAnnotation(copyAnn);
1524    }
1525    // adjust for gaps
1526  0 copyAnn.adjustForAlignment();
1527    // add to the alignment and set visible
1528  0 alignment.addAnnotation(copyAnn);
1529  0 copyAnn.visible = true;
1530    }
1531    }
1532    }
1533   
1534    /**
1535    * Set visibility of alignment annotations of specified types (labels), for
1536    * specified sequences. This supports controls like "Show all secondary
1537    * structure", "Hide all Temp factor", etc.
1538    *
1539    * @al the alignment to scan for annotations
1540    * @param types
1541    * the types (labels) of annotations to be updated
1542    * @param forSequences
1543    * if not null, only annotations linked to one of these sequences are
1544    * in scope for update; if null, acts on all sequence annotations
1545    * @param anyType
1546    * if this flag is true, 'types' is ignored (label not checked)
1547    * @param doShow
1548    * if true, set visibility on, else set off
1549    */
 
1550  5 toggle public static void showOrHideSequenceAnnotations(AlignmentI al,
1551    Collection<String> types, List<SequenceI> forSequences,
1552    boolean anyType, boolean doShow)
1553    {
1554  5 AlignmentAnnotation[] anns = al.getAlignmentAnnotation();
1555  5 if (anns != null)
1556    {
1557  5 for (AlignmentAnnotation aa : anns)
1558    {
1559  30 if (anyType || types.contains(aa.label))
1560    {
1561  21 if ((aa.sequenceRef != null) && (forSequences == null
1562    || forSequences.contains(aa.sequenceRef)))
1563    {
1564  11 aa.visible = doShow;
1565    }
1566    }
1567    }
1568    }
1569    }
1570   
1571    /**
1572    * Returns true if either sequence has a cross-reference to the other
1573    *
1574    * @param seq1
1575    * @param seq2
1576    * @return
1577    */
 
1578  52 toggle public static boolean haveCrossRef(SequenceI seq1, SequenceI seq2)
1579    {
1580    // Note: moved here from class CrossRef as the latter class has dependencies
1581    // not availability to the applet's classpath
1582  52 return hasCrossRef(seq1, seq2) || hasCrossRef(seq2, seq1);
1583    }
1584   
1585    /**
1586    * Returns true if seq1 has a cross-reference to seq2. Currently this assumes
1587    * that sequence name is structured as Source|AccessionId.
1588    *
1589    * @param seq1
1590    * @param seq2
1591    * @return
1592    */
 
1593  108 toggle public static boolean hasCrossRef(SequenceI seq1, SequenceI seq2)
1594    {
1595  108 if (seq1 == null || seq2 == null)
1596    {
1597  8 return false;
1598    }
1599  100 String name = seq2.getName();
1600  100 final List<DBRefEntry> xrefs = seq1.getDBRefs();
1601  100 if (xrefs != null)
1602    {
1603  34 for (int ix = 0, nx = xrefs.size(); ix < nx; ix++)
1604    {
1605  24 DBRefEntry xref = xrefs.get(ix);
1606  24 String xrefName = xref.getSource() + "|" + xref.getAccessionId();
1607    // case-insensitive test, consistent with DBRefEntry.equalRef()
1608  24 if (xrefName.equalsIgnoreCase(name))
1609    {
1610  12 return true;
1611    }
1612    }
1613    }
1614  88 return false;
1615    }
1616   
1617    /**
1618    * Constructs an alignment consisting of the mapped (CDS) regions in the given
1619    * nucleotide sequences, and updates mappings to match. The CDS sequences are
1620    * added to the original alignment's dataset, which is shared by the new
1621    * alignment. Mappings from nucleotide to CDS, and from CDS to protein, are
1622    * added to the alignment dataset.
1623    *
1624    * @param dna
1625    * aligned nucleotide (dna or cds) sequences
1626    * @param dataset
1627    * the alignment dataset the sequences belong to
1628    * @param products
1629    * (optional) to restrict results to CDS that map to specified
1630    * protein products
1631    * @return an alignment whose sequences are the cds-only parts of the dna
1632    * sequences (or null if no mappings are found)
1633    */
 
1634  6 toggle public static AlignmentI makeCdsAlignment(SequenceI[] dna,
1635    AlignmentI dataset, SequenceI[] products)
1636    {
1637  6 if (dataset == null || dataset.getDataset() != null)
1638    {
1639  0 throw new IllegalArgumentException(
1640    "IMPLEMENTATION ERROR: dataset.getDataset() must be null!");
1641    }
1642  6 List<SequenceI> foundSeqs = new ArrayList<>();
1643  6 List<SequenceI> cdsSeqs = new ArrayList<>();
1644  6 List<AlignedCodonFrame> mappings = dataset.getCodonFrames();
1645  6 HashSet<SequenceI> productSeqs = null;
1646  6 if (products != null)
1647    {
1648  3 productSeqs = new HashSet<>();
1649  3 for (SequenceI seq : products)
1650    {
1651  24 productSeqs.add(seq.getDatasetSequence() == null ? seq : seq
1652    .getDatasetSequence());
1653    }
1654    }
1655   
1656    /*
1657    * Construct CDS sequences from mappings on the alignment dataset.
1658    * The logic is:
1659    * - find the protein product(s) mapped to from each dna sequence
1660    * - if the mapping covers the whole dna sequence (give or take start/stop
1661    * codon), take the dna as the CDS sequence
1662    * - else search dataset mappings for a suitable dna sequence, i.e. one
1663    * whose whole sequence is mapped to the protein
1664    * - if no sequence found, construct one from the dna sequence and mapping
1665    * (and add it to dataset so it is found if this is repeated)
1666    */
1667  6 for (SequenceI dnaSeq : dna)
1668    {
1669  52 SequenceI dnaDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1670    : dnaSeq.getDatasetSequence();
1671   
1672  52 List<AlignedCodonFrame> seqMappings = MappingUtils
1673    .findMappingsForSequence(dnaSeq, mappings);
1674  52 for (AlignedCodonFrame mapping : seqMappings)
1675    {
1676  42 List<Mapping> mappingsFromSequence = mapping
1677    .getMappingsFromSequence(dnaSeq);
1678   
1679  42 for (Mapping aMapping : mappingsFromSequence)
1680    {
1681  44 MapList mapList = aMapping.getMap();
1682  44 if (mapList.getFromRatio() == 1)
1683    {
1684    /*
1685    * not a dna-to-protein mapping (likely dna-to-cds)
1686    */
1687  11 continue;
1688    }
1689   
1690    /*
1691    * skip if mapping is not to one of the target set of proteins
1692    */
1693  33 SequenceI proteinProduct = aMapping.getTo();
1694  33 if (productSeqs != null && !productSeqs.contains(proteinProduct))
1695    {
1696  2 continue;
1697    }
1698   
1699    /*
1700    * try to locate the CDS from the dataset mappings;
1701    * guard against duplicate results (for the case that protein has
1702    * dbrefs to both dna and cds sequences)
1703    */
1704  31 SequenceI cdsSeq = findCdsForProtein(mappings, dnaSeq,
1705    seqMappings, aMapping);
1706  31 if (cdsSeq != null)
1707    {
1708  0 if (!foundSeqs.contains(cdsSeq))
1709    {
1710  0 foundSeqs.add(cdsSeq);
1711  0 SequenceI derivedSequence = cdsSeq.deriveSequence();
1712  0 cdsSeqs.add(derivedSequence);
1713  0 if (!dataset.getSequences().contains(cdsSeq))
1714    {
1715  0 dataset.addSequence(cdsSeq);
1716    }
1717    }
1718  0 continue;
1719    }
1720   
1721    /*
1722    * didn't find mapped CDS sequence - construct it and add
1723    * its dataset sequence to the dataset
1724    */
1725  31 cdsSeq = makeCdsSequence(dnaSeq.getDatasetSequence(), aMapping,
1726    dataset).deriveSequence();
1727    // cdsSeq has a name constructed as CDS|<dbref>
1728    // <dbref> will be either the accession for the coding sequence,
1729    // marked in the /via/ dbref to the protein product accession
1730    // or it will be the original nucleotide accession.
1731  31 SequenceI cdsSeqDss = cdsSeq.getDatasetSequence();
1732   
1733  31 cdsSeqs.add(cdsSeq);
1734   
1735    /*
1736    * build the mapping from CDS to protein
1737    */
1738  31 List<int[]> cdsRange = Collections
1739    .singletonList(new int[]
1740    { cdsSeq.getStart(),
1741    cdsSeq.getLength() + cdsSeq.getStart() - 1 });
1742  31 MapList cdsToProteinMap = new MapList(cdsRange,
1743    mapList.getToRanges(), mapList.getFromRatio(),
1744    mapList.getToRatio());
1745   
1746  31 if (!dataset.getSequences().contains(cdsSeqDss))
1747    {
1748    /*
1749    * if this sequence is a newly created one, add it to the dataset
1750    * and made a CDS to protein mapping (if sequence already exists,
1751    * CDS-to-protein mapping _is_ the transcript-to-protein mapping)
1752    */
1753  9 dataset.addSequence(cdsSeqDss);
1754  9 AlignedCodonFrame cdsToProteinMapping = new AlignedCodonFrame();
1755  9 cdsToProteinMapping.addMap(cdsSeqDss, proteinProduct,
1756    cdsToProteinMap);
1757   
1758    /*
1759    * guard against duplicating the mapping if repeating this action
1760    */
1761  9 if (!mappings.contains(cdsToProteinMapping))
1762    {
1763  9 mappings.add(cdsToProteinMapping);
1764    }
1765    }
1766   
1767  31 propagateDBRefsToCDS(cdsSeqDss, dnaSeq.getDatasetSequence(),
1768    proteinProduct, aMapping);
1769    /*
1770    * add another mapping from original 'from' range to CDS
1771    */
1772  31 AlignedCodonFrame dnaToCdsMapping = new AlignedCodonFrame();
1773  31 final MapList dnaToCdsMap = new MapList(mapList.getFromRanges(),
1774    cdsRange, 1, 1);
1775  31 dnaToCdsMapping.addMap(dnaSeq.getDatasetSequence(), cdsSeqDss,
1776    dnaToCdsMap);
1777  31 if (!mappings.contains(dnaToCdsMapping))
1778    {
1779  20 mappings.add(dnaToCdsMapping);
1780    }
1781   
1782    /*
1783    * transfer dna chromosomal loci (if known) to the CDS
1784    * sequence (via the mapping)
1785    */
1786  31 final MapList cdsToDnaMap = dnaToCdsMap.getInverse();
1787  31 transferGeneLoci(dnaSeq, cdsToDnaMap, cdsSeq);
1788   
1789    /*
1790    * add DBRef with mapping from protein to CDS
1791    * (this enables Get Cross-References from protein alignment)
1792    * This is tricky because we can't have two DBRefs with the
1793    * same source and accession, so need a different accession for
1794    * the CDS from the dna sequence
1795    */
1796   
1797    // specific use case:
1798    // Genomic contig ENSCHR:1, contains coding regions for ENSG01,
1799    // ENSG02, ENSG03, with transcripts and products similarly named.
1800    // cannot add distinct dbrefs mapping location on ENSCHR:1 to ENSG01
1801   
1802    // JBPNote: ?? can't actually create an example that demonstrates we
1803    // need to
1804    // synthesize an xref.
1805   
1806  31 List<DBRefEntry> primrefs = dnaDss.getPrimaryDBRefs();
1807  55 for (int ip = 0, np = primrefs.size(); ip < np; ip++)
1808    {
1809  24 DBRefEntry primRef = primrefs.get(ip);
1810    /*
1811    * create a cross-reference from CDS to the source sequence's
1812    * primary reference and vice versa
1813    */
1814  24 String source = primRef.getSource();
1815  24 String version = primRef.getVersion();
1816  24 DBRefEntry cdsCrossRef = new DBRefEntry(source, source + ":"
1817    + version, primRef.getAccessionId());
1818  24 cdsCrossRef.setMap(new Mapping(dnaDss, new MapList(cdsToDnaMap)));
1819  24 cdsSeqDss.addDBRef(cdsCrossRef);
1820   
1821  24 dnaSeq.addDBRef(new DBRefEntry(source, version, cdsSeq
1822    .getName(), new Mapping(cdsSeqDss, dnaToCdsMap)));
1823    // problem here is that the cross-reference is synthesized -
1824    // cdsSeq.getName() may be like 'CDS|dnaaccession' or
1825    // 'CDS|emblcdsacc'
1826    // assuming cds version same as dna ?!?
1827   
1828  24 DBRefEntry proteinToCdsRef = new DBRefEntry(source, version,
1829    cdsSeq.getName());
1830    //
1831  24 proteinToCdsRef.setMap(new Mapping(cdsSeqDss, cdsToProteinMap
1832    .getInverse()));
1833  24 proteinProduct.addDBRef(proteinToCdsRef);
1834    }
1835    /*
1836    * transfer any features on dna that overlap the CDS
1837    */
1838  31 transferFeatures(dnaSeq, cdsSeq, dnaToCdsMap, null,
1839    SequenceOntologyI.CDS);
1840    }
1841    }
1842    }
1843   
1844  6 AlignmentI cds = new Alignment(cdsSeqs.toArray(new SequenceI[cdsSeqs
1845    .size()]));
1846  6 cds.setDataset(dataset);
1847   
1848  6 return cds;
1849    }
1850   
1851    /**
1852    * Tries to transfer gene loci (dbref to chromosome positions) from fromSeq to
1853    * toSeq, mediated by the given mapping between the sequences
1854    *
1855    * @param fromSeq
1856    * @param targetToFrom
1857    * Map
1858    * @param targetSeq
1859    */
 
1860  34 toggle protected static void transferGeneLoci(SequenceI fromSeq,
1861    MapList targetToFrom, SequenceI targetSeq)
1862    {
1863  34 if (targetSeq.getGeneLoci() != null)
1864    {
1865    // already have - don't override
1866  23 return;
1867    }
1868  11 GeneLociI fromLoci = fromSeq.getGeneLoci();
1869  11 if (fromLoci == null)
1870    {
1871  10 return;
1872    }
1873   
1874  1 MapList newMap = targetToFrom.traverse(fromLoci.getMapping());
1875   
1876  1 if (newMap != null)
1877    {
1878  1 targetSeq.setGeneLoci(fromLoci.getSpeciesId(),
1879    fromLoci.getAssemblyId(), fromLoci.getChromosomeId(), newMap);
1880    }
1881    }
1882   
1883    /**
1884    * A helper method that finds a CDS sequence in the alignment dataset that is
1885    * mapped to the given protein sequence, and either is, or has a mapping from,
1886    * the given dna sequence.
1887    *
1888    * @param mappings
1889    * set of all mappings on the dataset
1890    * @param dnaSeq
1891    * a dna (or cds) sequence we are searching from
1892    * @param seqMappings
1893    * the set of mappings involving dnaSeq
1894    * @param aMapping
1895    * a transcript-to-peptide mapping
1896    * @return
1897    */
 
1898  31 toggle static SequenceI findCdsForProtein(List<AlignedCodonFrame> mappings,
1899    SequenceI dnaSeq, List<AlignedCodonFrame> seqMappings,
1900    Mapping aMapping)
1901    {
1902    /*
1903    * TODO a better dna-cds-protein mapping data representation to allow easy
1904    * navigation; until then this clunky looping around lists of mappings
1905    */
1906  31 SequenceI seqDss = dnaSeq.getDatasetSequence() == null ? dnaSeq
1907    : dnaSeq.getDatasetSequence();
1908  31 SequenceI proteinProduct = aMapping.getTo();
1909   
1910    /*
1911    * is this mapping from the whole dna sequence (i.e. CDS)?
1912    * allowing for possible stop codon on dna but not peptide
1913    */
1914  31 int mappedFromLength = MappingUtils
1915    .getLength(aMapping.getMap().getFromRanges());
1916  31 int dnaLength = seqDss.getLength();
1917  31 if (mappedFromLength == dnaLength
1918    || mappedFromLength == dnaLength - CODON_LENGTH)
1919    {
1920    /*
1921    * if sequence has CDS features, this is a transcript with no UTR
1922    * - do not take this as the CDS sequence! (JAL-2789)
1923    */
1924  0 if (seqDss.getFeatures().getFeaturesByOntology(SequenceOntologyI.CDS)
1925    .isEmpty())
1926    {
1927  0 return seqDss;
1928    }
1929    }
1930   
1931    /*
1932    * looks like we found the dna-to-protein mapping; search for the
1933    * corresponding cds-to-protein mapping
1934    */
1935  31 List<AlignedCodonFrame> mappingsToPeptide = MappingUtils
1936    .findMappingsForSequence(proteinProduct, mappings);
1937  31 for (AlignedCodonFrame acf : mappingsToPeptide)
1938    {
1939  31 for (SequenceToSequenceMapping map : acf.getMappings())
1940    {
1941  253 Mapping mapping = map.getMapping();
1942  253 if (mapping != aMapping
1943    && mapping.getMap().getFromRatio() == CODON_LENGTH
1944    && proteinProduct == mapping.getTo()
1945    && seqDss != map.getFromSeq())
1946    {
1947  0 mappedFromLength = MappingUtils
1948    .getLength(mapping.getMap().getFromRanges());
1949  0 if (mappedFromLength == map.getFromSeq().getLength())
1950    {
1951    /*
1952    * found a 3:1 mapping to the protein product which covers
1953    * the whole dna sequence i.e. is from CDS; finally check the CDS
1954    * is mapped from the given dna start sequence
1955    */
1956  0 SequenceI cdsSeq = map.getFromSeq();
1957    // todo this test is weak if seqMappings contains multiple mappings;
1958    // we get away with it if transcript:cds relationship is 1:1
1959  0 List<AlignedCodonFrame> dnaToCdsMaps = MappingUtils
1960    .findMappingsForSequence(cdsSeq, seqMappings);
1961  0 if (!dnaToCdsMaps.isEmpty())
1962    {
1963  0 return cdsSeq;
1964    }
1965    }
1966    }
1967    }
1968    }
1969  31 return null;
1970    }
1971   
1972    /**
1973    * Helper method that makes a CDS sequence as defined by the mappings from the
1974    * given sequence i.e. extracts the 'mapped from' ranges (which may be on
1975    * forward or reverse strand).
1976    *
1977    * @param seq
1978    * @param mapping
1979    * @param dataset
1980    * - existing dataset. We check for sequences that look like the CDS
1981    * we are about to construct, if one exists already, then we will
1982    * just return that one.
1983    * @return CDS sequence (as a dataset sequence)
1984    */
 
1985  31 toggle static SequenceI makeCdsSequence(SequenceI seq, Mapping mapping,
1986    AlignmentI dataset)
1987    {
1988    /*
1989    * construct CDS sequence name as "CDS|" with 'from id' held in the mapping
1990    * if set (e.g. EMBL protein_id), else sequence name appended
1991    */
1992  31 String mapFromId = mapping.getMappedFromId();
1993  31 final String seqId = "CDS|"
1994  31 + (mapFromId != null ? mapFromId : seq.getName());
1995   
1996  31 SequenceI newSeq = null;
1997   
1998  31 final MapList maplist = mapping.getMap();
1999  31 if (maplist.isContiguous() && maplist.isFromForwardStrand())
2000    {
2001    /*
2002    * just a subsequence, keep same dataset sequence
2003    */
2004  22 int start = maplist.getFromLowest();
2005  22 int end = maplist.getFromHighest();
2006  22 newSeq = seq.getSubSequence(start - 1, end);
2007  22 newSeq.setName(seqId);
2008    }
2009    else
2010    {
2011    /*
2012    * construct by splicing mapped from ranges
2013    */
2014  9 char[] seqChars = seq.getSequence();
2015  9 List<int[]> fromRanges = maplist.getFromRanges();
2016  9 int cdsWidth = MappingUtils.getLength(fromRanges);
2017  9 char[] newSeqChars = new char[cdsWidth];
2018   
2019  9 int newPos = 0;
2020  9 for (int[] range : fromRanges)
2021    {
2022  21 if (range[0] <= range[1])
2023    {
2024    // forward strand mapping - just copy the range
2025  21 int length = range[1] - range[0] + 1;
2026  21 System.arraycopy(seqChars, range[0] - 1, newSeqChars, newPos,
2027    length);
2028  21 newPos += length;
2029    }
2030    else
2031    {
2032    // reverse strand mapping - copy and complement one by one
2033  0 for (int i = range[0]; i >= range[1]; i--)
2034    {
2035  0 newSeqChars[newPos++] = Dna.getComplement(seqChars[i - 1]);
2036    }
2037    }
2038    }
2039   
2040  9 newSeq = new Sequence(seqId, newSeqChars, 1, newPos);
2041    }
2042   
2043  31 if (dataset != null)
2044    {
2045  31 SequenceI[] matches = dataset.findSequenceMatch(newSeq.getName());
2046  31 if (matches != null)
2047    {
2048  31 boolean matched = false;
2049  31 for (SequenceI mtch : matches)
2050    {
2051  3 if (mtch.getStart() != newSeq.getStart())
2052    {
2053  0 continue;
2054    }
2055  3 if (mtch.getEnd() != newSeq.getEnd())
2056    {
2057  0 continue;
2058    }
2059  3 if (!Arrays.equals(mtch.getSequence(), newSeq.getSequence()))
2060    {
2061  3 continue;
2062    }
2063  0 if (!matched)
2064    {
2065  0 matched = true;
2066  0 newSeq = mtch;
2067    }
2068    else
2069    {
2070  0 System.err.println(
2071    "JAL-2154 regression: warning - found (and ignnored a duplicate CDS sequence):"
2072    + mtch.toString());
2073    }
2074    }
2075    }
2076    }
2077    // newSeq.setDescription(mapFromId);
2078   
2079  31 return newSeq;
2080    }
2081   
2082    /**
2083    * Adds any DBRefEntrys to cdsSeq from contig that have a Mapping congruent to
2084    * the given mapping.
2085    *
2086    * @param cdsSeq
2087    * @param contig
2088    * @param proteinProduct
2089    * @param mapping
2090    * @return list of DBRefEntrys added
2091    */
 
2092  31 toggle protected static List<DBRefEntry> propagateDBRefsToCDS(SequenceI cdsSeq,
2093    SequenceI contig, SequenceI proteinProduct, Mapping mapping)
2094    {
2095   
2096    // gather direct refs from contig congruent with mapping
2097  31 List<DBRefEntry> direct = new ArrayList<>();
2098  31 HashSet<String> directSources = new HashSet<>();
2099   
2100  31 List<DBRefEntry> refs = contig.getDBRefs();
2101  31 if (refs != null)
2102    {
2103  600 for (int ib = 0, nb = refs.size(); ib < nb; ib++)
2104    {
2105  576 DBRefEntry dbr = refs.get(ib);
2106  576 MapList map;
2107  ? if (dbr.hasMap() && (map = dbr.getMap().getMap()).isTripletMap())
2108    {
2109    // check if map is the CDS mapping
2110  46 if (mapping.getMap().equals(map))
2111    {
2112  46 direct.add(dbr);
2113  46 directSources.add(dbr.getSource());
2114    }
2115    }
2116    }
2117    }
2118  31 List<DBRefEntry> onSource = DBRefUtils.selectRefs(
2119    proteinProduct.getDBRefs(),
2120    directSources.toArray(new String[0]));
2121  31 List<DBRefEntry> propagated = new ArrayList<>();
2122   
2123    // and generate appropriate mappings
2124  77 for (int ic = 0, nc = direct.size(); ic < nc; ic++)
2125    {
2126  46 DBRefEntry cdsref = direct.get(ic);
2127  46 Mapping m = cdsref.getMap();
2128    // clone maplist and mapping
2129  46 MapList cdsposmap = new MapList(
2130    Arrays.asList(new int[][]
2131    { new int[] { cdsSeq.getStart(), cdsSeq.getEnd() } }),
2132    m.getMap().getToRanges(), 3, 1);
2133  46 Mapping cdsmap = new Mapping(m.getTo(), m.getMap());
2134   
2135    // create dbref
2136  46 DBRefEntry newref = new DBRefEntry(cdsref.getSource(),
2137    cdsref.getVersion(), cdsref.getAccessionId(),
2138    new Mapping(cdsmap.getTo(), cdsposmap));
2139   
2140    // and see if we can map to the protein product for this mapping.
2141    // onSource is the filtered set of accessions on protein that we are
2142    // tranferring, so we assume accession is the same.
2143  46 if (cdsmap.getTo() == null && onSource != null)
2144    {
2145  2 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefs(onSource,
2146    cdsref.getAccessionId());
2147  2 if (sourceRefs != null)
2148    {
2149  2 for (DBRefEntry srcref : sourceRefs)
2150    {
2151  2 if (srcref.getSource().equalsIgnoreCase(cdsref.getSource()))
2152    {
2153    // we have found a complementary dbref on the protein product, so
2154    // update mapping's getTo
2155  2 newref.getMap().setTo(proteinProduct);
2156    }
2157    }
2158    }
2159    }
2160  46 cdsSeq.addDBRef(newref);
2161  46 propagated.add(newref);
2162    }
2163  31 return propagated;
2164    }
2165   
2166    /**
2167    * Transfers co-located features on 'fromSeq' to 'toSeq', adjusting the
2168    * feature start/end ranges, optionally omitting specified feature types.
2169    * Returns the number of features copied.
2170    *
2171    * @param fromSeq
2172    * @param toSeq
2173    * @param mapping
2174    * the mapping from 'fromSeq' to 'toSeq'
2175    * @param select
2176    * if not null, only features of this type are copied (including
2177    * subtypes in the Sequence Ontology)
2178    * @param omitting
2179    */
 
2180  34 toggle protected static int transferFeatures(SequenceI fromSeq, SequenceI toSeq,
2181    MapList mapping, String select, String... omitting)
2182    {
2183  34 SequenceI copyTo = toSeq;
2184  65 while (copyTo.getDatasetSequence() != null)
2185    {
2186  31 copyTo = copyTo.getDatasetSequence();
2187    }
2188  34 if (fromSeq == copyTo || fromSeq.getDatasetSequence() == copyTo)
2189    {
2190  22 return 0; // shared dataset sequence
2191    }
2192   
2193    /*
2194    * get features, optionally restricted by an ontology term
2195    */
2196  12 List<SequenceFeature> sfs = select == null ? fromSeq.getFeatures()
2197    .getPositionalFeatures() : fromSeq.getFeatures()
2198    .getFeaturesByOntology(select);
2199   
2200  12 int count = 0;
2201  12 for (SequenceFeature sf : sfs)
2202    {
2203  14 String type = sf.getType();
2204  14 boolean omit = false;
2205  14 for (String toOmit : omitting)
2206    {
2207  7 if (type.equals(toOmit))
2208    {
2209  2 omit = true;
2210    }
2211    }
2212  14 if (omit)
2213    {
2214  2 continue;
2215    }
2216   
2217    /*
2218    * locate the mapped range - null if either start or end is
2219    * not mapped (no partial overlaps are calculated)
2220    */
2221  12 int start = sf.getBegin();
2222  12 int end = sf.getEnd();
2223  12 int[] mappedTo = mapping.locateInTo(start, end);
2224    /*
2225    * if whole exon range doesn't map, try interpreting it
2226    * as 5' or 3' exon overlapping the CDS range
2227    */
2228  12 if (mappedTo == null)
2229    {
2230  5 mappedTo = mapping.locateInTo(end, end);
2231  5 if (mappedTo != null)
2232    {
2233    /*
2234    * end of exon is in CDS range - 5' overlap
2235    * to a range from the start of the peptide
2236    */
2237  1 mappedTo[0] = 1;
2238    }
2239    }
2240  12 if (mappedTo == null)
2241    {
2242  4 mappedTo = mapping.locateInTo(start, start);
2243  4 if (mappedTo != null)
2244    {
2245    /*
2246    * start of exon is in CDS range - 3' overlap
2247    * to a range up to the end of the peptide
2248    */
2249  1 mappedTo[1] = toSeq.getLength();
2250    }
2251    }
2252  12 if (mappedTo != null)
2253    {
2254  9 int newBegin = Math.min(mappedTo[0], mappedTo[1]);
2255  9 int newEnd = Math.max(mappedTo[0], mappedTo[1]);
2256  9 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
2257    sf.getFeatureGroup(), sf.getScore());
2258  9 copyTo.addSequenceFeature(copy);
2259  9 count++;
2260    }
2261    }
2262  12 return count;
2263    }
2264   
2265    /**
2266    * Returns a mapping from dna to protein by inspecting sequence features of
2267    * type "CDS" on the dna. A mapping is constructed if the total CDS feature
2268    * length is 3 times the peptide length (optionally after dropping a trailing
2269    * stop codon). This method does not check whether the CDS nucleotide sequence
2270    * translates to the peptide sequence.
2271    *
2272    * @param dnaSeq
2273    * @param proteinSeq
2274    * @return
2275    */
 
2276  12 toggle public static MapList mapCdsToProtein(SequenceI dnaSeq,
2277    SequenceI proteinSeq)
2278    {
2279  12 List<int[]> ranges = findCdsPositions(dnaSeq);
2280  12 int mappedDnaLength = MappingUtils.getLength(ranges);
2281   
2282    /*
2283    * if not a whole number of codons, truncate mapping
2284    */
2285  12 int codonRemainder = mappedDnaLength % CODON_LENGTH;
2286  12 if (codonRemainder > 0)
2287    {
2288  2 mappedDnaLength -= codonRemainder;
2289  2 MappingUtils.removeEndPositions(codonRemainder, ranges);
2290    }
2291   
2292  12 int proteinLength = proteinSeq.getLength();
2293  12 int proteinStart = proteinSeq.getStart();
2294  12 int proteinEnd = proteinSeq.getEnd();
2295   
2296    /*
2297    * incomplete start codon may mean X at start of peptide
2298    * we ignore both for mapping purposes
2299    */
2300  12 if (proteinSeq.getCharAt(0) == 'X')
2301    {
2302    // todo JAL-2022 support startPhase > 0
2303  1 proteinStart++;
2304  1 proteinLength--;
2305    }
2306  12 List<int[]> proteinRange = new ArrayList<>();
2307   
2308    /*
2309    * dna length should map to protein (or protein plus stop codon)
2310    */
2311  12 int codesForResidues = mappedDnaLength / CODON_LENGTH;
2312  12 if (codesForResidues == (proteinLength + 1))
2313    {
2314    // assuming extra codon is for STOP and not in peptide
2315    // todo: check trailing codon is indeed a STOP codon
2316  2 codesForResidues--;
2317  2 mappedDnaLength -= CODON_LENGTH;
2318  2 MappingUtils.removeEndPositions(CODON_LENGTH, ranges);
2319    }
2320   
2321  12 if (codesForResidues == proteinLength)
2322    {
2323  4 proteinRange.add(new int[] { proteinStart, proteinEnd });
2324  4 return new MapList(ranges, proteinRange, CODON_LENGTH, 1);
2325    }
2326  8 return null;
2327    }
2328   
2329    /**
2330    * Returns a list of CDS ranges found (as sequence positions base 1), i.e. of
2331    * [start, end] positions of sequence features of type "CDS" (or a sub-type of
2332    * CDS in the Sequence Ontology). The ranges are sorted into ascending start
2333    * position order, so this method is only valid for linear CDS in the same
2334    * sense as the protein product.
2335    *
2336    * @param dnaSeq
2337    * @return
2338    */
 
2339  14 toggle protected static List<int[]> findCdsPositions(SequenceI dnaSeq)
2340    {
2341  14 List<int[]> result = new ArrayList<>();
2342   
2343  14 List<SequenceFeature> sfs = dnaSeq.getFeatures().getFeaturesByOntology(
2344    SequenceOntologyI.CDS);
2345  14 if (sfs.isEmpty())
2346    {
2347  6 return result;
2348    }
2349  8 SequenceFeatures.sortFeatures(sfs, true);
2350   
2351  8 for (SequenceFeature sf : sfs)
2352    {
2353  16 int phase = 0;
2354  16 try
2355    {
2356  16 String s = sf.getPhase();
2357  16 if (s != null)
2358    {
2359  2 phase = Integer.parseInt(s);
2360    }
2361    } catch (NumberFormatException e)
2362    {
2363    // leave as zero
2364    }
2365    /*
2366    * phase > 0 on first codon means 5' incomplete - skip to the start
2367    * of the next codon; example ENST00000496384
2368    */
2369  16 int begin = sf.getBegin();
2370  16 int end = sf.getEnd();
2371  16 if (result.isEmpty() && phase > 0)
2372    {
2373  2 begin += phase;
2374  2 if (begin > end)
2375    {
2376    // shouldn't happen!
2377  0 System.err
2378    .println("Error: start phase extends beyond start CDS in "
2379    + dnaSeq.getName());
2380    }
2381    }
2382  16 result.add(new int[] { begin, end });
2383    }
2384   
2385    /*
2386    * Finally sort ranges by start position. This avoids a dependency on
2387    * keeping features in order on the sequence (if they are in order anyway,
2388    * the sort will have almost no work to do). The implicit assumption is CDS
2389    * ranges are assembled in order. Other cases should not use this method,
2390    * but instead construct an explicit mapping for CDS (e.g. EMBL parsing).
2391    */
2392  8 Collections.sort(result, IntRangeComparator.ASCENDING);
2393  8 return result;
2394    }
2395   
2396    /**
2397    * Makes an alignment with a copy of the given sequences, adding in any
2398    * non-redundant sequences which are mapped to by the cross-referenced
2399    * sequences.
2400    *
2401    * @param seqs
2402    * @param xrefs
2403    * @param dataset
2404    * the alignment dataset shared by the new copy
2405    * @return
2406    */
 
2407  0 toggle public static AlignmentI makeCopyAlignment(SequenceI[] seqs,
2408    SequenceI[] xrefs, AlignmentI dataset)
2409    {
2410  0 AlignmentI copy = new Alignment(new Alignment(seqs));
2411  0 copy.setDataset(dataset);
2412  0 boolean isProtein = !copy.isNucleotide();
2413  0 SequenceIdMatcher matcher = new SequenceIdMatcher(seqs);
2414  0 if (xrefs != null)
2415    {
2416    // BH 2019.01.25 recoded to remove iterators
2417   
2418  0 for (int ix = 0, nx = xrefs.length; ix < nx; ix++)
2419    {
2420  0 SequenceI xref = xrefs[ix];
2421  0 List<DBRefEntry> dbrefs = xref.getDBRefs();
2422  0 if (dbrefs != null)
2423    {
2424  0 for (int ir = 0, nir = dbrefs.size(); ir < nir; ir++)
2425    {
2426  0 DBRefEntry dbref = dbrefs.get(ir);
2427  0 Mapping map = dbref.getMap();
2428  0 SequenceI mto;
2429  0 if (map == null || (mto = map.getTo()) == null
2430    || mto.isProtein() != isProtein)
2431    {
2432  0 continue;
2433    }
2434  0 SequenceI mappedTo = mto;
2435  0 SequenceI match = matcher.findIdMatch(mappedTo);
2436  0 if (match == null)
2437    {
2438  0 matcher.add(mappedTo);
2439  0 copy.addSequence(mappedTo);
2440    }
2441    }
2442    }
2443    }
2444    }
2445  0 return copy;
2446    }
2447   
2448    /**
2449    * Try to align sequences in 'unaligned' to match the alignment of their
2450    * mapped regions in 'aligned'. For example, could use this to align CDS
2451    * sequences which are mapped to their parent cDNA sequences.
2452    *
2453    * This method handles 1:1 mappings (dna-to-dna or protein-to-protein). For
2454    * dna-to-protein or protein-to-dna use alternative methods.
2455    *
2456    * @param unaligned
2457    * sequences to be aligned
2458    * @param aligned
2459    * holds aligned sequences and their mappings
2460    * @return
2461    */
 
2462  4 toggle public static int alignAs(AlignmentI unaligned, AlignmentI aligned)
2463    {
2464    /*
2465    * easy case - aligning a copy of aligned sequences
2466    */
2467  4 if (alignAsSameSequences(unaligned, aligned))
2468    {
2469  2 return unaligned.getHeight();
2470    }
2471   
2472    /*
2473    * fancy case - aligning via mappings between sequences
2474    */
2475  2 List<SequenceI> unmapped = new ArrayList<>();
2476  2 Map<Integer, Map<SequenceI, Character>> columnMap = buildMappedColumnsMap(
2477    unaligned, aligned, unmapped);
2478  2 int width = columnMap.size();
2479  2 char gap = unaligned.getGapCharacter();
2480  2 int realignedCount = 0;
2481    // TODO: verify this loop scales sensibly for very wide/high alignments
2482   
2483  2 for (SequenceI seq : unaligned.getSequences())
2484    {
2485  4 if (!unmapped.contains(seq))
2486    {
2487  4 char[] newSeq = new char[width];
2488  4 Arrays.fill(newSeq, gap); // JBPComment - doubt this is faster than the
2489    // Integer iteration below
2490  4 int newCol = 0;
2491  4 int lastCol = 0;
2492   
2493    /*
2494    * traverse the map to find columns populated
2495    * by our sequence
2496    */
2497  4 for (Integer column : columnMap.keySet())
2498    {
2499  60 Character c = columnMap.get(column).get(seq);
2500  60 if (c != null)
2501    {
2502    /*
2503    * sequence has a character at this position
2504    *
2505    */
2506  36 newSeq[newCol] = c;
2507  36 lastCol = newCol;
2508    }
2509  60 newCol++;
2510    }
2511   
2512    /*
2513    * trim trailing gaps
2514    */
2515  4 if (lastCol < width)
2516    {
2517  4 char[] tmp = new char[lastCol + 1];
2518  4 System.arraycopy(newSeq, 0, tmp, 0, lastCol + 1);
2519  4 newSeq = tmp;
2520    }
2521    // TODO: optimise SequenceI to avoid char[]->String->char[]
2522  4 seq.setSequence(String.valueOf(newSeq));
2523  4 realignedCount++;
2524    }
2525    }
2526  2 return realignedCount;
2527    }
2528   
2529    /**
2530    * If unaligned and aligned sequences share the same dataset sequences, then
2531    * simply copies the aligned sequences to the unaligned sequences and returns
2532    * true; else returns false
2533    *
2534    * @param unaligned
2535    * - sequences to be aligned based on aligned
2536    * @param aligned
2537    * - 'guide' alignment containing sequences derived from same
2538    * dataset as unaligned
2539    * @return
2540    */
 
2541  8 toggle static boolean alignAsSameSequences(AlignmentI unaligned,
2542    AlignmentI aligned)
2543    {
2544  8 if (aligned.getDataset() == null || unaligned.getDataset() == null)
2545    {
2546  0 return false; // should only pass alignments with datasets here
2547    }
2548   
2549    // map from dataset sequence to alignment sequence(s)
2550  8 Map<SequenceI, List<SequenceI>> alignedDatasets = new HashMap<>();
2551  8 for (SequenceI seq : aligned.getSequences())
2552    {
2553  59 SequenceI ds = seq.getDatasetSequence();
2554  59 if (alignedDatasets.get(ds) == null)
2555    {
2556  58 alignedDatasets.put(ds, new ArrayList<SequenceI>());
2557    }
2558  59 alignedDatasets.get(ds).add(seq);
2559    }
2560   
2561    /*
2562    * first pass - check whether all sequences to be aligned share a
2563    * dataset sequence with an aligned sequence; also note the leftmost
2564    * ungapped column from which to copy
2565    */
2566  8 int leftmost = Integer.MAX_VALUE;
2567  8 for (SequenceI seq : unaligned.getSequences())
2568    {
2569  34 final SequenceI ds = seq.getDatasetSequence();
2570  34 if (!alignedDatasets.containsKey(ds))
2571    {
2572  3 return false;
2573    }
2574  31 SequenceI alignedSeq = alignedDatasets.get(ds)
2575    .get(0);
2576  31 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2577  31 leftmost = Math.min(leftmost, startCol);
2578    }
2579   
2580    /*
2581    * second pass - copy aligned sequences;
2582    * heuristic rule: pair off sequences in order for the case where
2583    * more than one shares the same dataset sequence
2584    */
2585  5 final char gapCharacter = aligned.getGapCharacter();
2586  5 for (SequenceI seq : unaligned.getSequences())
2587    {
2588  29 List<SequenceI> alignedSequences = alignedDatasets
2589    .get(seq.getDatasetSequence());
2590  29 if (alignedSequences.isEmpty())
2591    {
2592    /*
2593    * defensive check - shouldn't happen! (JAL-3536)
2594    */
2595  0 continue;
2596    }
2597  29 SequenceI alignedSeq = alignedSequences.get(0);
2598   
2599    /*
2600    * gap fill for leading (5') UTR if any
2601    */
2602    // TODO this copies intron columns - wrong!
2603  29 int startCol = alignedSeq.findIndex(seq.getStart()); // 1..
2604  29 int endCol = alignedSeq.findIndex(seq.getEnd());
2605  29 char[] seqchars = new char[endCol - leftmost + 1];
2606  29 Arrays.fill(seqchars, gapCharacter);
2607  29 char[] toCopy = alignedSeq.getSequence(startCol - 1, endCol);
2608  29 System.arraycopy(toCopy, 0, seqchars, startCol - leftmost,
2609    toCopy.length);
2610  29 seq.setSequence(String.valueOf(seqchars));
2611  29 if (alignedSequences.size() > 0)
2612    {
2613    // pop off aligned sequences (except the last one)
2614  29 alignedSequences.remove(0);
2615    }
2616    }
2617   
2618    /*
2619    * finally remove gapped columns (e.g. introns)
2620    */
2621  5 new RemoveGapColCommand("", unaligned.getSequencesArray(), 0,
2622    unaligned.getWidth() - 1, unaligned);
2623   
2624  5 return true;
2625    }
2626   
2627    /**
2628    * Returns a map whose key is alignment column number (base 1), and whose
2629    * values are a map of sequence characters in that column.
2630    *
2631    * @param unaligned
2632    * @param aligned
2633    * @param unmapped
2634    * @return
2635    */
 
2636  2 toggle static SortedMap<Integer, Map<SequenceI, Character>> buildMappedColumnsMap(
2637    AlignmentI unaligned, AlignmentI aligned,
2638    List<SequenceI> unmapped)
2639    {
2640    /*
2641    * Map will hold, for each aligned column position, a map of
2642    * {unalignedSequence, characterPerSequence} at that position.
2643    * TreeMap keeps the entries in ascending column order.
2644    */
2645  2 SortedMap<Integer, Map<SequenceI, Character>> map = new TreeMap<>();
2646   
2647    /*
2648    * record any sequences that have no mapping so can't be realigned
2649    */
2650  2 unmapped.addAll(unaligned.getSequences());
2651   
2652  2 List<AlignedCodonFrame> mappings = aligned.getCodonFrames();
2653   
2654  2 for (SequenceI seq : unaligned.getSequences())
2655    {
2656  4 for (AlignedCodonFrame mapping : mappings)
2657    {
2658  4 SequenceI fromSeq = mapping.findAlignedSequence(seq, aligned);
2659  4 if (fromSeq != null)
2660    {
2661  4 Mapping seqMap = mapping.getMappingBetween(fromSeq, seq);
2662  4 if (addMappedPositions(seq, fromSeq, seqMap, map))
2663    {
2664  4 unmapped.remove(seq);
2665    }
2666    }
2667    }
2668    }
2669  2 return map;
2670    }
2671   
2672    /**
2673    * Helper method that adds to a map the mapped column positions of a sequence.
2674    * <br>
2675    * For example if aaTT-Tg-gAAA is mapped to TTTAAA then the map should record
2676    * that columns 3,4,6,10,11,12 map to characters T,T,T,A,A,A of the mapped to
2677    * sequence.
2678    *
2679    * @param seq
2680    * the sequence whose column positions we are recording
2681    * @param fromSeq
2682    * a sequence that is mapped to the first sequence
2683    * @param seqMap
2684    * the mapping from 'fromSeq' to 'seq'
2685    * @param map
2686    * a map to add the column positions (in fromSeq) of the mapped
2687    * positions of seq
2688    * @return
2689    */
 
2690  6 toggle static boolean addMappedPositions(SequenceI seq, SequenceI fromSeq,
2691    Mapping seqMap, Map<Integer, Map<SequenceI, Character>> map)
2692    {
2693  6 if (seqMap == null)
2694    {
2695  0 return false;
2696    }
2697   
2698    /*
2699    * invert mapping if it is from unaligned to aligned sequence
2700    */
2701  6 if (seqMap.getTo() == fromSeq.getDatasetSequence())
2702    {
2703  0 seqMap = new Mapping(seq.getDatasetSequence(),
2704    seqMap.getMap().getInverse());
2705    }
2706   
2707  6 int toStart = seq.getStart();
2708   
2709    /*
2710    * traverse [start, end, start, end...] ranges in fromSeq
2711    */
2712  6 for (int[] fromRange : seqMap.getMap().getFromRanges())
2713    {
2714  18 for (int i = 0; i < fromRange.length - 1; i += 2)
2715    {
2716  9 boolean forward = fromRange[i + 1] >= fromRange[i];
2717   
2718    /*
2719    * find the range mapped to (sequence positions base 1)
2720    */
2721  9 int[] range = seqMap.locateMappedRange(fromRange[i],
2722    fromRange[i + 1]);
2723  9 if (range == null)
2724    {
2725  0 System.err.println("Error in mapping " + seqMap + " from "
2726    + fromSeq.getName());
2727  0 return false;
2728    }
2729  9 int fromCol = fromSeq.findIndex(fromRange[i]);
2730  9 int mappedCharPos = range[0];
2731   
2732    /*
2733    * walk over the 'from' aligned sequence in forward or reverse
2734    * direction; when a non-gap is found, record the column position
2735    * of the next character of the mapped-to sequence; stop when all
2736    * the characters of the range have been counted
2737    */
2738  70 while (mappedCharPos <= range[1] && fromCol <= fromSeq.getLength()
2739    && fromCol >= 0)
2740    {
2741  61 if (!Comparison.isGap(fromSeq.getCharAt(fromCol - 1)))
2742    {
2743    /*
2744    * mapped from sequence has a character in this column
2745    * record the column position for the mapped to character
2746    */
2747  48 Map<SequenceI, Character> seqsMap = map.get(fromCol);
2748  48 if (seqsMap == null)
2749    {
2750  42 seqsMap = new HashMap<>();
2751  42 map.put(fromCol, seqsMap);
2752    }
2753  48 seqsMap.put(seq, seq.getCharAt(mappedCharPos - toStart));
2754  48 mappedCharPos++;
2755    }
2756  61 fromCol += (forward ? 1 : -1);
2757    }
2758    }
2759    }
2760  6 return true;
2761    }
2762   
2763    // strictly temporary hack until proper criteria for aligning protein to cds
2764    // are in place; this is so Ensembl -> fetch xrefs Uniprot aligns the Uniprot
 
2765  4 toggle public static boolean looksLikeEnsembl(AlignmentI alignment)
2766    {
2767  4 for (SequenceI seq : alignment.getSequences())
2768    {
2769  88 String name = seq.getName();
2770  88 if (!name.startsWith("ENSG") && !name.startsWith("ENST"))
2771    {
2772  0 return false;
2773    }
2774    }
2775  4 return true;
2776    }
2777    }