Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.analysis

File CrossRef.java

 

Coverage histogram

../../img/srcFileCovDistChart5.png
39% of files have more coverage

Code metrics

184
288
16
1
1,110
649
137
0.48
18
16
8.56

Classes

Class Line # Actions
CrossRef 47 288 137
0.46311477 46.3%
 

Contributing tests

This file is covered by 101 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.Alignment;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.DBRefEntry;
27    import jalview.datamodel.DBRefSource;
28    import jalview.datamodel.Mapping;
29    import jalview.datamodel.Sequence;
30    import jalview.datamodel.SequenceFeature;
31    import jalview.datamodel.SequenceI;
32    import jalview.util.DBRefUtils;
33    import jalview.util.MapList;
34    import jalview.ws.SequenceFetcherFactory;
35    import jalview.ws.seqfetcher.ASequenceFetcher;
36   
37    import java.util.ArrayList;
38    import java.util.Iterator;
39    import java.util.List;
40   
41    /**
42    * Functions for cross-referencing sequence databases.
43    *
44    * @author JimP
45    *
46    */
 
47    public class CrossRef
48    {
49    /*
50    * the dataset of the alignment for which we are searching for
51    * cross-references; in some cases we may resolve xrefs by
52    * searching in the dataset
53    */
54    private AlignmentI dataset;
55   
56    /*
57    * the sequences for which we are seeking cross-references
58    */
59    private SequenceI[] fromSeqs;
60   
61    /**
62    * matcher built from dataset
63    */
64    SequenceIdMatcher matcher;
65   
66    /**
67    * sequences found by cross-ref searches to fromSeqs
68    */
69    List<SequenceI> rseqs;
70   
71    /**
72    * Constructor
73    *
74    * @param seqs
75    * the sequences for which we are seeking cross-references
76    * @param ds
77    * the containing alignment dataset (may be searched to resolve
78    * cross-references)
79    */
 
80  432 toggle public CrossRef(SequenceI[] seqs, AlignmentI ds)
81    {
82  432 fromSeqs = seqs;
83  432 dataset = ds.getDataset() == null ? ds : ds.getDataset();
84    }
85   
86    /**
87    * Returns a list of distinct database sources for which sequences have either
88    * <ul>
89    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
90    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
91    * reference from another sequence in the dataset which has a cross-reference
92    * to a direct DBRefEntry on the given sequence</li>
93    * </ul>
94    *
95    * @param dna
96    * - when true, cross-references *from* dna are returned. When false,
97    * cross-references *from* protein are returned
98    * @return the list of distinct database source names (never null)
99    */
 
100  425 toggle public List<String> findXrefSourcesForSequences(boolean dna)
101    {
102  425 List<String> sources = new ArrayList<>();
103  425 for (SequenceI seq : fromSeqs)
104    {
105  4686 if (seq != null)
106    {
107  4686 findXrefSourcesForSequence(seq, dna, sources);
108    }
109    }
110  425 sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in
111    // redundant datasets
112  425 if (dna)
113    {
114  29 sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and
115    // EnsemblGenomes xref option shown
116    // from cdna panel
117  29 sources.remove(DBRefSource.ENSEMBLGENOMES);
118    }
119    // redundant datasets
120  425 return sources;
121    }
122   
123    /**
124    * Returns a list of distinct database sources for which a sequence has either
125    * <ul>
126    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
127    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
128    * reference from another sequence in the dataset which has a cross-reference
129    * to a direct DBRefEntry on the given sequence</li>
130    * </ul>
131    *
132    * @param seq
133    * the sequence whose dbrefs we are searching against
134    * @param fromDna
135    * when true, context is DNA - so sources identifying protein
136    * products will be returned.
137    * @param sources
138    * a list of sources to add matches to
139    */
 
140  4686 toggle void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
141    List<String> sources)
142    {
143    /*
144    * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
145    */
146  4686 List<DBRefEntry> rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
147  4686 addXrefsToSources(rfs, sources);
148  4686 if (dataset != null)
149    {
150    /*
151    * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
152    */
153  4686 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
154  4686 List<SequenceI> foundSeqs = new ArrayList<>();
155   
156    /*
157    * find sequences in the alignment which xref one of these DBRefs
158    * i.e. is xref-ed to a common sequence identifier
159    */
160  4686 searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null);
161   
162    /*
163    * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
164    */
165  4686 for (SequenceI rs : foundSeqs)
166    {
167  394 List<DBRefEntry> xrs = DBRefUtils.selectDbRefs(!fromDna,
168    rs.getDBRefs());
169  394 addXrefsToSources(xrs, sources);
170    }
171    }
172    }
173   
174    /**
175    * Helper method that adds the source identifiers of some cross-references to
176    * a (non-redundant) list of database sources
177    *
178    * @param xrefs
179    * @param sources
180    */
 
181  5080 toggle void addXrefsToSources(List<DBRefEntry> xrefs, List<String> sources)
182    {
183  5080 if (xrefs != null)
184    {
185  486 for (DBRefEntry ref : xrefs)
186    {
187    /*
188    * avoid duplication e.g. ENSEMBL and Ensembl
189    */
190  1812 String source = DBRefUtils.getCanonicalName(ref.getSource());
191  1812 if (!sources.contains(source))
192    {
193  21 sources.add(source);
194    }
195    }
196    }
197    }
198   
199    /**
200    * Attempts to find cross-references from the sequences provided in the
201    * constructor to the given source database. Cross-references may be found
202    * <ul>
203    * <li>in dbrefs on the sequence which hold a mapping to a sequence
204    * <ul>
205    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
206    * <li>populated previously after getting cross-references</li>
207    * </ul>
208    * <li>as other sequences in the alignment which share a dbref identifier with
209    * the sequence</li>
210    * <li>by fetching from the remote database</li>
211    * </ul>
212    * The cross-referenced sequences, and mappings to them, are added to the
213    * alignment dataset.
214    *
215    * @param source
216    * @return cross-referenced sequences (as dataset sequences)
217    */
 
218  6 toggle public Alignment findXrefSequences(String source, boolean fromDna)
219    {
220   
221  6 rseqs = new ArrayList<>();
222  6 AlignedCodonFrame cf = new AlignedCodonFrame();
223  6 matcher = new SequenceIdMatcher(dataset.getSequences());
224   
225  6 for (SequenceI seq : fromSeqs)
226    {
227  48 SequenceI dss = seq;
228  93 while (dss.getDatasetSequence() != null)
229    {
230  45 dss = dss.getDatasetSequence();
231    }
232  48 boolean found = false;
233  48 List<DBRefEntry> xrfs = DBRefUtils.selectDbRefs(!fromDna,
234    dss.getDBRefs());
235    // ENST & ENSP come in to both Protein and nucleotide, so we need to
236    // filter them
237    // out later.
238  48 if ((xrfs == null || xrfs.size() == 0) && dataset != null)
239    {
240    /*
241    * found no suitable dbrefs on sequence - look for sequences in the
242    * alignment which share a dbref with this one
243    */
244  3 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
245    seq.getDBRefs());
246   
247    /*
248    * find sequences (except this one!), of complementary type,
249    * which have a dbref to an accession id for this sequence,
250    * and add them to the results
251    */
252  3 found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
253    }
254  48 if (xrfs == null && !found)
255    {
256    /*
257    * no dbref to source on this sequence or matched
258    * complementary sequence in the dataset
259    */
260  1 continue;
261    }
262  47 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
263    source);
264  47 Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
265    // At this point, if we are retrieving Ensembl, we still don't filter out
266    // ENST when looking for protein crossrefs.
267  93 while (refIterator.hasNext())
268    {
269  46 DBRefEntry xref = refIterator.next();
270  46 found = false;
271    // we're only interested in coding cross-references, not
272    // locus->transcript
273  46 if (xref.hasMap() && xref.getMap().getMap().isTripletMap())
274    {
275  24 SequenceI mappedTo = xref.getMap().getTo();
276  24 if (mappedTo != null)
277    {
278    /*
279    * dbref contains the sequence it maps to; add it to the
280    * results unless we have done so already (could happen if
281    * fetching xrefs for sequences which have xrefs in common)
282    * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
283    */
284  24 found = true;
285    /*
286    * problem: matcher.findIdMatch() is lenient - returns a sequence
287    * with a dbref to the search arg e.g. ENST for ENSP - wrong
288    * but findInDataset() matches ENSP when looking for Uniprot...
289    */
290  24 SequenceI matchInDataset = findInDataset(xref);
291  24 if (matchInDataset != null && xref.getMap().getTo() != null
292    && matchInDataset != xref.getMap().getTo())
293    {
294  0 System.err.println(
295    "Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref."
296    + "Found:" + matchInDataset + "\nExpected:"
297    + xref.getMap().getTo() + "\nFor xref:"
298    + xref);
299    }
300    /*matcher.findIdMatch(mappedTo);*/
301  24 if (matchInDataset != null)
302    {
303  22 if (!rseqs.contains(matchInDataset))
304    {
305  0 rseqs.add(matchInDataset);
306    }
307    // even if rseqs contained matchInDataset - check mappings between
308    // these seqs are added
309    // need to try harder to only add unique mappings
310  22 if (xref.getMap().getMap().isTripletMap()
311    && dataset.getMapping(seq, matchInDataset) == null
312    && cf.getMappingBetween(seq, matchInDataset) == null)
313    {
314    // materialise a mapping for highlighting between these
315    // sequences
316  11 if (fromDna)
317    {
318  11 cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
319    xref.getMap().getMappedFromId());
320    }
321    else
322    {
323  0 cf.addMap(matchInDataset, dss,
324    xref.getMap().getMap().getInverse(),
325    xref.getMap().getMappedFromId());
326    }
327    }
328   
329  22 refIterator.remove();
330  22 continue;
331    }
332    // TODO: need to determine if this should be a deriveSequence
333  2 SequenceI rsq = new Sequence(mappedTo);
334  2 rseqs.add(rsq);
335  2 if (xref.getMap().getMap().isTripletMap())
336    {
337    // get sense of map correct for adding to product alignment.
338  2 if (fromDna)
339    {
340    // map is from dna seq to a protein product
341  2 cf.addMap(dss, rsq, xref.getMap().getMap(),
342    xref.getMap().getMappedFromId());
343    }
344    else
345    {
346    // map should be from protein seq to its coding dna
347  0 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(),
348    xref.getMap().getMappedFromId());
349    }
350    }
351    }
352    }
353   
354  24 if (!found)
355    {
356  22 SequenceI matchedSeq = matcher.findIdMatch(
357    xref.getSource() + "|" + xref.getAccessionId());
358    // if there was a match, check it's at least the right type of
359    // molecule!
360  22 if (matchedSeq != null && matchedSeq.isProtein() == fromDna)
361    {
362  0 if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
363    {
364  0 found = true;
365    }
366    }
367    }
368   
369  24 if (!found)
370    {
371    // do a bit more work - search for sequences with references matching
372    // xrefs on this sequence.
373  22 found = searchDataset(fromDna, dss, xref, rseqs, cf, false, DBRefUtils.SEARCH_MODE_FULL);
374    }
375  24 if (found)
376    {
377  24 refIterator.remove();
378    }
379    }
380   
381    /*
382    * fetch from source database any dbrefs we haven't resolved up to here
383    */
384  47 if (!sourceRefs.isEmpty())
385    {
386  0 retrieveCrossRef(sourceRefs, seq, xrfs, fromDna, cf);
387    }
388    }
389   
390  6 Alignment ral = null;
391  6 if (rseqs.size() > 0)
392    {
393  5 ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
394  5 if (!cf.isEmpty())
395    {
396  2 dataset.addCodonFrame(cf);
397    }
398    }
399  6 return ral;
400    }
401   
 
402  0 toggle private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
403    List<DBRefEntry> xrfs, boolean fromDna, AlignedCodonFrame cf)
404    {
405  0 ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
406  0 SequenceI[] retrieved = null;
407  0 SequenceI dss = seq.getDatasetSequence() == null ? seq
408    : seq.getDatasetSequence();
409    // first filter in case we are retrieving crossrefs that have already been
410    // retrieved. this happens for cases where a database record doesn't yield
411    // protein products for CDS
412  0 removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
413  0 if (sourceRefs.size() == 0)
414    {
415    // no more work to do! We already had all requested sequence records in
416    // the dataset.
417  0 return;
418    }
419  0 try
420    {
421  0 retrieved = sftch.getSequences(sourceRefs, !fromDna);
422    } catch (Exception e)
423    {
424  0 System.err.println(
425    "Problem whilst retrieving cross references for Sequence : "
426    + seq.getName());
427  0 e.printStackTrace();
428    }
429   
430  0 if (retrieved != null)
431    {
432  0 boolean addedXref = false;
433  0 List<SequenceI> newDsSeqs = new ArrayList<>(),
434    doNotAdd = new ArrayList<>();
435   
436  0 for (SequenceI retrievedSequence : retrieved)
437    {
438    // dataset gets contaminated ccwith non-ds sequences. why ??!
439    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
440  0 SequenceI retrievedDss = retrievedSequence
441    .getDatasetSequence() == null ? retrievedSequence
442    : retrievedSequence.getDatasetSequence();
443  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
444    retrievedDss);
445    }
446  0 if (!addedXref)
447    {
448    // try again, after looking for matching IDs
449    // shouldn't need to do this unless the dbref mechanism has broken.
450  0 updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
451  0 for (SequenceI retrievedSequence : retrieved)
452    {
453    // dataset gets contaminated ccwith non-ds sequences. why ??!
454    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
455  0 SequenceI retrievedDss = retrievedSequence
456    .getDatasetSequence() == null ? retrievedSequence
457    : retrievedSequence.getDatasetSequence();
458  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
459    retrievedDss);
460    }
461    }
462  0 for (SequenceI newToSeq : newDsSeqs)
463    {
464  0 if (!doNotAdd.contains(newToSeq)
465    && dataset.findIndex(newToSeq) == -1)
466    {
467  0 dataset.addSequence(newToSeq);
468  0 matcher.add(newToSeq);
469    }
470    }
471    }
472    }
473   
474    /**
475    * Search dataset for sequences with a primary reference contained in
476    * sourceRefs, and remove each such reference from sourceRefs.
477    *
478    * @param sourceRefs
479    * - list of references to filter.
480    * @param fromDna
481    * - type of sequence to search for matching primary reference.
482    */
 
483  0 toggle private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
484    boolean fromDna)
485    {
486  0 List<DBRefEntry> dbrSourceSet = new ArrayList<>(sourceRefs);
487  0 List<SequenceI> dsSeqs = dataset.getSequences();
488  0 for (int ids = 0, nds = dsSeqs.size(); ids < nds; ids++)
489    {
490  0 SequenceI sq = dsSeqs.get(ids);
491  0 boolean dupeFound = false;
492    // !fromDna means we are looking only for nucleotide sequences, not
493    // protein
494  0 if (sq.isProtein() == fromDna)
495    {
496  0 List<DBRefEntry> sqdbrefs = sq.getPrimaryDBRefs();
497  0 for (int idb = 0, ndb = sqdbrefs.size(); idb < ndb; idb++)
498    {
499  0 DBRefEntry dbr = sqdbrefs.get(idb);
500  0 List<DBRefEntry> searchrefs = DBRefUtils.searchRefs(dbrSourceSet, dbr, DBRefUtils.SEARCH_MODE_FULL);
501  0 for (int isr = 0, nsr = searchrefs.size(); isr < nsr; isr++)
502    {
503  0 sourceRefs.remove(searchrefs.get(isr));
504  0 dupeFound = true;
505    }
506    }
507    }
508  0 if (dupeFound)
509    {
510    // rebuild the search array from the filtered sourceRefs list
511  0 dbrSourceSet.clear();
512  0 dbrSourceSet.addAll(sourceRefs);
513    }
514    }
515    }
516   
517    /**
518    * process sequence retrieved via a dbref on source sequence to resolve and
519    * transfer data
520    *
521    * @param cf
522    * @param sourceSequence
523    * @param retrievedSequence
524    * @return true if retrievedSequence was imported
525    */
 
526  0 toggle private boolean importCrossRefSeq(AlignedCodonFrame cf,
527    List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd,
528    SequenceI sourceSequence, SequenceI retrievedSequence)
529    {
530    /**
531    * set when retrievedSequence has been verified as a crossreference for
532    * sourceSequence
533    */
534  0 boolean imported = false;
535  0 List<DBRefEntry> dbr = retrievedSequence.getDBRefs();
536  0 if (dbr != null)
537    {
538  0 for (int ib = 0, nb = dbr.size(); ib < nb; ib++)
539    {
540   
541  0 DBRefEntry dbref = dbr.get(ib);
542  0 SequenceI matched = findInDataset(dbref);
543  0 if (matched == sourceSequence)
544    {
545    // verified retrieved and source sequence cross-reference each other
546  0 imported = true;
547    }
548    // find any entry where we should put in the sequence being
549    // cross-referenced into the map
550  0 Mapping map = dbref.getMap();
551  0 if (map != null)
552    {
553  0 SequenceI ms = map.getTo();
554  0 if (ms != null && map.getMap() != null)
555    {
556  0 if (ms == sourceSequence)
557    {
558    // already called to import once, and most likely this sequence
559    // already imported !
560  0 continue;
561    }
562  0 if (matched == null)
563    {
564    /*
565    * sequence is new to dataset, so save a reference so it can be added.
566    */
567  0 newDsSeqs.add(ms);
568  0 continue;
569    }
570   
571    /*
572    * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one.
573    */
574   
575  0 try
576    {
577    // compare ms with dss and replace with dss in mapping
578    // if map is congruent
579    // TODO findInDataset requires exact sequence match but
580    // 'congruent' test is only for the mapped part
581    // maybe not a problem in practice since only ENA provide a
582    // mapping and it is to the full protein translation of CDS
583    // matcher.findIdMatch(map.getTo());
584    // TODO addendum: if matched is shorter than getTo, this will fail
585    // - when it should really succeed.
586  0 int sf = map.getMap().getToLowest();
587  0 int st = map.getMap().getToHighest();
588  0 SequenceI mappedrg = ms.getSubSequence(sf, st);
589  0 if (mappedrg.getLength() > 0 && ms.getSequenceAsString()
590    .equals(matched.getSequenceAsString()))
591    {
592    /*
593    * sequences were a match,
594    */
595  0 String msg = "Mapping updated from " + ms.getName()
596    + " to retrieved crossreference "
597    + matched.getName();
598  0 System.out.println(msg);
599   
600  0 List<DBRefEntry> toRefs = map.getTo().getDBRefs();
601  0 if (toRefs != null)
602    {
603    /*
604    * transfer database refs
605    */
606  0 for (DBRefEntry ref : toRefs)
607    {
608  0 if (dbref.getSrcAccString()
609    .equals(ref.getSrcAccString()))
610    {
611  0 continue; // avoid overwriting the ref on source sequence
612    }
613  0 matched.addDBRef(ref); // add or update mapping
614    }
615    }
616  0 doNotAdd.add(map.getTo());
617  0 map.setTo(matched);
618   
619    /*
620    * give the reverse reference the inverse mapping
621    * (if it doesn't have one already)
622    */
623  0 setReverseMapping(matched, dbref, cf);
624   
625    /*
626    * copy sequence features as well, avoiding
627    * duplication (e.g. same variation from two
628    * transcripts)
629    */
630  0 List<SequenceFeature> sfs = ms.getFeatures()
631    .getAllFeatures();
632  0 for (SequenceFeature feat : sfs)
633    {
634    /*
635    * make a flyweight feature object which ignores Parent
636    * attribute in equality test; this avoids creating many
637    * otherwise duplicate exon features on genomic sequence
638    */
639  0 SequenceFeature newFeature = new SequenceFeature(feat)
640    {
 
641  0 toggle @Override
642    public boolean equals(Object o)
643    {
644  0 return super.equals(o, true);
645    }
646    };
647  0 matched.addSequenceFeature(newFeature);
648    }
649    }
650  0 cf.addMap(retrievedSequence, map.getTo(), map.getMap());
651    } catch (Exception e)
652    {
653  0 System.err.println(
654    "Exception when consolidating Mapped sequence set...");
655  0 e.printStackTrace(System.err);
656    }
657    }
658    }
659    }
660    }
661  0 if (imported)
662    {
663  0 retrievedSequence.updatePDBIds();
664  0 rseqs.add(retrievedSequence);
665  0 if (dataset.findIndex(retrievedSequence) == -1)
666    {
667  0 dataset.addSequence(retrievedSequence);
668  0 matcher.add(retrievedSequence);
669    }
670    }
671  0 return imported;
672    }
673   
674    /**
675    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
676    * to sequence (if any). This is used after fetching a cross-referenced
677    * sequence, if the fetched sequence has a mapping to the original sequence,
678    * to set the mapping in the original sequence's dbref.
679    *
680    * @param mapFrom
681    * the sequence mapped from
682    * @param dbref
683    * @param mappings
684    */
 
685  0 toggle void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
686    AlignedCodonFrame mappings)
687    {
688  0 SequenceI mapTo = dbref.getMap().getTo();
689  0 if (mapTo == null)
690    {
691  0 return;
692    }
693  0 List<DBRefEntry> dbrefs = mapTo.getDBRefs();
694  0 if (dbrefs == null)
695    {
696  0 return;
697    }
698  0 for (DBRefEntry toRef : dbrefs)
699    {
700  0 if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
701    {
702    /*
703    * found the reverse dbref; update its mapping if null
704    */
705  0 if (toRef.getMap().getMap() == null)
706    {
707  0 MapList inverse = dbref.getMap().getMap().getInverse();
708  0 toRef.getMap().setMap(inverse);
709  0 mappings.addMap(mapTo, mapFrom, inverse);
710    }
711    }
712    }
713    }
714   
715    /**
716    * Returns null or the first sequence in the dataset which is identical to
717    * xref.mapTo, and has a) a primary dbref matching xref, or b) if none found,
718    * a name equal to xrefacc or starting with source|xrefacc
719    *
720    * @param xref
721    * with map and mapped-to sequence
722    * @return
723    */
 
724  24 toggle SequenceI findInDataset(DBRefEntry xref)
725    {
726  24 if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
727    {
728  0 return null;
729    }
730  24 SequenceI mapsTo = xref.getMap().getTo();
731  24 String name = xref.getAccessionId();
732  24 String name2 = xref.getSource() + "|" + name;
733  24 SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo
734    : mapsTo.getDatasetSequence();
735    // first check ds if ds is directly referenced
736  24 if (dataset.findIndex(dss) > -1)
737    {
738  22 return dss;
739    }
740  2 DBRefEntry template = new DBRefEntry(xref.getSource(), null,
741    xref.getAccessionId());
742    /**
743    * remember the first ID match - in case we don't find a match to template
744    */
745  2 SequenceI firstIdMatch = null;
746  2 for (SequenceI seq : dataset.getSequences())
747    {
748    // first check primary refs.
749  2 List<DBRefEntry> match = DBRefUtils.searchRefs(
750    seq.getPrimaryDBRefs(), template, DBRefUtils.SEARCH_MODE_FULL);
751  2 if (match != null && match.size() == 1 && sameSequence(seq, dss))
752    {
753  0 return seq;
754    }
755    /*
756    * clumsy alternative to using SequenceIdMatcher which currently
757    * returns sequences with a dbref to the matched accession id
758    * which we don't want
759    */
760  2 if (firstIdMatch == null && (name.equals(seq.getName())
761    || seq.getName().startsWith(name2)))
762    {
763  0 if (sameSequence(seq, dss))
764    {
765  0 firstIdMatch = seq;
766    }
767    }
768    }
769  2 return firstIdMatch;
770    }
771   
772    /**
773    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
774    * case), else false. This method compares the lengths, then each character in
775    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
776    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
777    *
778    * @param seq1
779    * @param seq2
780    * @return
781    */
782    // TODO move to Sequence / SequenceI
 
783  7 toggle static boolean sameSequence(SequenceI seq1, SequenceI seq2)
784    {
785  7 if (seq1 == seq2)
786    {
787  1 return true;
788    }
789  6 if (seq1 == null || seq2 == null)
790    {
791  2 return false;
792    }
793   
794  4 if (seq1.getLength() != seq2.getLength())
795    {
796  2 return false;
797    }
798  2 int length = seq1.getLength();
799  14 for (int i = 0; i < length; i++)
800    {
801  12 int diff = seq1.getCharAt(i) - seq2.getCharAt(i);
802    /*
803    * same char or differ in case only ('a'-'A' == 32)
804    */
805  12 if (diff != 0 && diff != 32 && diff != -32)
806    {
807  0 return false;
808    }
809    }
810  2 return true;
811    }
812   
813    /**
814    * Updates any empty mappings in the cross-references with one to a compatible
815    * retrieved sequence if found, and adds any new mappings to the
816    * AlignedCodonFrame
817    *
818    * @param mapFrom
819    * @param xrefs
820    * @param retrieved
821    * @param acf
822    */
 
823  0 toggle void updateDbrefMappings(SequenceI mapFrom, List<DBRefEntry> xrefs,
824    SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
825    {
826  0 SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
827  0 for (DBRefEntry xref : xrefs)
828    {
829  0 if (!xref.hasMap())
830    {
831  0 String targetSeqName = xref.getSource() + "|"
832    + xref.getAccessionId();
833  0 SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName);
834  0 if (matches == null)
835    {
836  0 return;
837    }
838  0 for (SequenceI seq : matches)
839    {
840  0 constructMapping(mapFrom, seq, xref, acf, fromDna);
841    }
842    }
843    }
844    }
845   
846    /**
847    * Tries to make a mapping between sequences. If successful, adds the mapping
848    * to the dbref and the mappings collection and answers true, otherwise
849    * answers false. The following methods of making a mapping are tried in
850    * turn:
851    * <ul>
852    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
853    * example, the case after fetching EMBL cross-references for a Uniprot
854    * sequence</li>
855    * <li>else check if the dna translates exactly to the protein (give or take
856    * start and stop codons)</li>
857    * <li>else try to map based on CDS features on the dna sequence</li>
858    * </ul>
859    *
860    * @param mapFrom
861    * @param mapTo
862    * @param xref
863    * @param mappings
864    * @return
865    */
 
866  0 toggle boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
867    DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
868    {
869  0 MapList mapping = null;
870  0 SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom
871    : mapFrom.getDatasetSequence();
872  0 SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo
873    : mapTo.getDatasetSequence();
874    /*
875    * look for a reverse mapping, if found make its inverse.
876    * Note - we do this on dataset sequences only.
877    */
878  0 if (dsmapTo.getDBRefs() != null)
879    {
880  0 for (DBRefEntry dbref : dsmapTo.getDBRefs())
881    {
882  0 String name = dbref.getSource() + "|" + dbref.getAccessionId();
883  0 if (dbref.hasMap() && dsmapFrom.getName().startsWith(name))
884    {
885    /*
886    * looks like we've found a map from 'mapTo' to 'mapFrom'
887    * - invert it to make the mapping the other way
888    */
889  0 MapList reverse = dbref.getMap().getMap().getInverse();
890  0 xref.setMap(new Mapping(dsmapTo, reverse));
891  0 mappings.addMap(mapFrom, dsmapTo, reverse);
892  0 return true;
893    }
894    }
895    }
896   
897  0 if (fromDna)
898    {
899  0 mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
900    }
901    else
902    {
903  0 mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
904  0 if (mapping != null)
905    {
906  0 mapping = mapping.getInverse();
907    }
908    }
909  0 if (mapping == null)
910    {
911  0 return false;
912    }
913  0 xref.setMap(new Mapping(mapTo, mapping));
914   
915    /*
916    * and add a reverse DbRef with the inverse mapping
917    */
918  0 if (mapFrom.getDatasetSequence() != null && false)
919    // && mapFrom.getDatasetSequence().getSourceDBRef() != null)
920    {
921    // possible need to search primary references... except, why doesn't xref
922    // == getSourceDBRef ??
923    // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
924    // .getSourceDBRef());
925    // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
926    // .getInverse()));
927    // mapTo.addDBRef(dbref);
928    }
929   
930  0 if (fromDna)
931    {
932    // AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
933  0 mappings.addMap(mapFrom, mapTo, mapping);
934    }
935    else
936    {
937  0 mappings.addMap(mapTo, mapFrom, mapping.getInverse());
938    }
939   
940  0 return true;
941    }
942   
943    /**
944    * Finds references to lrfs in the cross-reference set of each sequence in
945    * dataset (that is not equal to sequenceI). Identifies matching DBRefEntry
946    * based on source and accession string only - Map and Version are ignored.
947    *
948    * @param fromDna
949    * - true if context was searching from Dna sequences, false if
950    * context was searching from Protein sequences
951    * @param sequenceI
952    * @param lrfs
953    * @param foundSeqs
954    * @return true if matches were found.
955    */
 
956  4689 toggle private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
957    List<DBRefEntry> lrfs, List<SequenceI> foundSeqs,
958    AlignedCodonFrame cf)
959    {
960  4689 boolean found = false;
961  4689 if (lrfs == null)
962    {
963  2968 return false;
964    }
965  3797 for (int i = 0, n = lrfs.size(); i < n; i++)
966    {
967    // DBRefEntry xref = new DBRefEntry(lrfs.get(i));
968    // // add in wildcards
969    // xref.setVersion(null);
970    // xref.setMap(null);
971  2076 found |= searchDataset(fromDna, sequenceI, lrfs.get(i), foundSeqs, cf,
972    false, DBRefUtils.SEARCH_MODE_NO_MAP_NO_VERSION);
973    }
974  1721 return found;
975    }
976   
977    /**
978    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
979    * associated sequence to rseqs
980    *
981    * @param fromDna
982    * true if context was searching for refs *from* dna sequence, false
983    * if context was searching for refs *from* protein sequence
984    * @param fromSeq
985    * a sequence to ignore (start point of search)
986    * @param xrf
987    * a cross-reference to try to match
988    * @param foundSeqs
989    * result list to add to
990    * @param mappings
991    * a set of sequence mappings to add to
992    * @param direct
993    * - indicates the type of relationship between returned sequences,
994    * xrf, and sequenceI that is required.
995    * <ul>
996    * <li>direct implies xrf is a primary reference for sequenceI AND
997    * the sequences to be located (eg a uniprot ID for a protein
998    * sequence, and a uniprot ref on a transcript sequence).</li>
999    * <li>indirect means xrf is a cross reference with respect to
1000    * sequenceI or all the returned sequences (eg a genomic reference
1001    * associated with a locus and one or more transcripts)</li>
1002    * </ul>
1003    * @param mode SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional
1004    * @return true if relationship found and sequence added.
1005    */
 
1006  2101 toggle boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
1007    List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
1008    boolean direct, int mode)
1009    {
1010  2101 boolean found = false;
1011  2101 if (dataset == null)
1012    {
1013  0 return false;
1014    }
1015  2101 if (dataset.getSequences() == null)
1016    {
1017  0 System.err.println("Empty dataset sequence set - NO VECTOR");
1018  0 return false;
1019    }
1020  2101 List<SequenceI> ds = dataset.getSequences();
1021  2101 synchronized (ds)
1022    {
1023  2101 for (SequenceI nxt : ds)
1024    {
1025  46848 if (nxt != null)
1026    {
1027  46848 if (nxt.getDatasetSequence() != null)
1028    {
1029  0 System.err.println(
1030    "Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! ("
1031    + nxt.getDisplayId(true) + " has ds reference "
1032    + nxt.getDatasetSequence().getDisplayId(true)
1033    + ")");
1034    }
1035  46848 if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence())
1036    {
1037  2098 continue;
1038    }
1039    /*
1040    * only look at same molecule type if 'direct', or
1041    * complementary type if !direct
1042    */
1043    {
1044  44750 boolean isDna = !nxt.isProtein();
1045  44750 if (direct ? (isDna != fromDna) : (isDna == fromDna))
1046    {
1047    // skip this sequence because it is wrong molecule type
1048  38916 continue;
1049    }
1050    }
1051   
1052    // look for direct or indirect references in common
1053  5834 List<DBRefEntry> poss = nxt.getDBRefs();
1054  5834 List<DBRefEntry> cands = null;
1055   
1056    // todo: indirect specifies we select either direct references to nxt
1057    // that match xrf which is indirect to sequenceI, or indirect
1058    // references to nxt that match xrf which is direct to sequenceI
1059  5834 cands = DBRefUtils.searchRefs(poss, xrf, mode);
1060    // else
1061    // {
1062    // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);
1063    // cands = DBRefUtils.searchRefs(poss, xrf);
1064    // }
1065  5834 if (!cands.isEmpty())
1066    {
1067  1442 if (foundSeqs.contains(nxt))
1068    {
1069  1022 continue;
1070    }
1071  420 found = true;
1072  420 foundSeqs.add(nxt);
1073  420 if (mappings != null && !direct)
1074    {
1075    /*
1076    * if the matched sequence has mapped dbrefs to
1077    * protein product / cdna, add equivalent mappings to
1078    * our source sequence
1079    */
1080  26 for (DBRefEntry candidate : cands)
1081    {
1082  26 Mapping mapping = candidate.getMap();
1083  26 if (mapping != null)
1084    {
1085  1 MapList map = mapping.getMap();
1086  1 if (mapping.getTo() != null
1087    && map.getFromRatio() != map.getToRatio())
1088    {
1089    /*
1090    * add a mapping, as from dna to peptide sequence
1091    */
1092  1 if (map.getFromRatio() == 3)
1093    {
1094  1 mappings.addMap(nxt, fromSeq, map);
1095    }
1096    else
1097    {
1098  0 mappings.addMap(nxt, fromSeq, map.getInverse());
1099    }
1100    }
1101    }
1102    }
1103    }
1104    }
1105    }
1106    }
1107    }
1108  2101 return found;
1109    }
1110    }