Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.analysis

File CrossRef.java

 

Coverage histogram

../../img/srcFileCovDistChart5.png
40% of files have more coverage

Code metrics

176
285
16
1
1,101
646
133
0.47
17.81
16
8.31

Classes

Class Line # Actions
CrossRef 47 285 133 275
0.423480142.3%
 

Contributing tests

This file is covered by 83 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.Alignment;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.DBRefEntry;
27    import jalview.datamodel.DBRefSource;
28    import jalview.datamodel.Mapping;
29    import jalview.datamodel.Sequence;
30    import jalview.datamodel.SequenceFeature;
31    import jalview.datamodel.SequenceI;
32    import jalview.util.DBRefUtils;
33    import jalview.util.MapList;
34    import jalview.ws.SequenceFetcherFactory;
35    import jalview.ws.seqfetcher.ASequenceFetcher;
36   
37    import java.util.ArrayList;
38    import java.util.Iterator;
39    import java.util.List;
40   
41    /**
42    * Functions for cross-referencing sequence databases.
43    *
44    * @author JimP
45    *
46    */
 
47    public class CrossRef
48    {
49    /*
50    * the dataset of the alignment for which we are searching for
51    * cross-references; in some cases we may resolve xrefs by
52    * searching in the dataset
53    */
54    private AlignmentI dataset;
55   
56    /*
57    * the sequences for which we are seeking cross-references
58    */
59    private SequenceI[] fromSeqs;
60   
61    /**
62    * matcher built from dataset
63    */
64    SequenceIdMatcher matcher;
65   
66    /**
67    * sequences found by cross-ref searches to fromSeqs
68    */
69    List<SequenceI> rseqs;
70   
71    /**
72    * Constructor
73    *
74    * @param seqs
75    * the sequences for which we are seeking cross-references
76    * @param ds
77    * the containing alignment dataset (may be searched to resolve
78    * cross-references)
79    */
 
80  387 toggle public CrossRef(SequenceI[] seqs, AlignmentI ds)
81    {
82  387 fromSeqs = seqs;
83  387 dataset = ds.getDataset() == null ? ds : ds.getDataset();
84    }
85   
86    /**
87    * Returns a list of distinct database sources for which sequences have either
88    * <ul>
89    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
90    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
91    * reference from another sequence in the dataset which has a cross-reference
92    * to a direct DBRefEntry on the given sequence</li>
93    * </ul>
94    *
95    * @param dna
96    * - when true, cross-references *from* dna returned. When false,
97    * cross-references *from* protein are returned
98    * @return
99    */
 
100  379 toggle public List<String> findXrefSourcesForSequences(boolean dna)
101    {
102  379 List<String> sources = new ArrayList<>();
103  379 for (SequenceI seq : fromSeqs)
104    {
105  4232 if (seq != null)
106    {
107  4232 findXrefSourcesForSequence(seq, dna, sources);
108    }
109    }
110  379 sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in
111    // redundant datasets
112  379 if (dna)
113    {
114  16 sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and
115    // EnsemblGenomes xref option shown
116    // from cdna panel
117  16 sources.remove(DBRefSource.ENSEMBLGENOMES);
118    }
119    // redundant datasets
120  379 return sources;
121    }
122   
123    /**
124    * Returns a list of distinct database sources for which a sequence has either
125    * <ul>
126    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
127    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
128    * reference from another sequence in the dataset which has a cross-reference
129    * to a direct DBRefEntry on the given sequence</li>
130    * </ul>
131    *
132    * @param seq
133    * the sequence whose dbrefs we are searching against
134    * @param fromDna
135    * when true, context is DNA - so sources identifying protein
136    * products will be returned.
137    * @param sources
138    * a list of sources to add matches to
139    */
 
140  4232 toggle void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
141    List<String> sources)
142    {
143    /*
144    * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
145    */
146  4232 DBRefEntry[] rfs = DBRefUtils.selectDbRefs(!fromDna, seq.getDBRefs());
147  4232 addXrefsToSources(rfs, sources);
148  4232 if (dataset != null)
149    {
150    /*
151    * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
152    */
153  4232 DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna, seq.getDBRefs());
154  4232 List<SequenceI> foundSeqs = new ArrayList<>();
155   
156    /*
157    * find sequences in the alignment which xref one of these DBRefs
158    * i.e. is xref-ed to a common sequence identifier
159    */
160  4232 searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null);
161   
162    /*
163    * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
164    */
165  4232 for (SequenceI rs : foundSeqs)
166    {
167  0 DBRefEntry[] xrs = DBRefUtils.selectDbRefs(!fromDna,
168    rs.getDBRefs());
169  0 addXrefsToSources(xrs, sources);
170    }
171    }
172    }
173   
174    /**
175    * Helper method that adds the source identifiers of some cross-references to
176    * a (non-redundant) list of database sources
177    *
178    * @param xrefs
179    * @param sources
180    */
 
181  4232 toggle void addXrefsToSources(DBRefEntry[] xrefs, List<String> sources)
182    {
183  4232 if (xrefs != null)
184    {
185  3 for (DBRefEntry ref : xrefs)
186    {
187    /*
188    * avoid duplication e.g. ENSEMBL and Ensembl
189    */
190  9 String source = DBRefUtils.getCanonicalName(ref.getSource());
191  9 if (!sources.contains(source))
192    {
193  8 sources.add(source);
194    }
195    }
196    }
197    }
198   
199    /**
200    * Attempts to find cross-references from the sequences provided in the
201    * constructor to the given source database. Cross-references may be found
202    * <ul>
203    * <li>in dbrefs on the sequence which hold a mapping to a sequence
204    * <ul>
205    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
206    * <li>populated previously after getting cross-references</li>
207    * </ul>
208    * <li>as other sequences in the alignment which share a dbref identifier with
209    * the sequence</li>
210    * <li>by fetching from the remote database</li>
211    * </ul>
212    * The cross-referenced sequences, and mappings to them, are added to the
213    * alignment dataset.
214    *
215    * @param source
216    * @return cross-referenced sequences (as dataset sequences)
217    */
 
218  7 toggle public Alignment findXrefSequences(String source, boolean fromDna)
219    {
220   
221  7 rseqs = new ArrayList<>();
222  7 AlignedCodonFrame cf = new AlignedCodonFrame();
223  7 matcher = new SequenceIdMatcher(dataset.getSequences());
224   
225  7 for (SequenceI seq : fromSeqs)
226    {
227  10 SequenceI dss = seq;
228  11 while (dss.getDatasetSequence() != null)
229    {
230  1 dss = dss.getDatasetSequence();
231    }
232  10 boolean found = false;
233  10 DBRefEntry[] xrfs = DBRefUtils.selectDbRefs(!fromDna,
234    dss.getDBRefs());
235    // ENST & ENSP comes in to both Protein and nucleotide, so we need to
236    // filter them
237    // out later.
238  10 if ((xrfs == null || xrfs.length == 0) && dataset != null)
239    {
240    /*
241    * found no suitable dbrefs on sequence - look for sequences in the
242    * alignment which share a dbref with this one
243    */
244  3 DBRefEntry[] lrfs = DBRefUtils.selectDbRefs(fromDna,
245    seq.getDBRefs());
246   
247    /*
248    * find sequences (except this one!), of complementary type,
249    * which have a dbref to an accession id for this sequence,
250    * and add them to the results
251    */
252  3 found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
253    }
254  10 if (xrfs == null && !found)
255    {
256    /*
257    * no dbref to source on this sequence or matched
258    * complementary sequence in the dataset
259    */
260  1 continue;
261    }
262  9 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
263    source);
264  9 Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
265    // At this point, if we are retrieving Ensembl, we still don't filter out
266    // ENST when looking for protein crossrefs.
267  24 while (refIterator.hasNext())
268    {
269  15 DBRefEntry xref = refIterator.next();
270  15 found = false;
271    // we're only interested in coding cross-references, not
272    // locus->transcript
273  2 if (xref.hasMap() && xref.getMap().getMap().isTripletMap())
274    {
275  2 SequenceI mappedTo = xref.getMap().getTo();
276  2 if (mappedTo != null)
277    {
278    /*
279    * dbref contains the sequence it maps to; add it to the
280    * results unless we have done so already (could happen if
281    * fetching xrefs for sequences which have xrefs in common)
282    * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
283    */
284  2 found = true;
285    /*
286    * problem: matcher.findIdMatch() is lenient - returns a sequence
287    * with a dbref to the search arg e.g. ENST for ENSP - wrong
288    * but findInDataset() matches ENSP when looking for Uniprot...
289    */
290  2 SequenceI matchInDataset = findInDataset(xref);
291  2 if (matchInDataset != null && xref.getMap().getTo() != null
292    && matchInDataset != xref.getMap().getTo())
293    {
294  0 System.err.println(
295    "Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref."
296    + "Found:" + matchInDataset + "\nExpected:"
297    + xref.getMap().getTo() + "\nFor xref:"
298    + xref);
299    }
300    /*matcher.findIdMatch(mappedTo);*/
301  2 if (matchInDataset != null)
302    {
303  0 if (!rseqs.contains(matchInDataset))
304    {
305  0 rseqs.add(matchInDataset);
306    }
307    // even if rseqs contained matchInDataset - check mappings between
308    // these seqs are added
309    // need to try harder to only add unique mappings
310  0 if (xref.getMap().getMap().isTripletMap()
311    && dataset.getMapping(seq, matchInDataset) == null
312    && cf.getMappingBetween(seq, matchInDataset) == null)
313    {
314    // materialise a mapping for highlighting between these
315    // sequences
316  0 if (fromDna)
317    {
318  0 cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
319    xref.getMap().getMappedFromId());
320    }
321    else
322    {
323  0 cf.addMap(matchInDataset, dss,
324    xref.getMap().getMap().getInverse(),
325    xref.getMap().getMappedFromId());
326    }
327    }
328   
329  0 refIterator.remove();
330  0 continue;
331    }
332    // TODO: need to determine if this should be a deriveSequence
333  2 SequenceI rsq = new Sequence(mappedTo);
334  2 rseqs.add(rsq);
335  2 if (xref.getMap().getMap().isTripletMap())
336    {
337    // get sense of map correct for adding to product alignment.
338  2 if (fromDna)
339    {
340    // map is from dna seq to a protein product
341  2 cf.addMap(dss, rsq, xref.getMap().getMap(),
342    xref.getMap().getMappedFromId());
343    }
344    else
345    {
346    // map should be from protein seq to its coding dna
347  0 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(),
348    xref.getMap().getMappedFromId());
349    }
350    }
351    }
352    }
353   
354  2 if (!found)
355    {
356  0 SequenceI matchedSeq = matcher.findIdMatch(
357    xref.getSource() + "|" + xref.getAccessionId());
358    // if there was a match, check it's at least the right type of
359    // molecule!
360  0 if (matchedSeq != null && matchedSeq.isProtein() == fromDna)
361    {
362  0 if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
363    {
364  0 found = true;
365    }
366    }
367    }
368   
369  4 if (!found)
370    {
371    // do a bit more work - search for sequences with references matching
372    // xrefs on this sequence.
373  0 found = searchDataset(fromDna, dss, xref, rseqs, cf, false);
374    }
375  4 if (found)
376    {
377  4 refIterator.remove();
378    }
379    }
380   
381    /*
382    * fetch from source database any dbrefs we haven't resolved up to here
383    */
384  3 if (!sourceRefs.isEmpty())
385    {
386  0 retrieveCrossRef(sourceRefs, seq, xrfs, fromDna, cf);
387    }
388    }
389   
390  7 Alignment ral = null;
391  7 if (rseqs.size() > 0)
392    {
393  4 ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
394  4 if (!cf.isEmpty())
395    {
396  2 dataset.addCodonFrame(cf);
397    }
398    }
399  7 return ral;
400    }
401   
 
402  0 toggle private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
403    DBRefEntry[] xrfs, boolean fromDna, AlignedCodonFrame cf)
404    {
405  0 ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
406  0 SequenceI[] retrieved = null;
407  0 SequenceI dss = seq.getDatasetSequence() == null ? seq
408    : seq.getDatasetSequence();
409    // first filter in case we are retrieving crossrefs that have already been
410    // retrieved. this happens for cases where a database record doesn't yield
411    // protein products for CDS
412  0 removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
413  0 if (sourceRefs.size() == 0)
414    {
415    // no more work to do! We already had all requested sequence records in
416    // the dataset.
417  0 return;
418    }
419  0 try
420    {
421  0 retrieved = sftch.getSequences(sourceRefs, !fromDna);
422    } catch (Exception e)
423    {
424  0 System.err.println(
425    "Problem whilst retrieving cross references for Sequence : "
426    + seq.getName());
427  0 e.printStackTrace();
428    }
429   
430  0 if (retrieved != null)
431    {
432  0 boolean addedXref = false;
433  0 List<SequenceI> newDsSeqs = new ArrayList<>(),
434    doNotAdd = new ArrayList<>();
435   
436  0 for (SequenceI retrievedSequence : retrieved)
437    {
438    // dataset gets contaminated ccwith non-ds sequences. why ??!
439    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
440  0 SequenceI retrievedDss = retrievedSequence
441    .getDatasetSequence() == null ? retrievedSequence
442    : retrievedSequence.getDatasetSequence();
443  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
444    retrievedDss);
445    }
446  0 if (!addedXref)
447    {
448    // try again, after looking for matching IDs
449    // shouldn't need to do this unless the dbref mechanism has broken.
450  0 updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
451  0 for (SequenceI retrievedSequence : retrieved)
452    {
453    // dataset gets contaminated ccwith non-ds sequences. why ??!
454    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
455  0 SequenceI retrievedDss = retrievedSequence
456    .getDatasetSequence() == null ? retrievedSequence
457    : retrievedSequence.getDatasetSequence();
458  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
459    retrievedDss);
460    }
461    }
462  0 for (SequenceI newToSeq : newDsSeqs)
463    {
464  0 if (!doNotAdd.contains(newToSeq)
465    && dataset.findIndex(newToSeq) == -1)
466    {
467  0 dataset.addSequence(newToSeq);
468  0 matcher.add(newToSeq);
469    }
470    }
471    }
472    }
473   
474    /**
475    * Search dataset for sequences with a primary reference contained in
476    * sourceRefs.
477    *
478    * @param sourceRefs
479    * - list of references to filter.
480    * @param fromDna
481    * - type of sequence to search for matching primary reference.
482    */
 
483  0 toggle private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
484    boolean fromDna)
485    {
486  0 DBRefEntry[] dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
487  0 for (SequenceI sq : dataset.getSequences())
488    {
489  0 boolean dupeFound = false;
490    // !fromDna means we are looking only for nucleotide sequences, not
491    // protein
492  0 if (sq.isProtein() == fromDna)
493    {
494  0 for (DBRefEntry dbr : sq.getPrimaryDBRefs())
495    {
496  0 for (DBRefEntry found : DBRefUtils.searchRefs(dbrSourceSet, dbr))
497    {
498  0 sourceRefs.remove(found);
499  0 dupeFound = true;
500    }
501    }
502    }
503  0 if (dupeFound)
504    {
505    // rebuild the search array from the filtered sourceRefs list
506  0 dbrSourceSet = sourceRefs.toArray(new DBRefEntry[0]);
507    }
508    }
509    }
510   
511    /**
512    * process sequence retrieved via a dbref on source sequence to resolve and
513    * transfer data
514    *
515    * @param cf
516    * @param sourceSequence
517    * @param retrievedSequence
518    * @return true if retrieveSequence was imported
519    */
 
520  0 toggle private boolean importCrossRefSeq(AlignedCodonFrame cf,
521    List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd,
522    SequenceI sourceSequence, SequenceI retrievedSequence)
523    {
524    /**
525    * set when retrievedSequence has been verified as a crossreference for
526    * sourceSequence
527    */
528  0 boolean imported = false;
529  0 DBRefEntry[] dbr = retrievedSequence.getDBRefs();
530  0 if (dbr != null)
531    {
532  0 for (DBRefEntry dbref : dbr)
533    {
534  0 SequenceI matched = findInDataset(dbref);
535  0 if (matched == sourceSequence)
536    {
537    // verified retrieved and source sequence cross-reference each other
538  0 imported = true;
539    }
540    // find any entry where we should put in the sequence being
541    // cross-referenced into the map
542  0 Mapping map = dbref.getMap();
543  0 if (map != null)
544    {
545  0 if (map.getTo() != null && map.getMap() != null)
546    {
547  0 if (map.getTo() == sourceSequence)
548    {
549    // already called to import once, and most likely this sequence
550    // already imported !
551  0 continue;
552    }
553  0 if (matched == null)
554    {
555    /*
556    * sequence is new to dataset, so save a reference so it can be added.
557    */
558  0 newDsSeqs.add(map.getTo());
559  0 continue;
560    }
561   
562    /*
563    * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one.
564    */
565   
566  0 try
567    {
568    // compare ms with dss and replace with dss in mapping
569    // if map is congruent
570  0 SequenceI ms = map.getTo();
571    // TODO findInDataset requires exact sequence match but
572    // 'congruent' test is only for the mapped part
573    // maybe not a problem in practice since only ENA provide a
574    // mapping and it is to the full protein translation of CDS
575    // matcher.findIdMatch(map.getTo());
576    // TODO addendum: if matched is shorter than getTo, this will fail
577    // - when it should really succeed.
578  0 int sf = map.getMap().getToLowest();
579  0 int st = map.getMap().getToHighest();
580  0 SequenceI mappedrg = ms.getSubSequence(sf, st);
581  0 if (mappedrg.getLength() > 0 && ms.getSequenceAsString()
582    .equals(matched.getSequenceAsString()))
583    {
584    /*
585    * sequences were a match,
586    */
587  0 String msg = "Mapping updated from " + ms.getName()
588    + " to retrieved crossreference "
589    + matched.getName();
590  0 System.out.println(msg);
591   
592  0 DBRefEntry[] toRefs = map.getTo().getDBRefs();
593  0 if (toRefs != null)
594    {
595    /*
596    * transfer database refs
597    */
598  0 for (DBRefEntry ref : toRefs)
599    {
600  0 if (dbref.getSrcAccString()
601    .equals(ref.getSrcAccString()))
602    {
603  0 continue; // avoid overwriting the ref on source sequence
604    }
605  0 matched.addDBRef(ref); // add or update mapping
606    }
607    }
608  0 doNotAdd.add(map.getTo());
609  0 map.setTo(matched);
610   
611    /*
612    * give the reverse reference the inverse mapping
613    * (if it doesn't have one already)
614    */
615  0 setReverseMapping(matched, dbref, cf);
616   
617    /*
618    * copy sequence features as well, avoiding
619    * duplication (e.g. same variation from two
620    * transcripts)
621    */
622  0 List<SequenceFeature> sfs = ms.getFeatures()
623    .getAllFeatures();
624  0 for (SequenceFeature feat : sfs)
625    {
626    /*
627    * make a flyweight feature object which ignores Parent
628    * attribute in equality test; this avoids creating many
629    * otherwise duplicate exon features on genomic sequence
630    */
631  0 SequenceFeature newFeature = new SequenceFeature(feat)
632    {
 
633  0 toggle @Override
634    public boolean equals(Object o)
635    {
636  0 return super.equals(o, true);
637    }
638    };
639  0 matched.addSequenceFeature(newFeature);
640    }
641    }
642  0 cf.addMap(retrievedSequence, map.getTo(), map.getMap());
643    } catch (Exception e)
644    {
645  0 System.err.println(
646    "Exception when consolidating Mapped sequence set...");
647  0 e.printStackTrace(System.err);
648    }
649    }
650    }
651    }
652    }
653  0 if (imported)
654    {
655  0 retrievedSequence.updatePDBIds();
656  0 rseqs.add(retrievedSequence);
657  0 if (dataset.findIndex(retrievedSequence) == -1)
658    {
659  0 dataset.addSequence(retrievedSequence);
660  0 matcher.add(retrievedSequence);
661    }
662    }
663  0 return imported;
664    }
665   
666    /**
667    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
668    * to sequence (if any). This is used after fetching a cross-referenced
669    * sequence, if the fetched sequence has a mapping to the original sequence,
670    * to set the mapping in the original sequence's dbref.
671    *
672    * @param mapFrom
673    * the sequence mapped from
674    * @param dbref
675    * @param mappings
676    */
 
677  0 toggle void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
678    AlignedCodonFrame mappings)
679    {
680  0 SequenceI mapTo = dbref.getMap().getTo();
681  0 if (mapTo == null)
682    {
683  0 return;
684    }
685  0 DBRefEntry[] dbrefs = mapTo.getDBRefs();
686  0 if (dbrefs == null)
687    {
688  0 return;
689    }
690  0 for (DBRefEntry toRef : dbrefs)
691    {
692  0 if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
693    {
694    /*
695    * found the reverse dbref; update its mapping if null
696    */
697  0 if (toRef.getMap().getMap() == null)
698    {
699  0 MapList inverse = dbref.getMap().getMap().getInverse();
700  0 toRef.getMap().setMap(inverse);
701  0 mappings.addMap(mapTo, mapFrom, inverse);
702    }
703    }
704    }
705    }
706   
707    /**
708    * Returns null or the first sequence in the dataset which is identical to
709    * xref.mapTo, and has a) a primary dbref matching xref, or if none found, the
710    * first one with an ID source|xrefacc
711    *
712    * @param xref
713    * with map and mapped-to sequence
714    * @return
715    */
 
716  26 toggle SequenceI findInDataset(DBRefEntry xref)
717    {
718  10 if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
719    {
720  0 return null;
721    }
722  10 SequenceI mapsTo = xref.getMap().getTo();
723  10 String name = xref.getAccessionId();
724  10 String name2 = xref.getSource() + "|" + name;
725  2 SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo
726    : mapsTo.getDatasetSequence();
727    // first check ds if ds is directly referenced
728  10 if (dataset.findIndex(dss) > -1)
729    {
730  0 return dss;
731    }
732  10 DBRefEntry template = new DBRefEntry(xref.getSource(), null,
733    xref.getAccessionId());
734    /**
735    * remember the first ID match - in case we don't find a match to template
736    */
737  10 SequenceI firstIdMatch = null;
738  10 for (SequenceI seq : dataset.getSequences())
739    {
740    // first check primary refs.
741  32 List<DBRefEntry> match = DBRefUtils.searchRefs(
742    seq.getPrimaryDBRefs().toArray(new DBRefEntry[0]), template);
743  32 if (match != null && match.size() == 1 && sameSequence(seq, dss))
744    {
745  0 return seq;
746    }
747    /*
748    * clumsy alternative to using SequenceIdMatcher which currently
749    * returns sequences with a dbref to the matched accession id
750    * which we don't want
751    */
752  26 if (firstIdMatch == null && (name.equals(seq.getName())
753    || seq.getName().startsWith(name2)))
754    {
755  0 if (sameSequence(seq, dss))
756    {
757  0 firstIdMatch = seq;
758    }
759    }
760    }
761  10 return firstIdMatch;
762    }
763   
764    /**
765    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
766    * case), else false. This method compares the lengths, then each character in
767    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
768    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
769    *
770    * @param seq1
771    * @param seq2
772    * @return
773    */
774    // TODO move to Sequence / SequenceI
 
775  13 toggle static boolean sameSequence(SequenceI seq1, SequenceI seq2)
776    {
777  13 if (seq1 == seq2)
778    {
779  1 return true;
780    }
781  12 if (seq1 == null || seq2 == null)
782    {
783  2 return false;
784    }
785   
786  10 if (seq1.getLength() != seq2.getLength())
787    {
788  2 return false;
789    }
790  8 int length = seq1.getLength();
791  44 for (int i = 0; i < length; i++)
792    {
793  36 int diff = seq1.getCharAt(i) - seq2.getCharAt(i);
794    /*
795    * same char or differ in case only ('a'-'A' == 32)
796    */
797  36 if (diff != 0 && diff != 32 && diff != -32)
798    {
799  0 return false;
800    }
801    }
802  8 return true;
803    }
804   
805    /**
806    * Updates any empty mappings in the cross-references with one to a compatible
807    * retrieved sequence if found, and adds any new mappings to the
808    * AlignedCodonFrame
809    *
810    * @param mapFrom
811    * @param xrefs
812    * @param retrieved
813    * @param acf
814    */
 
815  0 toggle void updateDbrefMappings(SequenceI mapFrom, DBRefEntry[] xrefs,
816    SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
817    {
818  0 SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
819  0 for (DBRefEntry xref : xrefs)
820    {
821  0 if (!xref.hasMap())
822    {
823  0 String targetSeqName = xref.getSource() + "|"
824    + xref.getAccessionId();
825  0 SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName);
826  0 if (matches == null)
827    {
828  0 return;
829    }
830  0 for (SequenceI seq : matches)
831    {
832  0 constructMapping(mapFrom, seq, xref, acf, fromDna);
833    }
834    }
835    }
836    }
837   
838    /**
839    * Tries to make a mapping between sequences. If successful, adds the mapping
840    * to the dbref and the mappings collection and answers true, otherwise
841    * answers false. The following methods of making are mapping are tried in
842    * turn:
843    * <ul>
844    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
845    * example, the case after fetching EMBL cross-references for a Uniprot
846    * sequence</li>
847    * <li>else check if the dna translates exactly to the protein (give or take
848    * start and stop codons></li>
849    * <li>else try to map based on CDS features on the dna sequence</li>
850    * </ul>
851    *
852    * @param mapFrom
853    * @param mapTo
854    * @param xref
855    * @param mappings
856    * @return
857    */
 
858  0 toggle boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
859    DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
860    {
861  0 MapList mapping = null;
862  0 SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom
863    : mapFrom.getDatasetSequence();
864  0 SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo
865    : mapTo.getDatasetSequence();
866    /*
867    * look for a reverse mapping, if found make its inverse.
868    * Note - we do this on dataset sequences only.
869    */
870  0 if (dsmapTo.getDBRefs() != null)
871    {
872  0 for (DBRefEntry dbref : dsmapTo.getDBRefs())
873    {
874  0 String name = dbref.getSource() + "|" + dbref.getAccessionId();
875  0 if (dbref.hasMap() && dsmapFrom.getName().startsWith(name))
876    {
877    /*
878    * looks like we've found a map from 'mapTo' to 'mapFrom'
879    * - invert it to make the mapping the other way
880    */
881  0 MapList reverse = dbref.getMap().getMap().getInverse();
882  0 xref.setMap(new Mapping(dsmapTo, reverse));
883  0 mappings.addMap(mapFrom, dsmapTo, reverse);
884  0 return true;
885    }
886    }
887    }
888   
889  0 if (fromDna)
890    {
891  0 mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
892    }
893    else
894    {
895  0 mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
896  0 if (mapping != null)
897    {
898  0 mapping = mapping.getInverse();
899    }
900    }
901  0 if (mapping == null)
902    {
903  0 return false;
904    }
905  0 xref.setMap(new Mapping(mapTo, mapping));
906   
907    /*
908    * and add a reverse DbRef with the inverse mapping
909    */
910  0 if (mapFrom.getDatasetSequence() != null && false)
911    // && mapFrom.getDatasetSequence().getSourceDBRef() != null)
912    {
913    // possible need to search primary references... except, why doesn't xref
914    // == getSourceDBRef ??
915    // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
916    // .getSourceDBRef());
917    // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
918    // .getInverse()));
919    // mapTo.addDBRef(dbref);
920    }
921   
922  0 if (fromDna)
923    {
924  0 AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
925  0 mappings.addMap(mapFrom, mapTo, mapping);
926    }
927    else
928    {
929  0 mappings.addMap(mapTo, mapFrom, mapping.getInverse());
930    }
931   
932  0 return true;
933    }
934   
935    /**
936    * find references to lrfs in the cross-reference set of each sequence in
937    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
938    * based on source and accession string only - Map and Version are nulled.
939    *
940    * @param fromDna
941    * - true if context was searching from Dna sequences, false if
942    * context was searching from Protein sequences
943    * @param sequenceI
944    * @param lrfs
945    * @param foundSeqs
946    * @return true if matches were found.
947    */
 
948  4235 toggle private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
949    DBRefEntry[] lrfs, List<SequenceI> foundSeqs,
950    AlignedCodonFrame cf)
951    {
952  4235 boolean found = false;
953  4235 if (lrfs == null)
954    {
955  1735 return false;
956    }
957  5001 for (int i = 0; i < lrfs.length; i++)
958    {
959  2501 DBRefEntry xref = new DBRefEntry(lrfs[i]);
960    // add in wildcards
961  2501 xref.setVersion(null);
962  2501 xref.setMap(null);
963  2501 found |= searchDataset(fromDna, sequenceI, xref, foundSeqs, cf,
964    false);
965    }
966  2500 return found;
967    }
968   
969    /**
970    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
971    * associated sequence to rseqs
972    *
973    * @param fromDna
974    * true if context was searching for refs *from* dna sequence, false
975    * if context was searching for refs *from* protein sequence
976    * @param fromSeq
977    * a sequence to ignore (start point of search)
978    * @param xrf
979    * a cross-reference to try to match
980    * @param foundSeqs
981    * result list to add to
982    * @param mappings
983    * a set of sequence mappings to add to
984    * @param direct
985    * - indicates the type of relationship between returned sequences,
986    * xrf, and sequenceI that is required.
987    * <ul>
988    * <li>direct implies xrf is a primary reference for sequenceI AND
989    * the sequences to be located (eg a uniprot ID for a protein
990    * sequence, and a uniprot ref on a transcript sequence).</li>
991    * <li>indirect means xrf is a cross reference with respect to
992    * sequenceI or all the returned sequences (eg a genomic reference
993    * associated with a locus and one or more transcripts)</li>
994    * </ul>
995    * @return true if relationship found and sequence added.
996    */
 
997  2515 toggle boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
998    List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
999    boolean direct)
1000    {
1001  2515 boolean found = false;
1002  2515 if (dataset == null)
1003    {
1004  0 return false;
1005    }
1006  2515 if (dataset.getSequences() == null)
1007    {
1008  0 System.err.println("Empty dataset sequence set - NO VECTOR");
1009  0 return false;
1010    }
1011  2515 List<SequenceI> ds = dataset.getSequences();
1012  2515 synchronized (ds)
1013    {
1014  2515 for (SequenceI nxt : ds)
1015    {
1016  45449 if (nxt != null)
1017    {
1018  45449 if (nxt.getDatasetSequence() != null)
1019    {
1020  0 System.err.println(
1021    "Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! ("
1022    + nxt.getDisplayId(true) + " has ds reference "
1023    + nxt.getDatasetSequence().getDisplayId(true)
1024    + ")");
1025    }
1026  45449 if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence())
1027    {
1028  2512 continue;
1029    }
1030    /*
1031    * only look at same molecule type if 'direct', or
1032    * complementary type if !direct
1033    */
1034    {
1035  42937 boolean isDna = !nxt.isProtein();
1036  42937 if (direct ? (isDna != fromDna) : (isDna == fromDna))
1037    {
1038    // skip this sequence because it is wrong molecule type
1039  42930 continue;
1040    }
1041    }
1042   
1043    // look for direct or indirect references in common
1044  7 DBRefEntry[] poss = nxt.getDBRefs();
1045  7 List<DBRefEntry> cands = null;
1046   
1047    // todo: indirect specifies we select either direct references to nxt
1048    // that match xrf which is indirect to sequenceI, or indirect
1049    // references to nxt that match xrf which is direct to sequenceI
1050  7 cands = DBRefUtils.searchRefs(poss, xrf);
1051    // else
1052    // {
1053    // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);
1054    // cands = DBRefUtils.searchRefs(poss, xrf);
1055    // }
1056  4 if (!cands.isEmpty())
1057    {
1058  4 if (foundSeqs.contains(nxt))
1059    {
1060  0 continue;
1061    }
1062  4 found = true;
1063  4 foundSeqs.add(nxt);
1064  4 if (mappings != null && !direct)
1065    {
1066    /*
1067    * if the matched sequence has mapped dbrefs to
1068    * protein product / cdna, add equivalent mappings to
1069    * our source sequence
1070    */
1071  4 for (DBRefEntry candidate : cands)
1072    {
1073  4 Mapping mapping = candidate.getMap();
1074  4 if (mapping != null)
1075    {
1076  1 MapList map = mapping.getMap();
1077  1 if (mapping.getTo() != null
1078    && map.getFromRatio() != map.getToRatio())
1079    {
1080    /*
1081    * add a mapping, as from dna to peptide sequence
1082    */
1083  1 if (map.getFromRatio() == 3)
1084    {
1085  1 mappings.addMap(nxt, fromSeq, map);
1086    }
1087    else
1088    {
1089  0 mappings.addMap(nxt, fromSeq, map.getInverse());
1090    }
1091    }
1092    }
1093    }
1094    }
1095    }
1096    }
1097    }
1098    }
1099  2515 return found;
1100    }
1101    }