Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 20:42:03 GMT
  2. Package jalview.analysis

File CrossRef.java

 

Coverage histogram

../../img/srcFileCovDistChart5.png
42% of files have more coverage

Code metrics

184
288
16
1
1,126
654
137
0.48
18
16
8.56

Classes

Class Line # Actions
CrossRef 47 288 137
0.4631147746.3%
 

Contributing tests

This file is covered by 187 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.Alignment;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.DBRefEntry;
27    import jalview.datamodel.DBRefSource;
28    import jalview.datamodel.Mapping;
29    import jalview.datamodel.Sequence;
30    import jalview.datamodel.SequenceFeature;
31    import jalview.datamodel.SequenceI;
32    import jalview.util.DBRefUtils;
33    import jalview.util.MapList;
34    import jalview.ws.SequenceFetcherFactory;
35    import jalview.ws.seqfetcher.ASequenceFetcher;
36   
37    import java.util.ArrayList;
38    import java.util.Iterator;
39    import java.util.List;
40   
41    /**
42    * Functions for cross-referencing sequence databases.
43    *
44    * @author JimP
45    *
46    */
 
47    public class CrossRef
48    {
49    /*
50    * the dataset of the alignment for which we are searching for
51    * cross-references; in some cases we may resolve xrefs by
52    * searching in the dataset
53    */
54    private AlignmentI dataset;
55   
56    /*
57    * the sequences for which we are seeking cross-references
58    */
59    private SequenceI[] fromSeqs;
60   
61    /**
62    * matcher built from dataset
63    */
64    SequenceIdMatcher matcher;
65   
66    /**
67    * sequences found by cross-ref searches to fromSeqs
68    */
69    List<SequenceI> rseqs;
70   
71    /**
72    * Constructor
73    *
74    * @param seqs
75    * the sequences for which we are seeking cross-references
76    * @param ds
77    * the containing alignment dataset (may be searched to resolve
78    * cross-references)
79    */
 
80  683 toggle public CrossRef(SequenceI[] seqs, AlignmentI ds)
81    {
82  683 fromSeqs = seqs;
83  683 dataset = ds.getDataset() == null ? ds : ds.getDataset();
84    }
85   
86    /**
87    * Returns a list of distinct database sources for which sequences have either
88    * <ul>
89    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
90    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
91    * reference from another sequence in the dataset which has a cross-reference
92    * to a direct DBRefEntry on the given sequence</li>
93    * </ul>
94    *
95    * @param dna
96    * - when true, cross-references *from* dna returned. When false,
97    * cross-references *from* protein are returned
98    * @return
99    */
 
100  676 toggle public List<String> findXrefSourcesForSequences(boolean dna)
101    {
102  676 List<String> sources = new ArrayList<>();
103  676 for (SequenceI seq : fromSeqs)
104    {
105  6699 if (seq != null)
106    {
107  6699 findXrefSourcesForSequence(seq, dna, sources);
108    }
109    }
110  676 sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in
111    // redundant datasets
112  676 if (dna)
113    {
114  91 sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and
115    // EnsemblGenomes xref option shown
116    // from cdna panel
117  91 sources.remove(DBRefSource.ENSEMBLGENOMES);
118    }
119    // redundant datasets
120  676 return sources;
121    }
122   
123    /**
124    * Returns a list of distinct database sources for which a sequence has either
125    * <ul>
126    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
127    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
128    * reference from another sequence in the dataset which has a cross-reference
129    * to a direct DBRefEntry on the given sequence</li>
130    * </ul>
131    *
132    * @param seq
133    * the sequence whose dbrefs we are searching against
134    * @param fromDna
135    * when true, context is DNA - so sources identifying protein
136    * products will be returned.
137    * @param sources
138    * a list of sources to add matches to
139    */
 
140  6699 toggle void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
141    List<String> sources)
142    {
143    /*
144    * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
145    */
146  6699 List<DBRefEntry> rfs = DBRefUtils.selectDbRefs(!fromDna,
147    seq.getDBRefs());
148  6699 addXrefsToSources(rfs, sources);
149  6699 if (dataset != null)
150    {
151    /*
152    * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
153    */
154  6699 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
155    seq.getDBRefs());
156  6699 List<SequenceI> foundSeqs = new ArrayList<>();
157   
158    /*
159    * find sequences in the alignment which xref one of these DBRefs
160    * i.e. is xref-ed to a common sequence identifier
161    */
162  6699 searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null);
163   
164    /*
165    * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
166    */
167  6699 for (SequenceI rs : foundSeqs)
168    {
169  272 List<DBRefEntry> xrs = DBRefUtils.selectDbRefs(!fromDna,
170    rs.getDBRefs());
171  272 addXrefsToSources(xrs, sources);
172    }
173    }
174    }
175   
176    /**
177    * Helper method that adds the source identifiers of some cross-references to
178    * a (non-redundant) list of database sources
179    *
180    * @param xrefs
181    * @param sources
182    */
 
183  6971 toggle void addXrefsToSources(List<DBRefEntry> xrefs, List<String> sources)
184    {
185  6971 if (xrefs != null)
186    {
187  364 for (DBRefEntry ref : xrefs)
188    {
189    /*
190    * avoid duplication e.g. ENSEMBL and Ensembl
191    */
192  1358 String source = DBRefUtils.getCanonicalName(ref.getSource());
193  1358 if (!sources.contains(source))
194    {
195  21 sources.add(source);
196    }
197    }
198    }
199    }
200   
201    /**
202    * Attempts to find cross-references from the sequences provided in the
203    * constructor to the given source database. Cross-references may be found
204    * <ul>
205    * <li>in dbrefs on the sequence which hold a mapping to a sequence
206    * <ul>
207    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
208    * <li>populated previously after getting cross-references</li>
209    * </ul>
210    * <li>as other sequences in the alignment which share a dbref identifier with
211    * the sequence</li>
212    * <li>by fetching from the remote database</li>
213    * </ul>
214    * The cross-referenced sequences, and mappings to them, are added to the
215    * alignment dataset.
216    *
217    * @param source
218    * @return cross-referenced sequences (as dataset sequences)
219    */
 
220  6 toggle public Alignment findXrefSequences(String source, boolean fromDna)
221    {
222   
223  6 rseqs = new ArrayList<>();
224  6 AlignedCodonFrame cf = new AlignedCodonFrame();
225  6 matcher = new SequenceIdMatcher(dataset.getSequences());
226   
227  6 for (SequenceI seq : fromSeqs)
228    {
229  48 SequenceI dss = seq;
230  93 while (dss.getDatasetSequence() != null)
231    {
232  45 dss = dss.getDatasetSequence();
233    }
234  48 boolean found = false;
235  48 List<DBRefEntry> xrfs = DBRefUtils.selectDbRefs(!fromDna,
236    dss.getDBRefs());
237    // ENST & ENSP comes in to both Protein and nucleotide, so we need to
238    // filter them
239    // out later.
240  48 if ((xrfs == null || xrfs.size() == 0) && dataset != null)
241    {
242    /*
243    * found no suitable dbrefs on sequence - look for sequences in the
244    * alignment which share a dbref with this one
245    */
246  3 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
247    seq.getDBRefs());
248   
249    /*
250    * find sequences (except this one!), of complementary type,
251    * which have a dbref to an accession id for this sequence,
252    * and add them to the results
253    */
254  3 found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
255    }
256  48 if (xrfs == null && !found)
257    {
258    /*
259    * no dbref to source on this sequence or matched
260    * complementary sequence in the dataset
261    */
262  1 continue;
263    }
264  47 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
265    source);
266  47 Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
267    // At this point, if we are retrieving Ensembl, we still don't filter out
268    // ENST when looking for protein crossrefs.
269  93 while (refIterator.hasNext())
270    {
271  46 DBRefEntry xref = refIterator.next();
272  46 found = false;
273    // we're only interested in coding cross-references, not
274    // locus->transcript
275  46 if (xref.hasMap() && xref.getMap().getMap().isTripletMap())
276    {
277  24 SequenceI mappedTo = xref.getMap().getTo();
278  24 if (mappedTo != null)
279    {
280    /*
281    * dbref contains the sequence it maps to; add it to the
282    * results unless we have done so already (could happen if
283    * fetching xrefs for sequences which have xrefs in common)
284    * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
285    */
286  24 found = true;
287    /*
288    * problem: matcher.findIdMatch() is lenient - returns a sequence
289    * with a dbref to the search arg e.g. ENST for ENSP - wrong
290    * but findInDataset() matches ENSP when looking for Uniprot...
291    */
292  24 SequenceI matchInDataset = findInDataset(xref);
293  24 if (matchInDataset != null && xref.getMap().getTo() != null
294    && matchInDataset != xref.getMap().getTo())
295    {
296  0 jalview.bin.Console.errPrintln(
297    "Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref."
298    + "Found:" + matchInDataset + "\nExpected:"
299    + xref.getMap().getTo() + "\nFor xref:"
300    + xref);
301    }
302    /*matcher.findIdMatch(mappedTo);*/
303  24 if (matchInDataset != null)
304    {
305  22 if (!rseqs.contains(matchInDataset))
306    {
307  0 rseqs.add(matchInDataset);
308    }
309    // even if rseqs contained matchInDataset - check mappings between
310    // these seqs are added
311    // need to try harder to only add unique mappings
312  22 if (xref.getMap().getMap().isTripletMap()
313    && dataset.getMapping(seq, matchInDataset) == null
314    && cf.getMappingBetween(seq, matchInDataset) == null)
315    {
316    // materialise a mapping for highlighting between these
317    // sequences
318  11 if (fromDna)
319    {
320  11 cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
321    xref.getMap().getMappedFromId());
322    }
323    else
324    {
325  0 cf.addMap(matchInDataset, dss,
326    xref.getMap().getMap().getInverse(),
327    xref.getMap().getMappedFromId());
328    }
329    }
330   
331  22 refIterator.remove();
332  22 continue;
333    }
334    // TODO: need to determine if this should be a deriveSequence
335  2 SequenceI rsq = new Sequence(mappedTo);
336  2 rseqs.add(rsq);
337  2 if (xref.getMap().getMap().isTripletMap())
338    {
339    // get sense of map correct for adding to product alignment.
340  2 if (fromDna)
341    {
342    // map is from dna seq to a protein product
343  2 cf.addMap(dss, rsq, xref.getMap().getMap(),
344    xref.getMap().getMappedFromId());
345    }
346    else
347    {
348    // map should be from protein seq to its coding dna
349  0 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(),
350    xref.getMap().getMappedFromId());
351    }
352    }
353    }
354    }
355   
356  24 if (!found)
357    {
358  22 SequenceI matchedSeq = matcher.findIdMatch(
359    xref.getSource() + "|" + xref.getAccessionId());
360    // if there was a match, check it's at least the right type of
361    // molecule!
362  22 if (matchedSeq != null && matchedSeq.isProtein() == fromDna)
363    {
364  0 if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
365    {
366  0 found = true;
367    }
368    }
369    }
370   
371  24 if (!found)
372    {
373    // do a bit more work - search for sequences with references matching
374    // xrefs on this sequence.
375  22 found = searchDataset(fromDna, dss, xref, rseqs, cf, false,
376    DBRefUtils.SEARCH_MODE_FULL);
377    }
378  24 if (found)
379    {
380  24 refIterator.remove();
381    }
382    }
383   
384    /*
385    * fetch from source database any dbrefs we haven't resolved up to here
386    */
387  47 if (!sourceRefs.isEmpty())
388    {
389  0 retrieveCrossRef(sourceRefs, seq, xrfs, fromDna, cf);
390    }
391    }
392   
393  6 Alignment ral = null;
394  6 if (rseqs.size() > 0)
395    {
396  5 ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
397  5 if (!cf.isEmpty())
398    {
399  2 dataset.addCodonFrame(cf);
400    }
401    }
402  6 return ral;
403    }
404   
 
405  0 toggle private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
406    List<DBRefEntry> xrfs, boolean fromDna, AlignedCodonFrame cf)
407    {
408  0 ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher();
409  0 SequenceI[] retrieved = null;
410  0 SequenceI dss = seq.getDatasetSequence() == null ? seq
411    : seq.getDatasetSequence();
412    // first filter in case we are retrieving crossrefs that have already been
413    // retrieved. this happens for cases where a database record doesn't yield
414    // protein products for CDS
415  0 removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
416  0 if (sourceRefs.size() == 0)
417    {
418    // no more work to do! We already had all requested sequence records in
419    // the dataset.
420  0 return;
421    }
422  0 try
423    {
424  0 retrieved = sftch.getSequences(sourceRefs, !fromDna);
425    } catch (Exception e)
426    {
427  0 jalview.bin.Console.errPrintln(
428    "Problem whilst retrieving cross references for Sequence : "
429    + seq.getName());
430  0 e.printStackTrace();
431    }
432   
433  0 if (retrieved != null)
434    {
435  0 boolean addedXref = false;
436  0 List<SequenceI> newDsSeqs = new ArrayList<>(),
437    doNotAdd = new ArrayList<>();
438   
439  0 for (SequenceI retrievedSequence : retrieved)
440    {
441    // dataset gets contaminated ccwith non-ds sequences. why ??!
442    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
443  0 SequenceI retrievedDss = retrievedSequence
444    .getDatasetSequence() == null ? retrievedSequence
445    : retrievedSequence.getDatasetSequence();
446  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
447    retrievedDss);
448    }
449    // JBPNote: What assumptions are made for dbref structures on
450    // retrieved sequences ?
451    // addedXref will be true means importCrossRefSeq found
452    // sequences with dbrefs with mappings to sequences congruent with dss
453   
454  0 if (!addedXref)
455    {
456    // try again, after looking for matching IDs
457    // shouldn't need to do this unless the dbref mechanism has broken.
458  0 updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
459  0 for (SequenceI retrievedSequence : retrieved)
460    {
461    // dataset gets contaminated ccwith non-ds sequences. why ??!
462    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
463  0 SequenceI retrievedDss = retrievedSequence
464    .getDatasetSequence() == null ? retrievedSequence
465    : retrievedSequence.getDatasetSequence();
466  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
467    retrievedDss);
468    }
469    }
470  0 for (SequenceI newToSeq : newDsSeqs)
471    {
472  0 if (!doNotAdd.contains(newToSeq)
473    && dataset.findIndex(newToSeq) == -1)
474    {
475  0 dataset.addSequence(newToSeq);
476  0 matcher.add(newToSeq);
477    }
478    }
479    }
480    }
481   
482    /**
483    * Search dataset for sequences with a primary reference contained in
484    * sourceRefs.
485    *
486    * @param sourceRefs
487    * - list of references to filter.
488    * @param fromDna
489    * - type of sequence to search for matching primary reference.
490    */
 
491  0 toggle private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
492    boolean fromDna)
493    {
494  0 List<DBRefEntry> dbrSourceSet = new ArrayList<>(sourceRefs);
495  0 List<SequenceI> dsSeqs = dataset.getSequences();
496  0 for (int ids = 0, nds = dsSeqs.size(); ids < nds; ids++)
497    {
498  0 SequenceI sq = dsSeqs.get(ids);
499  0 boolean dupeFound = false;
500    // !fromDna means we are looking only for nucleotide sequences, not
501    // protein
502  0 if (sq.isProtein() == fromDna)
503    {
504  0 List<DBRefEntry> sqdbrefs = sq.getPrimaryDBRefs();
505  0 for (int idb = 0, ndb = sqdbrefs.size(); idb < ndb; idb++)
506    {
507  0 DBRefEntry dbr = sqdbrefs.get(idb);
508  0 List<DBRefEntry> searchrefs = DBRefUtils.searchRefs(dbrSourceSet,
509    dbr, DBRefUtils.SEARCH_MODE_FULL);
510  0 for (int isr = 0, nsr = searchrefs.size(); isr < nsr; isr++)
511    {
512  0 sourceRefs.remove(searchrefs.get(isr));
513  0 dupeFound = true;
514    }
515    }
516    }
517  0 if (dupeFound)
518    {
519    // rebuild the search array from the filtered sourceRefs list
520  0 dbrSourceSet.clear();
521  0 dbrSourceSet.addAll(sourceRefs);
522    }
523    }
524    }
525   
526    /**
527    * process sequence retrieved via a dbref on source sequence to resolve and
528    * transfer data JBPNote: as of 2022-02-03 - this assumes retrievedSequence
529    * has dbRefs with Mapping references to a sequence congruent with
530    * sourceSequence
531    *
532    * @param cf
533    * @param sourceSequence
534    * @param retrievedSequence
535    * @return true if retrieveSequence was imported
536    */
 
537  0 toggle private boolean importCrossRefSeq(AlignedCodonFrame cf,
538    List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd,
539    SequenceI sourceSequence, SequenceI retrievedSequence)
540    {
541    /**
542    * set when retrievedSequence has been verified as a crossreference for
543    * sourceSequence
544    */
545  0 boolean imported = false;
546  0 List<DBRefEntry> dbr = retrievedSequence.getDBRefs();
547  0 if (dbr != null)
548    {
549  0 for (int ib = 0, nb = dbr.size(); ib < nb; ib++)
550    {
551   
552  0 DBRefEntry dbref = dbr.get(ib);
553    // matched will return null if the dbref has no map
554  0 SequenceI matched = findInDataset(dbref);
555  0 if (matched == sourceSequence)
556    {
557    // verified retrieved and source sequence cross-reference each other
558  0 imported = true;
559    }
560    // find any entry where we should put in the sequence being
561    // cross-referenced into the map
562  0 Mapping map = dbref.getMap();
563  0 if (map != null)
564    {
565  0 SequenceI ms = map.getTo();
566  0 if (ms != null && map.getMap() != null)
567    {
568  0 if (ms == sourceSequence)
569    {
570    // already called to import once, and most likely this sequence
571    // already imported !
572  0 continue;
573    }
574  0 if (matched == null)
575    {
576    /*
577    * sequence is new to dataset, so save a reference so it can be added.
578    */
579  0 newDsSeqs.add(ms);
580  0 continue;
581    }
582   
583    /*
584    * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one.
585    */
586   
587  0 try
588    {
589    // compare ms with dss and replace with dss in mapping
590    // if map is congruent
591    // TODO findInDataset requires exact sequence match but
592    // 'congruent' test is only for the mapped part
593    // maybe not a problem in practice since only ENA provide a
594    // mapping and it is to the full protein translation of CDS
595    // matcher.findIdMatch(map.getTo());
596    // TODO addendum: if matched is shorter than getTo, this will fail
597    // - when it should really succeed.
598  0 int sf = map.getMap().getToLowest();
599  0 int st = map.getMap().getToHighest();
600  0 SequenceI mappedrg = ms.getSubSequence(sf, st);
601  0 if (mappedrg.getLength() > 0 && ms.getSequenceAsString()
602    .equals(matched.getSequenceAsString()))
603    {
604    /*
605    * sequences were a match,
606    */
607  0 String msg = "Mapping updated from " + ms.getName()
608    + " to retrieved crossreference "
609    + matched.getName();
610  0 jalview.bin.Console.outPrintln(msg);
611   
612  0 List<DBRefEntry> toRefs = map.getTo().getDBRefs();
613  0 if (toRefs != null)
614    {
615    /*
616    * transfer database refs
617    */
618  0 for (DBRefEntry ref : toRefs)
619    {
620  0 if (dbref.getSrcAccString()
621    .equals(ref.getSrcAccString()))
622    {
623  0 continue; // avoid overwriting the ref on source sequence
624    }
625  0 matched.addDBRef(ref); // add or update mapping
626    }
627    }
628  0 doNotAdd.add(map.getTo());
629  0 map.setTo(matched);
630   
631    /*
632    * give the reverse reference the inverse mapping
633    * (if it doesn't have one already)
634    */
635  0 setReverseMapping(matched, dbref, cf);
636   
637    /*
638    * copy sequence features as well, avoiding
639    * duplication (e.g. same variation from two
640    * transcripts)
641    */
642  0 List<SequenceFeature> sfs = ms.getFeatures()
643    .getAllFeatures();
644  0 for (SequenceFeature feat : sfs)
645    {
646    /*
647    * make a flyweight feature object which ignores Parent
648    * attribute in equality test; this avoids creating many
649    * otherwise duplicate exon features on genomic sequence
650    */
651  0 SequenceFeature newFeature = new SequenceFeature(feat)
652    {
 
653  0 toggle @Override
654    public boolean equals(Object o)
655    {
656  0 return super.equals(o, true);
657    }
658    };
659  0 matched.addSequenceFeature(newFeature);
660    }
661    }
662  0 cf.addMap(retrievedSequence, map.getTo(), map.getMap());
663    } catch (Exception e)
664    {
665  0 jalview.bin.Console.errPrintln(
666    "Exception when consolidating Mapped sequence set...");
667  0 e.printStackTrace(System.err);
668    }
669    }
670    }
671    }
672    }
673  0 if (imported)
674    {
675  0 retrievedSequence.updatePDBIds();
676  0 rseqs.add(retrievedSequence);
677  0 if (dataset.findIndex(retrievedSequence) == -1)
678    {
679  0 dataset.addSequence(retrievedSequence);
680  0 matcher.add(retrievedSequence);
681    }
682    }
683  0 return imported;
684    }
685   
686    /**
687    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
688    * to sequence (if any). This is used after fetching a cross-referenced
689    * sequence, if the fetched sequence has a mapping to the original sequence,
690    * to set the mapping in the original sequence's dbref.
691    *
692    * @param mapFrom
693    * the sequence mapped from
694    * @param dbref
695    * @param mappings
696    */
 
697  0 toggle void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
698    AlignedCodonFrame mappings)
699    {
700  0 SequenceI mapTo = dbref.getMap().getTo();
701  0 if (mapTo == null)
702    {
703  0 return;
704    }
705  0 List<DBRefEntry> dbrefs = mapTo.getDBRefs();
706  0 if (dbrefs == null)
707    {
708  0 return;
709    }
710  0 for (DBRefEntry toRef : dbrefs)
711    {
712  0 if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
713    {
714    /*
715    * found the reverse dbref; update its mapping if null
716    */
717  0 if (toRef.getMap().getMap() == null)
718    {
719  0 MapList inverse = dbref.getMap().getMap().getInverse();
720  0 toRef.getMap().setMap(inverse);
721  0 mappings.addMap(mapTo, mapFrom, inverse);
722    }
723    }
724    }
725    }
726   
727    /**
728    * Returns null or the first sequence in the dataset which is identical to
729    * xref.mapTo, and has a) a primary dbref matching xref, or if none found, the
730    * first one with an ID source|xrefacc JBPNote: Could refactor this to
731    * AlignmentI/DatasetI
732    *
733    * @param xref
734    * with map and mapped-to sequence
735    * @return
736    */
 
737  24 toggle SequenceI findInDataset(DBRefEntry xref)
738    {
739  24 if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
740    {
741  0 return null;
742    }
743  24 SequenceI mapsTo = xref.getMap().getTo();
744  24 String name = xref.getAccessionId();
745  24 String name2 = xref.getSource() + "|" + name;
746  24 SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo
747    : mapsTo.getDatasetSequence();
748    // first check ds if ds is directly referenced
749  24 if (dataset.findIndex(dss) > -1)
750    {
751  22 return dss;
752    }
753  2 DBRefEntry template = new DBRefEntry(xref.getSource(), null,
754    xref.getAccessionId());
755    /**
756    * remember the first ID match - in case we don't find a match to template
757    */
758  2 SequenceI firstIdMatch = null;
759  2 for (SequenceI seq : dataset.getSequences())
760    {
761    // first check primary refs.
762  2 List<DBRefEntry> match = DBRefUtils.searchRefs(seq.getPrimaryDBRefs(),
763    template, DBRefUtils.SEARCH_MODE_FULL);
764  2 if (match != null && match.size() == 1 && sameSequence(seq, dss))
765    {
766  0 return seq;
767    }
768    /*
769    * clumsy alternative to using SequenceIdMatcher which currently
770    * returns sequences with a dbref to the matched accession id
771    * which we don't want
772    */
773  2 if (firstIdMatch == null && (name.equals(seq.getName())
774    || seq.getName().startsWith(name2)))
775    {
776  0 if (sameSequence(seq, dss))
777    {
778  0 firstIdMatch = seq;
779    }
780    }
781    }
782  2 return firstIdMatch;
783    }
784   
785    /**
786    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
787    * case), else false. This method compares the lengths, then each character in
788    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
789    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
790    *
791    * @param seq1
792    * @param seq2
793    * @return
794    */
795    // TODO move to Sequence / SequenceI
 
796  7 toggle static boolean sameSequence(SequenceI seq1, SequenceI seq2)
797    {
798  7 if (seq1 == seq2)
799    {
800  1 return true;
801    }
802  6 if (seq1 == null || seq2 == null)
803    {
804  2 return false;
805    }
806   
807  4 if (seq1.getLength() != seq2.getLength())
808    {
809  2 return false;
810    }
811  2 int length = seq1.getLength();
812  14 for (int i = 0; i < length; i++)
813    {
814  12 int diff = seq1.getCharAt(i) - seq2.getCharAt(i);
815    /*
816    * same char or differ in case only ('a'-'A' == 32)
817    */
818  12 if (diff != 0 && diff != 32 && diff != -32)
819    {
820  0 return false;
821    }
822    }
823  2 return true;
824    }
825   
826    /**
827    * Updates any empty mappings in the cross-references with one to a compatible
828    * retrieved sequence if found, and adds any new mappings to the
829    * AlignedCodonFrame JBPNote: TODO: this relies on sequence IDs like
830    * UNIPROT|ACCESSION - which do not always happen.
831    *
832    * @param mapFrom
833    * @param xrefs
834    * @param retrieved
835    * @param acf
836    */
 
837  0 toggle void updateDbrefMappings(SequenceI mapFrom, List<DBRefEntry> xrefs,
838    SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
839    {
840  0 SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
841  0 for (DBRefEntry xref : xrefs)
842    {
843  0 if (!xref.hasMap())
844    {
845  0 String targetSeqName = xref.getSource() + "|"
846    + xref.getAccessionId();
847  0 SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName);
848  0 if (matches == null)
849    {
850  0 return;
851    }
852  0 for (SequenceI seq : matches)
853    {
854  0 constructMapping(mapFrom, seq, xref, acf, fromDna);
855    }
856    }
857    }
858    }
859   
860    /**
861    * Tries to make a mapping between sequences. If successful, adds the mapping
862    * to the dbref and the mappings collection and answers true, otherwise
863    * answers false. The following methods of making are mapping are tried in
864    * turn:
865    * <ul>
866    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
867    * example, the case after fetching EMBL cross-references for a Uniprot
868    * sequence</li>
869    * <li>else check if the dna translates exactly to the protein (give or take
870    * start and stop codons></li>
871    * <li>else try to map based on CDS features on the dna sequence</li>
872    * </ul>
873    *
874    * @param mapFrom
875    * @param mapTo
876    * @param xref
877    * @param mappings
878    * @return
879    */
 
880  0 toggle boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
881    DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
882    {
883  0 MapList mapping = null;
884  0 SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom
885    : mapFrom.getDatasetSequence();
886  0 SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo
887    : mapTo.getDatasetSequence();
888    /*
889    * look for a reverse mapping, if found make its inverse.
890    * Note - we do this on dataset sequences only.
891    */
892  0 if (dsmapTo.getDBRefs() != null)
893    {
894  0 for (DBRefEntry dbref : dsmapTo.getDBRefs())
895    {
896  0 String name = dbref.getSource() + "|" + dbref.getAccessionId();
897  0 if (dbref.hasMap() && dsmapFrom.getName().startsWith(name))
898    {
899    /*
900    * looks like we've found a map from 'mapTo' to 'mapFrom'
901    * - invert it to make the mapping the other way
902    */
903  0 MapList reverse = dbref.getMap().getMap().getInverse();
904  0 xref.setMap(new Mapping(dsmapTo, reverse));
905  0 mappings.addMap(mapFrom, dsmapTo, reverse);
906  0 return true;
907    }
908    }
909    }
910   
911  0 if (fromDna)
912    {
913  0 mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
914    }
915    else
916    {
917  0 mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
918  0 if (mapping != null)
919    {
920  0 mapping = mapping.getInverse();
921    }
922    }
923  0 if (mapping == null)
924    {
925  0 return false;
926    }
927  0 xref.setMap(new Mapping(mapTo, mapping));
928   
929    /*
930    * and add a reverse DbRef with the inverse mapping
931    */
932  0 if (mapFrom.getDatasetSequence() != null && false)
933    // && mapFrom.getDatasetSequence().getSourceDBRef() != null)
934    {
935    // possible need to search primary references... except, why doesn't xref
936    // == getSourceDBRef ??
937    // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
938    // .getSourceDBRef());
939    // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
940    // .getInverse()));
941    // mapTo.addDBRef(dbref);
942    }
943   
944  0 if (fromDna)
945    {
946    // AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
947  0 mappings.addMap(mapFrom, mapTo, mapping);
948    }
949    else
950    {
951  0 mappings.addMap(mapTo, mapFrom, mapping.getInverse());
952    }
953   
954  0 return true;
955    }
956   
957    /**
958    * find references to lrfs in the cross-reference set of each sequence in
959    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
960    * based on source and accession string only - Map and Version are nulled.
961    *
962    * @param fromDna
963    * - true if context was searching from Dna sequences, false if
964    * context was searching from Protein sequences
965    * @param sequenceI
966    * @param lrfs
967    * @param foundSeqs
968    * @return true if matches were found.
969    */
 
970  6702 toggle private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
971    List<DBRefEntry> lrfs, List<SequenceI> foundSeqs,
972    AlignedCodonFrame cf)
973    {
974  6702 boolean found = false;
975  6702 if (lrfs == null)
976    {
977  4981 return false;
978    }
979  3659 for (int i = 0, n = lrfs.size(); i < n; i++)
980    {
981    // DBRefEntry xref = new DBRefEntry(lrfs.get(i));
982    // // add in wildcards
983    // xref.setVersion(null);
984    // xref.setMap(null);
985  1938 found |= searchDataset(fromDna, sequenceI, lrfs.get(i), foundSeqs, cf,
986    false, DBRefUtils.SEARCH_MODE_NO_MAP_NO_VERSION);
987    }
988  1721 return found;
989    }
990   
991    /**
992    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
993    * associated sequence to rseqs
994    *
995    * @param fromDna
996    * true if context was searching for refs *from* dna sequence, false
997    * if context was searching for refs *from* protein sequence
998    * @param fromSeq
999    * a sequence to ignore (start point of search)
1000    * @param xrf
1001    * a cross-reference to try to match
1002    * @param foundSeqs
1003    * result list to add to
1004    * @param mappings
1005    * a set of sequence mappings to add to
1006    * @param direct
1007    * - indicates the type of relationship between returned sequences,
1008    * xrf, and sequenceI that is required.
1009    * <ul>
1010    * <li>direct implies xrf is a primary reference for sequenceI AND
1011    * the sequences to be located (eg a uniprot ID for a protein
1012    * sequence, and a uniprot ref on a transcript sequence).</li>
1013    * <li>indirect means xrf is a cross reference with respect to
1014    * sequenceI or all the returned sequences (eg a genomic reference
1015    * associated with a locus and one or more transcripts)</li>
1016    * </ul>
1017    * @param mode
1018    * SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional
1019    * @return true if relationship found and sequence added.
1020    */
 
1021  1963 toggle boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
1022    List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
1023    boolean direct, int mode)
1024    {
1025  1963 boolean found = false;
1026  1963 if (dataset == null)
1027    {
1028  0 return false;
1029    }
1030  1963 if (dataset.getSequences() == null)
1031    {
1032  0 jalview.bin.Console
1033    .errPrintln("Empty dataset sequence set - NO VECTOR");
1034  0 return false;
1035    }
1036  1963 List<SequenceI> ds = dataset.getSequences();
1037  1963 synchronized (ds)
1038    {
1039  1963 for (SequenceI nxt : ds)
1040    {
1041  43625 if (nxt != null)
1042    {
1043  43625 if (nxt.getDatasetSequence() != null)
1044    {
1045  0 jalview.bin.Console.errPrintln(
1046    "Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! ("
1047    + nxt.getDisplayId(true) + " has ds reference "
1048    + nxt.getDatasetSequence().getDisplayId(true)
1049    + ")");
1050    }
1051  43625 if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence())
1052    {
1053  1960 continue;
1054    }
1055    /*
1056    * only look at same molecule type if 'direct', or
1057    * complementary type if !direct
1058    */
1059    {
1060  41665 boolean isDna = !nxt.isProtein();
1061  41665 if (direct ? (isDna != fromDna) : (isDna == fromDna))
1062    {
1063    // skip this sequence because it is wrong molecule type
1064  36622 continue;
1065    }
1066    }
1067   
1068    // look for direct or indirect references in common
1069  5043 List<DBRefEntry> poss = nxt.getDBRefs();
1070  5043 List<DBRefEntry> cands = null;
1071   
1072    // todo: indirect specifies we select either direct references to nxt
1073    // that match xrf which is indirect to sequenceI, or indirect
1074    // references to nxt that match xrf which is direct to sequenceI
1075  5043 cands = DBRefUtils.searchRefs(poss, xrf, mode);
1076    // else
1077    // {
1078    // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);
1079    // cands = DBRefUtils.searchRefs(poss, xrf);
1080    // }
1081  5043 if (!cands.isEmpty())
1082    {
1083  880 if (foundSeqs.contains(nxt))
1084    {
1085  582 continue;
1086    }
1087  298 found = true;
1088  298 foundSeqs.add(nxt);
1089  298 if (mappings != null && !direct)
1090    {
1091    /*
1092    * if the matched sequence has mapped dbrefs to
1093    * protein product / cdna, add equivalent mappings to
1094    * our source sequence
1095    */
1096  26 for (DBRefEntry candidate : cands)
1097    {
1098  26 Mapping mapping = candidate.getMap();
1099  26 if (mapping != null)
1100    {
1101  1 MapList map = mapping.getMap();
1102  1 if (mapping.getTo() != null
1103    && map.getFromRatio() != map.getToRatio())
1104    {
1105    /*
1106    * add a mapping, as from dna to peptide sequence
1107    */
1108  1 if (map.getFromRatio() == 3)
1109    {
1110  1 mappings.addMap(nxt, fromSeq, map);
1111    }
1112    else
1113    {
1114  0 mappings.addMap(nxt, fromSeq, map.getInverse());
1115    }
1116    }
1117    }
1118    }
1119    }
1120    }
1121    }
1122    }
1123    }
1124  1963 return found;
1125    }
1126    }