Clover icon

Coverage Report

  1. Project Clover database Thu Dec 4 2025 14:43:25 GMT
  2. Package jalview.analysis

File CrossRef.java

 

Coverage histogram

../../img/srcFileCovDistChart5.png
43% of files have more coverage

Code metrics

184
287
16
1
1,124
652
137
0.48
17.94
16
8.56

Classes

Class Line # Actions
CrossRef 46 287 137
0.4640657 46.4%
 

Contributing tests

This file is covered by 197 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.util.ArrayList;
24    import java.util.Iterator;
25    import java.util.List;
26   
27    import jalview.datamodel.AlignedCodonFrame;
28    import jalview.datamodel.Alignment;
29    import jalview.datamodel.AlignmentI;
30    import jalview.datamodel.DBRefEntry;
31    import jalview.datamodel.DBRefSource;
32    import jalview.datamodel.Mapping;
33    import jalview.datamodel.Sequence;
34    import jalview.datamodel.SequenceFeature;
35    import jalview.datamodel.SequenceI;
36    import jalview.util.DBRefUtils;
37    import jalview.util.MapList;
38    import jalview.ws.SequenceFetcher;
39   
40    /**
41    * Functions for cross-referencing sequence databases.
42    *
43    * @author JimP
44    *
45    */
 
46    public class CrossRef
47    {
48    /*
49    * the dataset of the alignment for which we are searching for
50    * cross-references; in some cases we may resolve xrefs by
51    * searching in the dataset
52    */
53    private AlignmentI dataset;
54   
55    /*
56    * the sequences for which we are seeking cross-references
57    */
58    private SequenceI[] fromSeqs;
59   
60    /**
61    * matcher built from dataset
62    */
63    SequenceIdMatcher matcher;
64   
65    /**
66    * sequences found by cross-ref searches to fromSeqs
67    */
68    List<SequenceI> rseqs;
69   
70    /**
71    * Constructor
72    *
73    * @param seqs
74    * the sequences for which we are seeking cross-references
75    * @param ds
76    * the containing alignment dataset (may be searched to resolve
77    * cross-references)
78    */
 
79  740 toggle public CrossRef(SequenceI[] seqs, AlignmentI ds)
80    {
81  740 fromSeqs = seqs;
82  740 dataset = ds.getDataset() == null ? ds : ds.getDataset();
83    }
84   
85    /**
86    * Returns a list of distinct database sources for which sequences have either
87    * <ul>
88    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
89    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
90    * reference from another sequence in the dataset which has a cross-reference
91    * to a direct DBRefEntry on the given sequence</li>
92    * </ul>
93    *
94    * @param dna
95    * - when true, cross-references *from* dna returned. When false,
96    * cross-references *from* protein are returned
97    * @return
98    */
 
99  733 toggle public List<String> findXrefSourcesForSequences(boolean dna)
100    {
101  733 List<String> sources = new ArrayList<>();
102  733 for (SequenceI seq : fromSeqs)
103    {
104  7036 if (seq != null)
105    {
106  7036 findXrefSourcesForSequence(seq, dna, sources);
107    }
108    }
109  733 sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in
110    // redundant datasets
111  733 if (dna)
112    {
113  91 sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and
114    // EnsemblGenomes xref option shown
115    // from cdna panel
116  91 sources.remove(DBRefSource.ENSEMBLGENOMES);
117    }
118    // redundant datasets
119  733 return sources;
120    }
121   
122    /**
123    * Returns a list of distinct database sources for which a sequence has either
124    * <ul>
125    * <li>a (dna-to-protein or protein-to-dna) cross-reference</li>
126    * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna)
127    * reference from another sequence in the dataset which has a cross-reference
128    * to a direct DBRefEntry on the given sequence</li>
129    * </ul>
130    *
131    * @param seq
132    * the sequence whose dbrefs we are searching against
133    * @param fromDna
134    * when true, context is DNA - so sources identifying protein
135    * products will be returned.
136    * @param sources
137    * a list of sources to add matches to
138    */
 
139  7036 toggle void findXrefSourcesForSequence(SequenceI seq, boolean fromDna,
140    List<String> sources)
141    {
142    /*
143    * first find seq's xrefs (dna-to-peptide or peptide-to-dna)
144    */
145  7036 List<DBRefEntry> rfs = DBRefUtils.selectDbRefs(!fromDna,
146    seq.getDBRefs());
147  7036 addXrefsToSources(rfs, sources);
148  7036 if (dataset != null)
149    {
150    /*
151    * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs
152    */
153  7036 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
154    seq.getDBRefs());
155  7036 List<SequenceI> foundSeqs = new ArrayList<>();
156   
157    /*
158    * find sequences in the alignment which xref one of these DBRefs
159    * i.e. is xref-ed to a common sequence identifier
160    */
161  7036 searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null);
162   
163    /*
164    * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources
165    */
166  7036 for (SequenceI rs : foundSeqs)
167    {
168  272 List<DBRefEntry> xrs = DBRefUtils.selectDbRefs(!fromDna,
169    rs.getDBRefs());
170  272 addXrefsToSources(xrs, sources);
171    }
172    }
173    }
174   
175    /**
176    * Helper method that adds the source identifiers of some cross-references to
177    * a (non-redundant) list of database sources
178    *
179    * @param xrefs
180    * @param sources
181    */
 
182  7308 toggle void addXrefsToSources(List<DBRefEntry> xrefs, List<String> sources)
183    {
184  7308 if (xrefs != null)
185    {
186  405 for (DBRefEntry ref : xrefs)
187    {
188    /*
189    * avoid duplication e.g. ENSEMBL and Ensembl
190    */
191  1570 String source = DBRefUtils.getCanonicalName(ref.getSource());
192  1570 if (!sources.contains(source))
193    {
194  43 sources.add(source);
195    }
196    }
197    }
198    }
199   
200    /**
201    * Attempts to find cross-references from the sequences provided in the
202    * constructor to the given source database. Cross-references may be found
203    * <ul>
204    * <li>in dbrefs on the sequence which hold a mapping to a sequence
205    * <ul>
206    * <li>provided with a fetched sequence (e.g. ENA translation), or</li>
207    * <li>populated previously after getting cross-references</li>
208    * </ul>
209    * <li>as other sequences in the alignment which share a dbref identifier with
210    * the sequence</li>
211    * <li>by fetching from the remote database</li>
212    * </ul>
213    * The cross-referenced sequences, and mappings to them, are added to the
214    * alignment dataset.
215    *
216    * @param source
217    * @return cross-referenced sequences (as dataset sequences)
218    */
 
219  6 toggle public Alignment findXrefSequences(String source, boolean fromDna)
220    {
221   
      // reset the per-search state: the result list, the mappings found by
      // this search, and an id matcher built over the dataset's sequences
222  6 rseqs = new ArrayList<>();
223  6 AlignedCodonFrame cf = new AlignedCodonFrame();
224  6 matcher = new SequenceIdMatcher(dataset.getSequences());
225   
226  6 for (SequenceI seq : fromSeqs)
227    {
      // follow the dataset-sequence chain up to the sequence that has no
      // dataset sequence of its own
228  48 SequenceI dss = seq;
229  93 while (dss.getDatasetSequence() != null)
230    {
231  45 dss = dss.getDatasetSequence();
232    }
233  48 boolean found = false;
234  48 List<DBRefEntry> xrfs = DBRefUtils.selectDbRefs(!fromDna,
235    dss.getDBRefs());
236    // ENST & ENSP comes in to both Protein and nucleotide, so we need to
237    // filter them
238    // out later.
239  48 if ((xrfs == null || xrfs.size() == 0) && dataset != null)
240    {
241    /*
242    * found no suitable dbrefs on sequence - look for sequences in the
243    * alignment which share a dbref with this one
244    */
      // NOTE(review): these refs are read from seq, whereas xrfs above is
      // read from dss - confirm the difference is intended
245  3 List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna,
246    seq.getDBRefs());
247   
248    /*
249    * find sequences (except this one!), of complementary type,
250    * which have a dbref to an accession id for this sequence,
251    * and add them to the results
252    */
253  3 found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf);
254    }
255  48 if (xrfs == null && !found)
256    {
257    /*
258    * no dbref to source on this sequence or matched
259    * complementary sequence in the dataset
260    */
261  1 continue;
262    }
      // sourceRefs is the work list for this sequence: entries are removed
      // through the iterator as they are resolved, and whatever remains is
      // passed to retrieveCrossRef below
263  47 List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs,
264    source);
265  47 Iterator<DBRefEntry> refIterator = sourceRefs.iterator();
266    // At this point, if we are retrieving Ensembl, we still don't filter out
267    // ENST when looking for protein crossrefs.
268  93 while (refIterator.hasNext())
269    {
270  46 DBRefEntry xref = refIterator.next();
271  46 found = false;
272    // we're only interested in coding cross-references, not
273    // locus->transcript
274  46 if (xref.hasMap() && xref.getMap().getMap().isTripletMap())
275    {
276  24 SequenceI mappedTo = xref.getMap().getTo();
277  24 if (mappedTo != null)
278    {
279    /*
280    * dbref contains the sequence it maps to; add it to the
281    * results unless we have done so already (could happen if
282    * fetching xrefs for sequences which have xrefs in common)
283    * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707}
284    */
285  24 found = true;
286    /*
287    * problem: matcher.findIdMatch() is lenient - returns a sequence
288    * with a dbref to the search arg e.g. ENST for ENSP - wrong
289    * but findInDataset() matches ENSP when looking for Uniprot...
290    */
291  24 SequenceI matchInDataset = findInDataset(xref);
      // report (but do not abort) when the dataset lookup returns a
      // different object from the sequence the xref explicitly maps to
292  24 if (matchInDataset != null && xref.getMap().getTo() != null
293    && matchInDataset != xref.getMap().getTo())
294    {
295  0 jalview.bin.Console.errPrintln(
296    "Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref."
297    + "Found:" + matchInDataset + "\nExpected:"
298    + xref.getMap().getTo() + "\nFor xref:"
299    + xref);
300    }
301    /*matcher.findIdMatch(mappedTo);*/
302  24 if (matchInDataset != null)
303    {
304  22 if (!rseqs.contains(matchInDataset))
305    {
306  0 rseqs.add(matchInDataset);
307    }
308    // even if rseqs contained matchInDataset - check mappings between
309    // these seqs are added
310    // need to try harder to only add unique mappings
      // NOTE(review): the two existence checks below are keyed on seq,
      // but the mapping is added for dss - confirm this is intended
311  22 if (xref.getMap().getMap().isTripletMap()
312    && dataset.getMapping(seq, matchInDataset) == null
313    && cf.getMappingBetween(seq, matchInDataset) == null)
314    {
315    // materialise a mapping for highlighting between these
316    // sequences
317  11 if (fromDna)
318    {
319  11 cf.addMap(dss, matchInDataset, xref.getMap().getMap(),
320    xref.getMap().getMappedFromId());
321    }
322    else
323    {
324  0 cf.addMap(matchInDataset, dss,
325    xref.getMap().getMap().getInverse(),
326    xref.getMap().getMappedFromId());
327    }
328    }
329   
      // resolved against the dataset: drop it from the work list and
      // move on to the next xref
330  22 refIterator.remove();
331  22 continue;
332    }
      // no dataset match: add a new Sequence constructed from mappedTo to
      // the results
333    // TODO: need to determine if this should be a deriveSequence
334  2 SequenceI rsq = new Sequence(mappedTo);
335  2 rseqs.add(rsq);
336  2 if (xref.getMap().getMap().isTripletMap())
337    {
338    // get sense of map correct for adding to product alignment.
339  2 if (fromDna)
340    {
341    // map is from dna seq to a protein product
342  2 cf.addMap(dss, rsq, xref.getMap().getMap(),
343    xref.getMap().getMappedFromId());
344    }
345    else
346    {
347    // map should be from protein seq to its coding dna
348  0 cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(),
349    xref.getMap().getMappedFromId());
350    }
351    }
352    }
353    }
354   
      // fallback 1: look the xref up by "source|accession" id in the
      // dataset matcher
355  24 if (!found)
356    {
357  22 SequenceI matchedSeq = matcher.findIdMatch(
358    xref.getSource() + "|" + xref.getAccessionId());
359    // if there was a match, check it's at least the right type of
360    // molecule!
361  22 if (matchedSeq != null && matchedSeq.isProtein() == fromDna)
362    {
363  0 if (constructMapping(seq, matchedSeq, xref, cf, fromDna))
364    {
365  0 found = true;
366    }
367    }
368    }
369   
      // fallback 2: full search of the dataset's dbrefs
370  24 if (!found)
371    {
372    // do a bit more work - search for sequences with references matching
373    // xrefs on this sequence.
374  22 found = searchDataset(fromDna, dss, xref, rseqs, cf, false,
375    DBRefUtils.SEARCH_MODE_FULL);
376    }
377  24 if (found)
378    {
379  24 refIterator.remove();
380    }
381    }
382   
383    /*
384    * fetch from source database any dbrefs we haven't resolved up to here
385    */
386  47 if (!sourceRefs.isEmpty())
387    {
388  0 retrieveCrossRef(sourceRefs, seq, xrfs, fromDna, cf);
389    }
390    }
391   
      // the result is null when no cross-referenced sequences were found;
      // the new mappings are added to the dataset only when there are
      // results and the codon frame is not empty
392  6 Alignment ral = null;
393  6 if (rseqs.size() > 0)
394    {
395  5 ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()]));
396  5 if (!cf.isEmpty())
397    {
398  2 dataset.addCodonFrame(cf);
399    }
400    }
401  6 return ral;
402    }
403   
 
404  0 toggle private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq,
405    List<DBRefEntry> xrfs, boolean fromDna, AlignedCodonFrame cf)
406    {
407  0 SequenceI[] retrieved = null;
408  0 SequenceI dss = seq.getDatasetSequence() == null ? seq
409    : seq.getDatasetSequence();
410    // first filter in case we are retrieving crossrefs that have already been
411    // retrieved. this happens for cases where a database record doesn't yield
412    // protein products for CDS
413  0 removeAlreadyRetrievedSeqs(sourceRefs, fromDna);
414  0 if (sourceRefs.size() == 0)
415    {
416    // no more work to do! We already had all requested sequence records in
417    // the dataset.
418  0 return;
419    }
420  0 try
421    {
422  0 retrieved = SequenceFetcher.getInstance().getSequences(sourceRefs, !fromDna);
423    } catch (Exception e)
424    {
425  0 jalview.bin.Console.errPrintln(
426    "Problem whilst retrieving cross references for Sequence : "
427    + seq.getName());
428  0 e.printStackTrace();
429    }
430   
431  0 if (retrieved != null)
432    {
433  0 boolean addedXref = false;
434  0 List<SequenceI> newDsSeqs = new ArrayList<>(),
435    doNotAdd = new ArrayList<>();
436   
437  0 for (SequenceI retrievedSequence : retrieved)
438    {
439    // dataset gets contaminated ccwith non-ds sequences. why ??!
440    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
441  0 SequenceI retrievedDss = retrievedSequence
442    .getDatasetSequence() == null ? retrievedSequence
443    : retrievedSequence.getDatasetSequence();
444  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
445    retrievedDss);
446    }
447    // JBPNote: What assumptions are made for dbref structures on
448    // retrieved sequences ?
449    // addedXref will be true means importCrossRefSeq found
450    // sequences with dbrefs with mappings to sequences congruent with dss
451   
452  0 if (!addedXref)
453    {
454    // try again, after looking for matching IDs
455    // shouldn't need to do this unless the dbref mechanism has broken.
456  0 updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna);
457  0 for (SequenceI retrievedSequence : retrieved)
458    {
459    // dataset gets contaminated ccwith non-ds sequences. why ??!
460    // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL->
461  0 SequenceI retrievedDss = retrievedSequence
462    .getDatasetSequence() == null ? retrievedSequence
463    : retrievedSequence.getDatasetSequence();
464  0 addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss,
465    retrievedDss);
466    }
467    }
468  0 for (SequenceI newToSeq : newDsSeqs)
469    {
470  0 if (!doNotAdd.contains(newToSeq)
471    && dataset.findIndex(newToSeq) == -1)
472    {
473  0 dataset.addSequence(newToSeq);
474  0 matcher.add(newToSeq);
475    }
476    }
477    }
478    }
479   
480    /**
481    * Search dataset for sequences with a primary reference contained in
482    * sourceRefs.
483    *
484    * @param sourceRefs
485    * - list of references to filter.
486    * @param fromDna
487    * - type of sequence to search for matching primary reference.
488    */
 
489  0 toggle private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs,
490    boolean fromDna)
491    {
492  0 List<DBRefEntry> dbrSourceSet = new ArrayList<>(sourceRefs);
493  0 List<SequenceI> dsSeqs = dataset.getSequences();
494  0 for (int ids = 0, nds = dsSeqs.size(); ids < nds; ids++)
495    {
496  0 SequenceI sq = dsSeqs.get(ids);
497  0 boolean dupeFound = false;
498    // !fromDna means we are looking only for nucleotide sequences, not
499    // protein
500  0 if (sq.isProtein() == fromDna)
501    {
502  0 List<DBRefEntry> sqdbrefs = sq.getPrimaryDBRefs();
503  0 for (int idb = 0, ndb = sqdbrefs.size(); idb < ndb; idb++)
504    {
505  0 DBRefEntry dbr = sqdbrefs.get(idb);
506  0 List<DBRefEntry> searchrefs = DBRefUtils.searchRefs(dbrSourceSet,
507    dbr, DBRefUtils.SEARCH_MODE_FULL);
508  0 for (int isr = 0, nsr = searchrefs.size(); isr < nsr; isr++)
509    {
510  0 sourceRefs.remove(searchrefs.get(isr));
511  0 dupeFound = true;
512    }
513    }
514    }
515  0 if (dupeFound)
516    {
517    // rebuild the search array from the filtered sourceRefs list
518  0 dbrSourceSet.clear();
519  0 dbrSourceSet.addAll(sourceRefs);
520    }
521    }
522    }
523   
524    /**
525    * process sequence retrieved via a dbref on source sequence to resolve and
526    * transfer data JBPNote: as of 2022-02-03 - this assumes retrievedSequence
527    * has dbRefs with Mapping references to a sequence congruent with
528    * sourceSequence
529    *
530    * @param cf
531    * @param sourceSequence
532    * @param retrievedSequence
533    * @return true if retrievedSequence was imported
534    */
 
535  0 toggle private boolean importCrossRefSeq(AlignedCodonFrame cf,
536    List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd,
537    SequenceI sourceSequence, SequenceI retrievedSequence)
538    {
      // Scans every dbref on retrievedSequence. A dbref that findInDataset
      // resolves to sourceSequence marks the retrieved sequence as imported.
      // For mapped dbrefs, the mapped-to sequence is either queued in
      // newDsSeqs (no dataset match) or consolidated with the matching
      // dataset sequence (recording the replaced sequence in doNotAdd).
539    /**
540    * set when retrievedSequence has been verified as a crossreference for
541    * sourceSequence
542    */
543  0 boolean imported = false;
544  0 List<DBRefEntry> dbr = retrievedSequence.getDBRefs();
545  0 if (dbr != null)
546    {
547  0 for (int ib = 0, nb = dbr.size(); ib < nb; ib++)
548    {
549   
550  0 DBRefEntry dbref = dbr.get(ib);
551    // matched will return null if the dbref has no map
552  0 SequenceI matched = findInDataset(dbref);
553  0 if (matched == sourceSequence)
554    {
555    // verified retrieved and source sequence cross-reference each other
556  0 imported = true;
557    }
558    // find any entry where we should put in the sequence being
559    // cross-referenced into the map
560  0 Mapping map = dbref.getMap();
561  0 if (map != null)
562    {
563  0 SequenceI ms = map.getTo();
564  0 if (ms != null && map.getMap() != null)
565    {
566  0 if (ms == sourceSequence)
567    {
568    // already called to import once, and most likely this sequence
569    // already imported !
570  0 continue;
571    }
572  0 if (matched == null)
573    {
574    /*
575    * sequence is new to dataset, so save a reference so it can be added.
576    */
577  0 newDsSeqs.add(ms);
578  0 continue;
579    }
580   
581    /*
582    * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one.
583    */
584   
585  0 try
586    {
587    // compare ms with dss and replace with dss in mapping
588    // if map is congruent
589    // TODO findInDataset requires exact sequence match but
590    // 'congruent' test is only for the mapped part
591    // maybe not a problem in practice since only ENA provide a
592    // mapping and it is to the full protein translation of CDS
593    // matcher.findIdMatch(map.getTo());
594    // TODO addendum: if matched is shorter than getTo, this will fail
595    // - when it should really succeed.
596  0 int sf = map.getMap().getToLowest();
597  0 int st = map.getMap().getToHighest();
      // the mapped sub-range is used only for the non-empty check; the
      // equality test below compares the two complete sequence strings
598  0 SequenceI mappedrg = ms.getSubSequence(sf, st);
599  0 if (mappedrg.getLength() > 0 && ms.getSequenceAsString()
600    .equals(matched.getSequenceAsString()))
601    {
602    /*
603    * sequences were a match,
604    */
605  0 String msg = "Mapping updated from " + ms.getName()
606    + " to retrieved crossreference "
607    + matched.getName();
608  0 jalview.bin.Console.outPrintln(msg);
609   
610  0 List<DBRefEntry> toRefs = map.getTo().getDBRefs();
611  0 if (toRefs != null)
612    {
613    /*
614    * transfer database refs
615    */
616  0 for (DBRefEntry ref : toRefs)
617    {
618  0 if (dbref.getSrcAccString()
619    .equals(ref.getSrcAccString()))
620    {
621  0 continue; // avoid overwriting the ref on source sequence
622    }
623  0 matched.addDBRef(ref); // add or update mapping
624    }
625    }
      // record the replaced mapped-to sequence in doNotAdd, then
      // re-point the mapping at the dataset's own sequence
626  0 doNotAdd.add(map.getTo());
627  0 map.setTo(matched);
628   
629    /*
630    * give the reverse reference the inverse mapping
631    * (if it doesn't have one already)
632    */
633  0 setReverseMapping(matched, dbref, cf);
634   
635    /*
636    * copy sequence features as well, avoiding
637    * duplication (e.g. same variation from two
638    * transcripts)
639    */
640  0 List<SequenceFeature> sfs = ms.getFeatures()
641    .getAllFeatures();
642  0 for (SequenceFeature feat : sfs)
643    {
644    /*
645    * make a flyweight feature object which ignores Parent
646    * attribute in equality test; this avoids creating many
647    * otherwise duplicate exon features on genomic sequence
648    */
649  0 SequenceFeature newFeature = new SequenceFeature(feat)
650    {
 
651  0 toggle @Override
652    public boolean equals(Object o)
653    {
654  0 return super.equals(o, true);
655    }
656    };
657  0 matched.addSequenceFeature(newFeature);
658    }
659    }
      // this call sits outside the sequence-equality check above, so the
      // mapping is added whether or not map.getTo() was replaced
660  0 cf.addMap(retrievedSequence, map.getTo(), map.getMap());
661    } catch (Exception e)
662    {
663  0 jalview.bin.Console.errPrintln(
664    "Exception when consolidating Mapped sequence set...");
665  0 e.printStackTrace(System.err);
666    }
667    }
668    }
669    }
670    }
      // an imported sequence is added to the results, and to the dataset
      // and id matcher when the dataset does not already contain it
671  0 if (imported)
672    {
673  0 retrievedSequence.updatePDBIds();
674  0 rseqs.add(retrievedSequence);
675  0 if (dataset.findIndex(retrievedSequence) == -1)
676    {
677  0 dataset.addSequence(retrievedSequence);
678  0 matcher.add(retrievedSequence);
679    }
680    }
681  0 return imported;
682    }
683   
684    /**
685    * Sets the inverse sequence mapping in the corresponding dbref of the mapped
686    * to sequence (if any). This is used after fetching a cross-referenced
687    * sequence, if the fetched sequence has a mapping to the original sequence,
688    * to set the mapping in the original sequence's dbref.
689    *
690    * @param mapFrom
691    * the sequence mapped from
692    * @param dbref
693    * @param mappings
694    */
 
695  0 toggle void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref,
696    AlignedCodonFrame mappings)
697    {
698  0 SequenceI mapTo = dbref.getMap().getTo();
699  0 if (mapTo == null)
700    {
701  0 return;
702    }
703  0 List<DBRefEntry> dbrefs = mapTo.getDBRefs();
704  0 if (dbrefs == null)
705    {
706  0 return;
707    }
708  0 for (DBRefEntry toRef : dbrefs)
709    {
710  0 if (toRef.hasMap() && mapFrom == toRef.getMap().getTo())
711    {
712    /*
713    * found the reverse dbref; update its mapping if null
714    */
715  0 if (toRef.getMap().getMap() == null)
716    {
717  0 MapList inverse = dbref.getMap().getMap().getInverse();
718  0 toRef.getMap().setMap(inverse);
719  0 mappings.addMap(mapTo, mapFrom, inverse);
720    }
721    }
722    }
723    }
724   
725    /**
726    * Returns null or the first sequence in the dataset which is identical to
727    * xref.mapTo, and has a) a primary dbref matching xref, or if none found, the
728    * first one with an ID source|xrefacc JBPNote: Could refactor this to
729    * AlignmentI/DatasetI
730    *
731    * @param xref
732    * with map and mapped-to sequence
733    * @return
734    */
 
735  24 toggle SequenceI findInDataset(DBRefEntry xref)
736    {
737  24 if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null)
738    {
739  0 return null;
740    }
741  24 SequenceI mapsTo = xref.getMap().getTo();
742  24 String name = xref.getAccessionId();
743  24 String name2 = xref.getSource() + "|" + name;
744  24 SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo
745    : mapsTo.getDatasetSequence();
746    // first check ds if ds is directly referenced
747  24 if (dataset.findIndex(dss) > -1)
748    {
749  22 return dss;
750    }
751  2 DBRefEntry template = new DBRefEntry(xref.getSource(), null,
752    xref.getAccessionId());
753    /**
754    * remember the first ID match - in case we don't find a match to template
755    */
756  2 SequenceI firstIdMatch = null;
757  2 for (SequenceI seq : dataset.getSequences())
758    {
759    // first check primary refs.
760  2 List<DBRefEntry> match = DBRefUtils.searchRefs(seq.getPrimaryDBRefs(),
761    template, DBRefUtils.SEARCH_MODE_FULL);
762  2 if (match != null && match.size() == 1 && sameSequence(seq, dss))
763    {
764  0 return seq;
765    }
766    /*
767    * clumsy alternative to using SequenceIdMatcher which currently
768    * returns sequences with a dbref to the matched accession id
769    * which we don't want
770    */
771  2 if (firstIdMatch == null && (name.equals(seq.getName())
772    || seq.getName().startsWith(name2)))
773    {
774  0 if (sameSequence(seq, dss))
775    {
776  0 firstIdMatch = seq;
777    }
778    }
779    }
780  2 return firstIdMatch;
781    }
782   
783    /**
784    * Answers true if seq1 and seq2 contain exactly the same characters (ignoring
785    * case), else false. This method compares the lengths, then each character in
786    * turn, in order to 'fail fast'. For case-sensitive comparison, it would be
787    * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()).
788    *
789    * @param seq1
790    * @param seq2
791    * @return
792    */
793    // TODO move to Sequence / SequenceI
 
794  7 toggle static boolean sameSequence(SequenceI seq1, SequenceI seq2)
795    {
796  7 if (seq1 == seq2)
797    {
798  1 return true;
799    }
800  6 if (seq1 == null || seq2 == null)
801    {
802  2 return false;
803    }
804   
805  4 if (seq1.getLength() != seq2.getLength())
806    {
807  2 return false;
808    }
809  2 int length = seq1.getLength();
810  14 for (int i = 0; i < length; i++)
811    {
812  12 int diff = seq1.getCharAt(i) - seq2.getCharAt(i);
813    /*
814    * same char or differ in case only ('a'-'A' == 32)
815    */
816  12 if (diff != 0 && diff != 32 && diff != -32)
817    {
818  0 return false;
819    }
820    }
821  2 return true;
822    }
823   
824    /**
825    * Updates any empty mappings in the cross-references with one to a compatible
826    * retrieved sequence if found, and adds any new mappings to the
827    * AlignedCodonFrame JBPNote: TODO: this relies on sequence IDs like
828    * UNIPROT|ACCESSION - which do not always happen.
829    *
830    * @param mapFrom
831    * @param xrefs
832    * @param retrieved
833    * @param acf
834    */
 
835  0 toggle void updateDbrefMappings(SequenceI mapFrom, List<DBRefEntry> xrefs,
836    SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna)
837    {
838  0 SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved);
839  0 for (DBRefEntry xref : xrefs)
840    {
841  0 if (!xref.hasMap())
842    {
843  0 String targetSeqName = xref.getSource() + "|"
844    + xref.getAccessionId();
845  0 SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName);
846  0 if (matches == null)
847    {
848  0 return;
849    }
850  0 for (SequenceI seq : matches)
851    {
852  0 constructMapping(mapFrom, seq, xref, acf, fromDna);
853    }
854    }
855    }
856    }
857   
858    /**
859    * Tries to make a mapping between sequences. If successful, adds the mapping
860    * to the dbref and the mappings collection and answers true, otherwise
861    * answers false. The following methods of making a mapping are tried in
862    * turn:
863    * <ul>
864    * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for
865    * example, the case after fetching EMBL cross-references for a Uniprot
866    * sequence</li>
867    * <li>else check if the dna translates exactly to the protein (give or take
868    * start and stop codons)</li>
869    * <li>else try to map based on CDS features on the dna sequence</li>
870    * </ul>
871    *
872    * @param mapFrom
873    * @param mapTo
874    * @param xref
875    * @param mappings
876    * @return
877    */
 
878  0 toggle boolean constructMapping(SequenceI mapFrom, SequenceI mapTo,
879    DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna)
880    {
881  0 MapList mapping = null;
882  0 SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom
883    : mapFrom.getDatasetSequence();
884  0 SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo
885    : mapTo.getDatasetSequence();
886    /*
887    * look for a reverse mapping, if found make its inverse.
888    * Note - we do this on dataset sequences only.
889    */
890  0 if (dsmapTo.getDBRefs() != null)
891    {
892  0 for (DBRefEntry dbref : dsmapTo.getDBRefs())
893    {
894  0 String name = dbref.getSource() + "|" + dbref.getAccessionId();
895  0 if (dbref.hasMap() && dsmapFrom.getName().startsWith(name))
896    {
897    /*
898    * looks like we've found a map from 'mapTo' to 'mapFrom'
899    * - invert it to make the mapping the other way
900    */
901  0 MapList reverse = dbref.getMap().getMap().getInverse();
902  0 xref.setMap(new Mapping(dsmapTo, reverse));
903  0 mappings.addMap(mapFrom, dsmapTo, reverse);
904  0 return true;
905    }
906    }
907    }
908   
909  0 if (fromDna)
910    {
911  0 mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom);
912    }
913    else
914    {
915  0 mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo);
916  0 if (mapping != null)
917    {
918  0 mapping = mapping.getInverse();
919    }
920    }
921  0 if (mapping == null)
922    {
923  0 return false;
924    }
925  0 xref.setMap(new Mapping(mapTo, mapping));
926   
927    /*
928    * and add a reverse DbRef with the inverse mapping
929    */
930  0 if (mapFrom.getDatasetSequence() != null && false)
931    // && mapFrom.getDatasetSequence().getSourceDBRef() != null)
932    {
933    // possible need to search primary references... except, why doesn't xref
934    // == getSourceDBRef ??
935    // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence()
936    // .getSourceDBRef());
937    // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping
938    // .getInverse()));
939    // mapTo.addDBRef(dbref);
940    }
941   
942  0 if (fromDna)
943    {
944    // AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping);
945  0 mappings.addMap(mapFrom, mapTo, mapping);
946    }
947    else
948    {
949  0 mappings.addMap(mapTo, mapFrom, mapping.getInverse());
950    }
951   
952  0 return true;
953    }
954   
955    /**
956    * find references to lrfs in the cross-reference set of each sequence in
957    * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry
958    * based on source and accession string only - Map and Version are nulled.
959    *
960    * @param fromDna
961    * - true if context was searching from Dna sequences, false if
962    * context was searching from Protein sequences
963    * @param sequenceI
964    * @param lrfs
965    * @param foundSeqs
966    * @return true if matches were found.
967    */
 
968  7039 toggle private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI,
969    List<DBRefEntry> lrfs, List<SequenceI> foundSeqs,
970    AlignedCodonFrame cf)
971    {
972  7039 boolean found = false;
973  7039 if (lrfs == null)
974    {
975  5269 return false;
976    }
977  3779 for (int i = 0, n = lrfs.size(); i < n; i++)
978    {
979    // DBRefEntry xref = new DBRefEntry(lrfs.get(i));
980    // // add in wildcards
981    // xref.setVersion(null);
982    // xref.setMap(null);
983  2009 found |= searchDataset(fromDna, sequenceI, lrfs.get(i), foundSeqs, cf,
984    false, DBRefUtils.SEARCH_MODE_NO_MAP_NO_VERSION);
985    }
986  1770 return found;
987    }
988   
989    /**
990    * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the
991    * associated sequence to rseqs
992    *
993    * @param fromDna
994    * true if context was searching for refs *from* dna sequence, false
995    * if context was searching for refs *from* protein sequence
996    * @param fromSeq
997    * a sequence to ignore (start point of search)
998    * @param xrf
999    * a cross-reference to try to match
1000    * @param foundSeqs
1001    * result list to add to
1002    * @param mappings
1003    * a set of sequence mappings to add to
1004    * @param direct
1005    * - indicates the type of relationship between returned sequences,
1006    * xrf, and sequenceI that is required.
1007    * <ul>
1008    * <li>direct implies xrf is a primary reference for sequenceI AND
1009    * the sequences to be located (eg a uniprot ID for a protein
1010    * sequence, and a uniprot ref on a transcript sequence).</li>
1011    * <li>indirect means xrf is a cross reference with respect to
1012    * sequenceI or all the returned sequences (eg a genomic reference
1013    * associated with a locus and one or more transcripts)</li>
1014    * </ul>
1015    * @param mode
1016    * SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional
1017    * @return true if relationship found and sequence added.
1018    */
 
1019  2034 toggle boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf,
1020    List<SequenceI> foundSeqs, AlignedCodonFrame mappings,
1021    boolean direct, int mode)
1022    {
1023  2034 boolean found = false;
1024  2034 if (dataset == null)
1025    {
1026  0 return false;
1027    }
1028  2034 if (dataset.getSequences() == null)
1029    {
1030  0 jalview.bin.Console
1031    .errPrintln("Empty dataset sequence set - NO VECTOR");
1032  0 return false;
1033    }
1034  2034 List<SequenceI> ds = dataset.getSequences();
1035  2034 synchronized (ds)
1036    {
1037  2034 for (SequenceI nxt : ds)
1038    {
1039  287236 if (nxt != null)
1040    {
1041  287236 if (nxt.getDatasetSequence() != null)
1042    {
1043  0 jalview.bin.Console.errPrintln(
1044    "Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! ("
1045    + nxt.getDisplayId(true) + " has ds reference "
1046    + nxt.getDatasetSequence().getDisplayId(true)
1047    + ")");
1048    }
1049  287236 if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence())
1050    {
1051  2031 continue;
1052    }
1053    /*
1054    * only look at same molecule type if 'direct', or
1055    * complementary type if !direct
1056    */
1057    {
1058  285205 boolean isDna = !nxt.isProtein();
1059  285205 if (direct ? (isDna != fromDna) : (isDna == fromDna))
1060    {
1061    // skip this sequence because it is wrong molecule type
1062  280162 continue;
1063    }
1064    }
1065   
1066    // look for direct or indirect references in common
1067  5043 List<DBRefEntry> poss = nxt.getDBRefs();
1068  5043 List<DBRefEntry> cands = null;
1069   
1070    // todo: indirect specifies we select either direct references to nxt
1071    // that match xrf which is indirect to sequenceI, or indirect
1072    // references to nxt that match xrf which is direct to sequenceI
1073  5043 cands = DBRefUtils.searchRefs(poss, xrf, mode);
1074    // else
1075    // {
1076    // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss);
1077    // cands = DBRefUtils.searchRefs(poss, xrf);
1078    // }
1079  5043 if (!cands.isEmpty())
1080    {
1081  880 if (foundSeqs.contains(nxt))
1082    {
1083  582 continue;
1084    }
1085  298 found = true;
1086  298 foundSeqs.add(nxt);
1087  298 if (mappings != null && !direct)
1088    {
1089    /*
1090    * if the matched sequence has mapped dbrefs to
1091    * protein product / cdna, add equivalent mappings to
1092    * our source sequence
1093    */
1094  26 for (DBRefEntry candidate : cands)
1095    {
1096  26 Mapping mapping = candidate.getMap();
1097  26 if (mapping != null)
1098    {
1099  1 MapList map = mapping.getMap();
1100  1 if (mapping.getTo() != null
1101    && map.getFromRatio() != map.getToRatio())
1102    {
1103    /*
1104    * add a mapping, as from dna to peptide sequence
1105    */
1106  1 if (map.getFromRatio() == 3)
1107    {
1108  1 mappings.addMap(nxt, fromSeq, map);
1109    }
1110    else
1111    {
1112  0 mappings.addMap(nxt, fromSeq, map.getInverse());
1113    }
1114    }
1115    }
1116    }
1117    }
1118    }
1119    }
1120    }
1121    }
1122  2034 return found;
1123    }
1124    }