Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
CrossRef | 47 | 288 | 137 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import jalview.datamodel.AlignedCodonFrame; | |
24 | import jalview.datamodel.Alignment; | |
25 | import jalview.datamodel.AlignmentI; | |
26 | import jalview.datamodel.DBRefEntry; | |
27 | import jalview.datamodel.DBRefSource; | |
28 | import jalview.datamodel.Mapping; | |
29 | import jalview.datamodel.Sequence; | |
30 | import jalview.datamodel.SequenceFeature; | |
31 | import jalview.datamodel.SequenceI; | |
32 | import jalview.util.DBRefUtils; | |
33 | import jalview.util.MapList; | |
34 | import jalview.ws.SequenceFetcherFactory; | |
35 | import jalview.ws.seqfetcher.ASequenceFetcher; | |
36 | ||
37 | import java.util.ArrayList; | |
38 | import java.util.Iterator; | |
39 | import java.util.List; | |
40 | ||
41 | /** | |
42 | * Functions for cross-referencing sequence databases. | |
43 | * | |
44 | * @author JimP | |
45 | * | |
46 | */ | |
47 | public class CrossRef | |
48 | { | |
49 | /* | |
50 | * the dataset of the alignment for which we are searching for | |
51 | * cross-references; in some cases we may resolve xrefs by | |
52 | * searching in the dataset | |
53 | */ | |
54 | private AlignmentI dataset; | |
55 | ||
56 | /* | |
57 | * the sequences for which we are seeking cross-references | |
58 | */ | |
59 | private SequenceI[] fromSeqs; | |
60 | ||
61 | /** | |
62 | * matcher built from dataset | |
63 | */ | |
64 | SequenceIdMatcher matcher; | |
65 | ||
66 | /** | |
67 | * sequences found by cross-ref searches to fromSeqs | |
68 | */ | |
69 | List<SequenceI> rseqs; | |
70 | ||
71 | /** | |
72 | * Constructor | |
73 | * | |
74 | * @param seqs | |
75 | * the sequences for which we are seeking cross-references | |
76 | * @param ds | |
77 | * the containing alignment dataset (may be searched to resolve | |
78 | * cross-references) | |
79 | */ | |
80 | 683 | public CrossRef(SequenceI[] seqs, AlignmentI ds) |
81 | { | |
82 | 683 | fromSeqs = seqs; |
83 | 683 | dataset = ds.getDataset() == null ? ds : ds.getDataset(); |
84 | } | |
85 | ||
86 | /** | |
87 | * Returns a list of distinct database sources for which sequences have either | |
88 | * <ul> | |
89 | * <li>a (dna-to-protein or protein-to-dna) cross-reference</li> | |
90 | * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna) | |
91 | * reference from another sequence in the dataset which has a cross-reference | |
92 | * to a direct DBRefEntry on the given sequence</li> | |
93 | * </ul> | |
94 | * | |
95 | * @param dna | |
96 | * - when true, cross-references *from* dna returned. When false, | |
97 | * cross-references *from* protein are returned | |
98 | * @return | |
99 | */ | |
100 | 676 | public List<String> findXrefSourcesForSequences(boolean dna) |
101 | { | |
102 | 676 | List<String> sources = new ArrayList<>(); |
103 | 676 | for (SequenceI seq : fromSeqs) |
104 | { | |
105 | 6699 | if (seq != null) |
106 | { | |
107 | 6699 | findXrefSourcesForSequence(seq, dna, sources); |
108 | } | |
109 | } | |
110 | 676 | sources.remove(DBRefSource.EMBL); // hack to prevent EMBL xrefs resulting in |
111 | // redundant datasets | |
112 | 676 | if (dna) |
113 | { | |
114 | 91 | sources.remove(DBRefSource.ENSEMBL); // hack to prevent Ensembl and |
115 | // EnsemblGenomes xref option shown | |
116 | // from cdna panel | |
117 | 91 | sources.remove(DBRefSource.ENSEMBLGENOMES); |
118 | } | |
119 | // redundant datasets | |
120 | 676 | return sources; |
121 | } | |
122 | ||
123 | /** | |
124 | * Returns a list of distinct database sources for which a sequence has either | |
125 | * <ul> | |
126 | * <li>a (dna-to-protein or protein-to-dna) cross-reference</li> | |
127 | * <li>an indirect cross-reference - a (dna-to-protein or protein-to-dna) | |
128 | * reference from another sequence in the dataset which has a cross-reference | |
129 | * to a direct DBRefEntry on the given sequence</li> | |
130 | * </ul> | |
131 | * | |
132 | * @param seq | |
133 | * the sequence whose dbrefs we are searching against | |
134 | * @param fromDna | |
135 | * when true, context is DNA - so sources identifying protein | |
136 | * products will be returned. | |
137 | * @param sources | |
138 | * a list of sources to add matches to | |
139 | */ | |
140 | 6699 | void findXrefSourcesForSequence(SequenceI seq, boolean fromDna, |
141 | List<String> sources) | |
142 | { | |
143 | /* | |
144 | * first find seq's xrefs (dna-to-peptide or peptide-to-dna) | |
145 | */ | |
146 | 6699 | List<DBRefEntry> rfs = DBRefUtils.selectDbRefs(!fromDna, |
147 | seq.getDBRefs()); | |
148 | 6699 | addXrefsToSources(rfs, sources); |
149 | 6699 | if (dataset != null) |
150 | { | |
151 | /* | |
152 | * find sequence's direct (dna-to-dna, peptide-to-peptide) xrefs | |
153 | */ | |
154 | 6699 | List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna, |
155 | seq.getDBRefs()); | |
156 | 6699 | List<SequenceI> foundSeqs = new ArrayList<>(); |
157 | ||
158 | /* | |
159 | * find sequences in the alignment which xref one of these DBRefs | |
160 | * i.e. is xref-ed to a common sequence identifier | |
161 | */ | |
162 | 6699 | searchDatasetXrefs(fromDna, seq, lrfs, foundSeqs, null); |
163 | ||
164 | /* | |
165 | * add those sequences' (dna-to-peptide or peptide-to-dna) dbref sources | |
166 | */ | |
167 | 6699 | for (SequenceI rs : foundSeqs) |
168 | { | |
169 | 272 | List<DBRefEntry> xrs = DBRefUtils.selectDbRefs(!fromDna, |
170 | rs.getDBRefs()); | |
171 | 272 | addXrefsToSources(xrs, sources); |
172 | } | |
173 | } | |
174 | } | |
175 | ||
176 | /** | |
177 | * Helper method that adds the source identifiers of some cross-references to | |
178 | * a (non-redundant) list of database sources | |
179 | * | |
180 | * @param xrefs | |
181 | * @param sources | |
182 | */ | |
183 | 6971 | void addXrefsToSources(List<DBRefEntry> xrefs, List<String> sources) |
184 | { | |
185 | 6971 | if (xrefs != null) |
186 | { | |
187 | 364 | for (DBRefEntry ref : xrefs) |
188 | { | |
189 | /* | |
190 | * avoid duplication e.g. ENSEMBL and Ensembl | |
191 | */ | |
192 | 1358 | String source = DBRefUtils.getCanonicalName(ref.getSource()); |
193 | 1358 | if (!sources.contains(source)) |
194 | { | |
195 | 21 | sources.add(source); |
196 | } | |
197 | } | |
198 | } | |
199 | } | |
200 | ||
201 | /** | |
202 | * Attempts to find cross-references from the sequences provided in the | |
203 | * constructor to the given source database. Cross-references may be found | |
204 | * <ul> | |
205 | * <li>in dbrefs on the sequence which hold a mapping to a sequence | |
206 | * <ul> | |
207 | * <li>provided with a fetched sequence (e.g. ENA translation), or</li> | |
208 | * <li>populated previously after getting cross-references</li> | |
209 | * </ul> | |
210 | * <li>as other sequences in the alignment which share a dbref identifier with | |
211 | * the sequence</li> | |
212 | * <li>by fetching from the remote database</li> | |
213 | * </ul> | |
214 | * The cross-referenced sequences, and mappings to them, are added to the | |
215 | * alignment dataset. | |
216 | * | |
217 | * @param source | |
218 | * @return cross-referenced sequences (as dataset sequences) | |
219 | */ | |
220 | 6 | public Alignment findXrefSequences(String source, boolean fromDna) |
221 | { | |
222 | ||
223 | 6 | rseqs = new ArrayList<>(); |
224 | 6 | AlignedCodonFrame cf = new AlignedCodonFrame(); |
225 | 6 | matcher = new SequenceIdMatcher(dataset.getSequences()); |
226 | ||
227 | 6 | for (SequenceI seq : fromSeqs) |
228 | { | |
229 | 48 | SequenceI dss = seq; |
230 | 93 | while (dss.getDatasetSequence() != null) |
231 | { | |
232 | 45 | dss = dss.getDatasetSequence(); |
233 | } | |
234 | 48 | boolean found = false; |
235 | 48 | List<DBRefEntry> xrfs = DBRefUtils.selectDbRefs(!fromDna, |
236 | dss.getDBRefs()); | |
237 | // ENST & ENSP comes in to both Protein and nucleotide, so we need to | |
238 | // filter them | |
239 | // out later. | |
240 | 48 | if ((xrfs == null || xrfs.size() == 0) && dataset != null) |
241 | { | |
242 | /* | |
243 | * found no suitable dbrefs on sequence - look for sequences in the | |
244 | * alignment which share a dbref with this one | |
245 | */ | |
246 | 3 | List<DBRefEntry> lrfs = DBRefUtils.selectDbRefs(fromDna, |
247 | seq.getDBRefs()); | |
248 | ||
249 | /* | |
250 | * find sequences (except this one!), of complementary type, | |
251 | * which have a dbref to an accession id for this sequence, | |
252 | * and add them to the results | |
253 | */ | |
254 | 3 | found = searchDatasetXrefs(fromDna, dss, lrfs, rseqs, cf); |
255 | } | |
256 | 48 | if (xrfs == null && !found) |
257 | { | |
258 | /* | |
259 | * no dbref to source on this sequence or matched | |
260 | * complementary sequence in the dataset | |
261 | */ | |
262 | 1 | continue; |
263 | } | |
264 | 47 | List<DBRefEntry> sourceRefs = DBRefUtils.searchRefsForSource(xrfs, |
265 | source); | |
266 | 47 | Iterator<DBRefEntry> refIterator = sourceRefs.iterator(); |
267 | // At this point, if we are retrieving Ensembl, we still don't filter out | |
268 | // ENST when looking for protein crossrefs. | |
269 | 93 | while (refIterator.hasNext()) |
270 | { | |
271 | 46 | DBRefEntry xref = refIterator.next(); |
272 | 46 | found = false; |
273 | // we're only interested in coding cross-references, not | |
274 | // locus->transcript | |
275 | 46 | if (xref.hasMap() && xref.getMap().getMap().isTripletMap()) |
276 | { | |
277 | 24 | SequenceI mappedTo = xref.getMap().getTo(); |
278 | 24 | if (mappedTo != null) |
279 | { | |
280 | /* | |
281 | * dbref contains the sequence it maps to; add it to the | |
282 | * results unless we have done so already (could happen if | |
283 | * fetching xrefs for sequences which have xrefs in common) | |
284 | * for example: UNIPROT {P0CE19, P0CE20} -> EMBL {J03321, X06707} | |
285 | */ | |
286 | 24 | found = true; |
287 | /* | |
288 | * problem: matcher.findIdMatch() is lenient - returns a sequence | |
289 | * with a dbref to the search arg e.g. ENST for ENSP - wrong | |
290 | * but findInDataset() matches ENSP when looking for Uniprot... | |
291 | */ | |
292 | 24 | SequenceI matchInDataset = findInDataset(xref); |
293 | 24 | if (matchInDataset != null && xref.getMap().getTo() != null |
294 | && matchInDataset != xref.getMap().getTo()) | |
295 | { | |
296 | 0 | jalview.bin.Console.errPrintln( |
297 | "Implementation problem (reopen JAL-2154): CrossRef.findInDataset seems to have recovered a different sequence than the one explicitly mapped for xref." | |
298 | + "Found:" + matchInDataset + "\nExpected:" | |
299 | + xref.getMap().getTo() + "\nFor xref:" | |
300 | + xref); | |
301 | } | |
302 | /*matcher.findIdMatch(mappedTo);*/ | |
303 | 24 | if (matchInDataset != null) |
304 | { | |
305 | 22 | if (!rseqs.contains(matchInDataset)) |
306 | { | |
307 | 0 | rseqs.add(matchInDataset); |
308 | } | |
309 | // even if rseqs contained matchInDataset - check mappings between | |
310 | // these seqs are added | |
311 | // need to try harder to only add unique mappings | |
312 | 22 | if (xref.getMap().getMap().isTripletMap() |
313 | && dataset.getMapping(seq, matchInDataset) == null | |
314 | && cf.getMappingBetween(seq, matchInDataset) == null) | |
315 | { | |
316 | // materialise a mapping for highlighting between these | |
317 | // sequences | |
318 | 11 | if (fromDna) |
319 | { | |
320 | 11 | cf.addMap(dss, matchInDataset, xref.getMap().getMap(), |
321 | xref.getMap().getMappedFromId()); | |
322 | } | |
323 | else | |
324 | { | |
325 | 0 | cf.addMap(matchInDataset, dss, |
326 | xref.getMap().getMap().getInverse(), | |
327 | xref.getMap().getMappedFromId()); | |
328 | } | |
329 | } | |
330 | ||
331 | 22 | refIterator.remove(); |
332 | 22 | continue; |
333 | } | |
334 | // TODO: need to determine if this should be a deriveSequence | |
335 | 2 | SequenceI rsq = new Sequence(mappedTo); |
336 | 2 | rseqs.add(rsq); |
337 | 2 | if (xref.getMap().getMap().isTripletMap()) |
338 | { | |
339 | // get sense of map correct for adding to product alignment. | |
340 | 2 | if (fromDna) |
341 | { | |
342 | // map is from dna seq to a protein product | |
343 | 2 | cf.addMap(dss, rsq, xref.getMap().getMap(), |
344 | xref.getMap().getMappedFromId()); | |
345 | } | |
346 | else | |
347 | { | |
348 | // map should be from protein seq to its coding dna | |
349 | 0 | cf.addMap(rsq, dss, xref.getMap().getMap().getInverse(), |
350 | xref.getMap().getMappedFromId()); | |
351 | } | |
352 | } | |
353 | } | |
354 | } | |
355 | ||
356 | 24 | if (!found) |
357 | { | |
358 | 22 | SequenceI matchedSeq = matcher.findIdMatch( |
359 | xref.getSource() + "|" + xref.getAccessionId()); | |
360 | // if there was a match, check it's at least the right type of | |
361 | // molecule! | |
362 | 22 | if (matchedSeq != null && matchedSeq.isProtein() == fromDna) |
363 | { | |
364 | 0 | if (constructMapping(seq, matchedSeq, xref, cf, fromDna)) |
365 | { | |
366 | 0 | found = true; |
367 | } | |
368 | } | |
369 | } | |
370 | ||
371 | 24 | if (!found) |
372 | { | |
373 | // do a bit more work - search for sequences with references matching | |
374 | // xrefs on this sequence. | |
375 | 22 | found = searchDataset(fromDna, dss, xref, rseqs, cf, false, |
376 | DBRefUtils.SEARCH_MODE_FULL); | |
377 | } | |
378 | 24 | if (found) |
379 | { | |
380 | 24 | refIterator.remove(); |
381 | } | |
382 | } | |
383 | ||
384 | /* | |
385 | * fetch from source database any dbrefs we haven't resolved up to here | |
386 | */ | |
387 | 47 | if (!sourceRefs.isEmpty()) |
388 | { | |
389 | 0 | retrieveCrossRef(sourceRefs, seq, xrfs, fromDna, cf); |
390 | } | |
391 | } | |
392 | ||
393 | 6 | Alignment ral = null; |
394 | 6 | if (rseqs.size() > 0) |
395 | { | |
396 | 5 | ral = new Alignment(rseqs.toArray(new SequenceI[rseqs.size()])); |
397 | 5 | if (!cf.isEmpty()) |
398 | { | |
399 | 2 | dataset.addCodonFrame(cf); |
400 | } | |
401 | } | |
402 | 6 | return ral; |
403 | } | |
404 | ||
405 | 0 | private void retrieveCrossRef(List<DBRefEntry> sourceRefs, SequenceI seq, |
406 | List<DBRefEntry> xrfs, boolean fromDna, AlignedCodonFrame cf) | |
407 | { | |
408 | 0 | ASequenceFetcher sftch = SequenceFetcherFactory.getSequenceFetcher(); |
409 | 0 | SequenceI[] retrieved = null; |
410 | 0 | SequenceI dss = seq.getDatasetSequence() == null ? seq |
411 | : seq.getDatasetSequence(); | |
412 | // first filter in case we are retrieving crossrefs that have already been | |
413 | // retrieved. this happens for cases where a database record doesn't yield | |
414 | // protein products for CDS | |
415 | 0 | removeAlreadyRetrievedSeqs(sourceRefs, fromDna); |
416 | 0 | if (sourceRefs.size() == 0) |
417 | { | |
418 | // no more work to do! We already had all requested sequence records in | |
419 | // the dataset. | |
420 | 0 | return; |
421 | } | |
422 | 0 | try |
423 | { | |
424 | 0 | retrieved = sftch.getSequences(sourceRefs, !fromDna); |
425 | } catch (Exception e) | |
426 | { | |
427 | 0 | jalview.bin.Console.errPrintln( |
428 | "Problem whilst retrieving cross references for Sequence : " | |
429 | + seq.getName()); | |
430 | 0 | e.printStackTrace(); |
431 | } | |
432 | ||
433 | 0 | if (retrieved != null) |
434 | { | |
435 | 0 | boolean addedXref = false; |
436 | 0 | List<SequenceI> newDsSeqs = new ArrayList<>(), |
437 | doNotAdd = new ArrayList<>(); | |
438 | ||
439 | 0 | for (SequenceI retrievedSequence : retrieved) |
440 | { | |
441 | // dataset gets contaminated ccwith non-ds sequences. why ??! | |
442 | // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL-> | |
443 | 0 | SequenceI retrievedDss = retrievedSequence |
444 | .getDatasetSequence() == null ? retrievedSequence | |
445 | : retrievedSequence.getDatasetSequence(); | |
446 | 0 | addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss, |
447 | retrievedDss); | |
448 | } | |
449 | // JBPNote: What assumptions are made for dbref structures on | |
450 | // retrieved sequences ? | |
451 | // addedXref will be true means importCrossRefSeq found | |
452 | // sequences with dbrefs with mappings to sequences congruent with dss | |
453 | ||
454 | 0 | if (!addedXref) |
455 | { | |
456 | // try again, after looking for matching IDs | |
457 | // shouldn't need to do this unless the dbref mechanism has broken. | |
458 | 0 | updateDbrefMappings(seq, xrfs, retrieved, cf, fromDna); |
459 | 0 | for (SequenceI retrievedSequence : retrieved) |
460 | { | |
461 | // dataset gets contaminated ccwith non-ds sequences. why ??! | |
462 | // try: Ensembl -> Nuc->Ensembl, Nuc->Uniprot-->Protein->EMBL-> | |
463 | 0 | SequenceI retrievedDss = retrievedSequence |
464 | .getDatasetSequence() == null ? retrievedSequence | |
465 | : retrievedSequence.getDatasetSequence(); | |
466 | 0 | addedXref |= importCrossRefSeq(cf, newDsSeqs, doNotAdd, dss, |
467 | retrievedDss); | |
468 | } | |
469 | } | |
470 | 0 | for (SequenceI newToSeq : newDsSeqs) |
471 | { | |
472 | 0 | if (!doNotAdd.contains(newToSeq) |
473 | && dataset.findIndex(newToSeq) == -1) | |
474 | { | |
475 | 0 | dataset.addSequence(newToSeq); |
476 | 0 | matcher.add(newToSeq); |
477 | } | |
478 | } | |
479 | } | |
480 | } | |
481 | ||
482 | /** | |
483 | * Search dataset for sequences with a primary reference contained in | |
484 | * sourceRefs. | |
485 | * | |
486 | * @param sourceRefs | |
487 | * - list of references to filter. | |
488 | * @param fromDna | |
489 | * - type of sequence to search for matching primary reference. | |
490 | */ | |
491 | 0 | private void removeAlreadyRetrievedSeqs(List<DBRefEntry> sourceRefs, |
492 | boolean fromDna) | |
493 | { | |
494 | 0 | List<DBRefEntry> dbrSourceSet = new ArrayList<>(sourceRefs); |
495 | 0 | List<SequenceI> dsSeqs = dataset.getSequences(); |
496 | 0 | for (int ids = 0, nds = dsSeqs.size(); ids < nds; ids++) |
497 | { | |
498 | 0 | SequenceI sq = dsSeqs.get(ids); |
499 | 0 | boolean dupeFound = false; |
500 | // !fromDna means we are looking only for nucleotide sequences, not | |
501 | // protein | |
502 | 0 | if (sq.isProtein() == fromDna) |
503 | { | |
504 | 0 | List<DBRefEntry> sqdbrefs = sq.getPrimaryDBRefs(); |
505 | 0 | for (int idb = 0, ndb = sqdbrefs.size(); idb < ndb; idb++) |
506 | { | |
507 | 0 | DBRefEntry dbr = sqdbrefs.get(idb); |
508 | 0 | List<DBRefEntry> searchrefs = DBRefUtils.searchRefs(dbrSourceSet, |
509 | dbr, DBRefUtils.SEARCH_MODE_FULL); | |
510 | 0 | for (int isr = 0, nsr = searchrefs.size(); isr < nsr; isr++) |
511 | { | |
512 | 0 | sourceRefs.remove(searchrefs.get(isr)); |
513 | 0 | dupeFound = true; |
514 | } | |
515 | } | |
516 | } | |
517 | 0 | if (dupeFound) |
518 | { | |
519 | // rebuild the search array from the filtered sourceRefs list | |
520 | 0 | dbrSourceSet.clear(); |
521 | 0 | dbrSourceSet.addAll(sourceRefs); |
522 | } | |
523 | } | |
524 | } | |
525 | ||
526 | /** | |
527 | * process sequence retrieved via a dbref on source sequence to resolve and | |
528 | * transfer data JBPNote: as of 2022-02-03 - this assumes retrievedSequence | |
529 | * has dbRefs with Mapping references to a sequence congruent with | |
530 | * sourceSequence | |
531 | * | |
532 | * @param cf | |
533 | * @param sourceSequence | |
534 | * @param retrievedSequence | |
535 | * @return true if retrieveSequence was imported | |
536 | */ | |
537 | 0 | private boolean importCrossRefSeq(AlignedCodonFrame cf, |
538 | List<SequenceI> newDsSeqs, List<SequenceI> doNotAdd, | |
539 | SequenceI sourceSequence, SequenceI retrievedSequence) | |
540 | { | |
541 | /** | |
542 | * set when retrievedSequence has been verified as a crossreference for | |
543 | * sourceSequence | |
544 | */ | |
545 | 0 | boolean imported = false; |
546 | 0 | List<DBRefEntry> dbr = retrievedSequence.getDBRefs(); |
547 | 0 | if (dbr != null) |
548 | { | |
549 | 0 | for (int ib = 0, nb = dbr.size(); ib < nb; ib++) |
550 | { | |
551 | ||
552 | 0 | DBRefEntry dbref = dbr.get(ib); |
553 | // matched will return null if the dbref has no map | |
554 | 0 | SequenceI matched = findInDataset(dbref); |
555 | 0 | if (matched == sourceSequence) |
556 | { | |
557 | // verified retrieved and source sequence cross-reference each other | |
558 | 0 | imported = true; |
559 | } | |
560 | // find any entry where we should put in the sequence being | |
561 | // cross-referenced into the map | |
562 | 0 | Mapping map = dbref.getMap(); |
563 | 0 | if (map != null) |
564 | { | |
565 | 0 | SequenceI ms = map.getTo(); |
566 | 0 | if (ms != null && map.getMap() != null) |
567 | { | |
568 | 0 | if (ms == sourceSequence) |
569 | { | |
570 | // already called to import once, and most likely this sequence | |
571 | // already imported ! | |
572 | 0 | continue; |
573 | } | |
574 | 0 | if (matched == null) |
575 | { | |
576 | /* | |
577 | * sequence is new to dataset, so save a reference so it can be added. | |
578 | */ | |
579 | 0 | newDsSeqs.add(ms); |
580 | 0 | continue; |
581 | } | |
582 | ||
583 | /* | |
584 | * there was a matching sequence in dataset, so now, check to see if we can update the map.getTo() sequence to the existing one. | |
585 | */ | |
586 | ||
587 | 0 | try |
588 | { | |
589 | // compare ms with dss and replace with dss in mapping | |
590 | // if map is congruent | |
591 | // TODO findInDataset requires exact sequence match but | |
592 | // 'congruent' test is only for the mapped part | |
593 | // maybe not a problem in practice since only ENA provide a | |
594 | // mapping and it is to the full protein translation of CDS | |
595 | // matcher.findIdMatch(map.getTo()); | |
596 | // TODO addendum: if matched is shorter than getTo, this will fail | |
597 | // - when it should really succeed. | |
598 | 0 | int sf = map.getMap().getToLowest(); |
599 | 0 | int st = map.getMap().getToHighest(); |
600 | 0 | SequenceI mappedrg = ms.getSubSequence(sf, st); |
601 | 0 | if (mappedrg.getLength() > 0 && ms.getSequenceAsString() |
602 | .equals(matched.getSequenceAsString())) | |
603 | { | |
604 | /* | |
605 | * sequences were a match, | |
606 | */ | |
607 | 0 | String msg = "Mapping updated from " + ms.getName() |
608 | + " to retrieved crossreference " | |
609 | + matched.getName(); | |
610 | 0 | jalview.bin.Console.outPrintln(msg); |
611 | ||
612 | 0 | List<DBRefEntry> toRefs = map.getTo().getDBRefs(); |
613 | 0 | if (toRefs != null) |
614 | { | |
615 | /* | |
616 | * transfer database refs | |
617 | */ | |
618 | 0 | for (DBRefEntry ref : toRefs) |
619 | { | |
620 | 0 | if (dbref.getSrcAccString() |
621 | .equals(ref.getSrcAccString())) | |
622 | { | |
623 | 0 | continue; // avoid overwriting the ref on source sequence |
624 | } | |
625 | 0 | matched.addDBRef(ref); // add or update mapping |
626 | } | |
627 | } | |
628 | 0 | doNotAdd.add(map.getTo()); |
629 | 0 | map.setTo(matched); |
630 | ||
631 | /* | |
632 | * give the reverse reference the inverse mapping | |
633 | * (if it doesn't have one already) | |
634 | */ | |
635 | 0 | setReverseMapping(matched, dbref, cf); |
636 | ||
637 | /* | |
638 | * copy sequence features as well, avoiding | |
639 | * duplication (e.g. same variation from two | |
640 | * transcripts) | |
641 | */ | |
642 | 0 | List<SequenceFeature> sfs = ms.getFeatures() |
643 | .getAllFeatures(); | |
644 | 0 | for (SequenceFeature feat : sfs) |
645 | { | |
646 | /* | |
647 | * make a flyweight feature object which ignores Parent | |
648 | * attribute in equality test; this avoids creating many | |
649 | * otherwise duplicate exon features on genomic sequence | |
650 | */ | |
651 | 0 | SequenceFeature newFeature = new SequenceFeature(feat) |
652 | { | |
653 | 0 | @Override |
654 | public boolean equals(Object o) | |
655 | { | |
656 | 0 | return super.equals(o, true); |
657 | } | |
658 | }; | |
659 | 0 | matched.addSequenceFeature(newFeature); |
660 | } | |
661 | } | |
662 | 0 | cf.addMap(retrievedSequence, map.getTo(), map.getMap()); |
663 | } catch (Exception e) | |
664 | { | |
665 | 0 | jalview.bin.Console.errPrintln( |
666 | "Exception when consolidating Mapped sequence set..."); | |
667 | 0 | e.printStackTrace(System.err); |
668 | } | |
669 | } | |
670 | } | |
671 | } | |
672 | } | |
673 | 0 | if (imported) |
674 | { | |
675 | 0 | retrievedSequence.updatePDBIds(); |
676 | 0 | rseqs.add(retrievedSequence); |
677 | 0 | if (dataset.findIndex(retrievedSequence) == -1) |
678 | { | |
679 | 0 | dataset.addSequence(retrievedSequence); |
680 | 0 | matcher.add(retrievedSequence); |
681 | } | |
682 | } | |
683 | 0 | return imported; |
684 | } | |
685 | ||
686 | /** | |
687 | * Sets the inverse sequence mapping in the corresponding dbref of the mapped | |
688 | * to sequence (if any). This is used after fetching a cross-referenced | |
689 | * sequence, if the fetched sequence has a mapping to the original sequence, | |
690 | * to set the mapping in the original sequence's dbref. | |
691 | * | |
692 | * @param mapFrom | |
693 | * the sequence mapped from | |
694 | * @param dbref | |
695 | * @param mappings | |
696 | */ | |
697 | 0 | void setReverseMapping(SequenceI mapFrom, DBRefEntry dbref, |
698 | AlignedCodonFrame mappings) | |
699 | { | |
700 | 0 | SequenceI mapTo = dbref.getMap().getTo(); |
701 | 0 | if (mapTo == null) |
702 | { | |
703 | 0 | return; |
704 | } | |
705 | 0 | List<DBRefEntry> dbrefs = mapTo.getDBRefs(); |
706 | 0 | if (dbrefs == null) |
707 | { | |
708 | 0 | return; |
709 | } | |
710 | 0 | for (DBRefEntry toRef : dbrefs) |
711 | { | |
712 | 0 | if (toRef.hasMap() && mapFrom == toRef.getMap().getTo()) |
713 | { | |
714 | /* | |
715 | * found the reverse dbref; update its mapping if null | |
716 | */ | |
717 | 0 | if (toRef.getMap().getMap() == null) |
718 | { | |
719 | 0 | MapList inverse = dbref.getMap().getMap().getInverse(); |
720 | 0 | toRef.getMap().setMap(inverse); |
721 | 0 | mappings.addMap(mapTo, mapFrom, inverse); |
722 | } | |
723 | } | |
724 | } | |
725 | } | |
726 | ||
727 | /** | |
728 | * Returns null or the first sequence in the dataset which is identical to | |
729 | * xref.mapTo, and has a) a primary dbref matching xref, or if none found, the | |
730 | * first one with an ID source|xrefacc JBPNote: Could refactor this to | |
731 | * AlignmentI/DatasetI | |
732 | * | |
733 | * @param xref | |
734 | * with map and mapped-to sequence | |
735 | * @return | |
736 | */ | |
737 | 24 | SequenceI findInDataset(DBRefEntry xref) |
738 | { | |
739 | 24 | if (xref == null || !xref.hasMap() || xref.getMap().getTo() == null) |
740 | { | |
741 | 0 | return null; |
742 | } | |
743 | 24 | SequenceI mapsTo = xref.getMap().getTo(); |
744 | 24 | String name = xref.getAccessionId(); |
745 | 24 | String name2 = xref.getSource() + "|" + name; |
746 | 24 | SequenceI dss = mapsTo.getDatasetSequence() == null ? mapsTo |
747 | : mapsTo.getDatasetSequence(); | |
748 | // first check ds if ds is directly referenced | |
749 | 24 | if (dataset.findIndex(dss) > -1) |
750 | { | |
751 | 22 | return dss; |
752 | } | |
753 | 2 | DBRefEntry template = new DBRefEntry(xref.getSource(), null, |
754 | xref.getAccessionId()); | |
755 | /** | |
756 | * remember the first ID match - in case we don't find a match to template | |
757 | */ | |
758 | 2 | SequenceI firstIdMatch = null; |
759 | 2 | for (SequenceI seq : dataset.getSequences()) |
760 | { | |
761 | // first check primary refs. | |
762 | 2 | List<DBRefEntry> match = DBRefUtils.searchRefs(seq.getPrimaryDBRefs(), |
763 | template, DBRefUtils.SEARCH_MODE_FULL); | |
764 | 2 | if (match != null && match.size() == 1 && sameSequence(seq, dss)) |
765 | { | |
766 | 0 | return seq; |
767 | } | |
768 | /* | |
769 | * clumsy alternative to using SequenceIdMatcher which currently | |
770 | * returns sequences with a dbref to the matched accession id | |
771 | * which we don't want | |
772 | */ | |
773 | 2 | if (firstIdMatch == null && (name.equals(seq.getName()) |
774 | || seq.getName().startsWith(name2))) | |
775 | { | |
776 | 0 | if (sameSequence(seq, dss)) |
777 | { | |
778 | 0 | firstIdMatch = seq; |
779 | } | |
780 | } | |
781 | } | |
782 | 2 | return firstIdMatch; |
783 | } | |
784 | ||
785 | /** | |
786 | * Answers true if seq1 and seq2 contain exactly the same characters (ignoring | |
787 | * case), else false. This method compares the lengths, then each character in | |
788 | * turn, in order to 'fail fast'. For case-sensitive comparison, it would be | |
789 | * possible to use Arrays.equals(seq1.getSequence(), seq2.getSequence()). | |
790 | * | |
791 | * @param seq1 | |
792 | * @param seq2 | |
793 | * @return | |
794 | */ | |
795 | // TODO move to Sequence / SequenceI | |
796 | 7 | static boolean sameSequence(SequenceI seq1, SequenceI seq2) |
797 | { | |
798 | 7 | if (seq1 == seq2) |
799 | { | |
800 | 1 | return true; |
801 | } | |
802 | 6 | if (seq1 == null || seq2 == null) |
803 | { | |
804 | 2 | return false; |
805 | } | |
806 | ||
807 | 4 | if (seq1.getLength() != seq2.getLength()) |
808 | { | |
809 | 2 | return false; |
810 | } | |
811 | 2 | int length = seq1.getLength(); |
812 | 14 | for (int i = 0; i < length; i++) |
813 | { | |
814 | 12 | int diff = seq1.getCharAt(i) - seq2.getCharAt(i); |
815 | /* | |
816 | * same char or differ in case only ('a'-'A' == 32) | |
817 | */ | |
818 | 12 | if (diff != 0 && diff != 32 && diff != -32) |
819 | { | |
820 | 0 | return false; |
821 | } | |
822 | } | |
823 | 2 | return true; |
824 | } | |
825 | ||
826 | /** | |
827 | * Updates any empty mappings in the cross-references with one to a compatible | |
828 | * retrieved sequence if found, and adds any new mappings to the | |
829 | * AlignedCodonFrame JBPNote: TODO: this relies on sequence IDs like | |
830 | * UNIPROT|ACCESSION - which do not always happen. | |
831 | * | |
832 | * @param mapFrom | |
833 | * @param xrefs | |
834 | * @param retrieved | |
835 | * @param acf | |
836 | */ | |
837 | 0 | void updateDbrefMappings(SequenceI mapFrom, List<DBRefEntry> xrefs, |
838 | SequenceI[] retrieved, AlignedCodonFrame acf, boolean fromDna) | |
839 | { | |
840 | 0 | SequenceIdMatcher idMatcher = new SequenceIdMatcher(retrieved); |
841 | 0 | for (DBRefEntry xref : xrefs) |
842 | { | |
843 | 0 | if (!xref.hasMap()) |
844 | { | |
845 | 0 | String targetSeqName = xref.getSource() + "|" |
846 | + xref.getAccessionId(); | |
847 | 0 | SequenceI[] matches = idMatcher.findAllIdMatches(targetSeqName); |
848 | 0 | if (matches == null) |
849 | { | |
850 | 0 | return; |
851 | } | |
852 | 0 | for (SequenceI seq : matches) |
853 | { | |
854 | 0 | constructMapping(mapFrom, seq, xref, acf, fromDna); |
855 | } | |
856 | } | |
857 | } | |
858 | } | |
859 | ||
860 | /** | |
861 | * Tries to make a mapping between sequences. If successful, adds the mapping | |
862 | * to the dbref and the mappings collection and answers true, otherwise | |
863 | * answers false. The following methods of making are mapping are tried in | |
864 | * turn: | |
865 | * <ul> | |
866 | * <li>if 'mapTo' holds a mapping to 'mapFrom', take the inverse; this is, for | |
867 | * example, the case after fetching EMBL cross-references for a Uniprot | |
868 | * sequence</li> | |
869 | * <li>else check if the dna translates exactly to the protein (give or take | |
870 | * start and stop codons></li> | |
871 | * <li>else try to map based on CDS features on the dna sequence</li> | |
872 | * </ul> | |
873 | * | |
874 | * @param mapFrom | |
875 | * @param mapTo | |
876 | * @param xref | |
877 | * @param mappings | |
878 | * @return | |
879 | */ | |
880 | 0 | boolean constructMapping(SequenceI mapFrom, SequenceI mapTo, |
881 | DBRefEntry xref, AlignedCodonFrame mappings, boolean fromDna) | |
882 | { | |
883 | 0 | MapList mapping = null; |
884 | 0 | SequenceI dsmapFrom = mapFrom.getDatasetSequence() == null ? mapFrom |
885 | : mapFrom.getDatasetSequence(); | |
886 | 0 | SequenceI dsmapTo = mapTo.getDatasetSequence() == null ? mapTo |
887 | : mapTo.getDatasetSequence(); | |
888 | /* | |
889 | * look for a reverse mapping, if found make its inverse. | |
890 | * Note - we do this on dataset sequences only. | |
891 | */ | |
892 | 0 | if (dsmapTo.getDBRefs() != null) |
893 | { | |
894 | 0 | for (DBRefEntry dbref : dsmapTo.getDBRefs()) |
895 | { | |
896 | 0 | String name = dbref.getSource() + "|" + dbref.getAccessionId(); |
897 | 0 | if (dbref.hasMap() && dsmapFrom.getName().startsWith(name)) |
898 | { | |
899 | /* | |
900 | * looks like we've found a map from 'mapTo' to 'mapFrom' | |
901 | * - invert it to make the mapping the other way | |
902 | */ | |
903 | 0 | MapList reverse = dbref.getMap().getMap().getInverse(); |
904 | 0 | xref.setMap(new Mapping(dsmapTo, reverse)); |
905 | 0 | mappings.addMap(mapFrom, dsmapTo, reverse); |
906 | 0 | return true; |
907 | } | |
908 | } | |
909 | } | |
910 | ||
911 | 0 | if (fromDna) |
912 | { | |
913 | 0 | mapping = AlignmentUtils.mapCdnaToProtein(mapTo, mapFrom); |
914 | } | |
915 | else | |
916 | { | |
917 | 0 | mapping = AlignmentUtils.mapCdnaToProtein(mapFrom, mapTo); |
918 | 0 | if (mapping != null) |
919 | { | |
920 | 0 | mapping = mapping.getInverse(); |
921 | } | |
922 | } | |
923 | 0 | if (mapping == null) |
924 | { | |
925 | 0 | return false; |
926 | } | |
927 | 0 | xref.setMap(new Mapping(mapTo, mapping)); |
928 | ||
929 | /* | |
930 | * and add a reverse DbRef with the inverse mapping | |
931 | */ | |
932 | 0 | if (mapFrom.getDatasetSequence() != null && false) |
933 | // && mapFrom.getDatasetSequence().getSourceDBRef() != null) | |
934 | { | |
935 | // possible need to search primary references... except, why doesn't xref | |
936 | // == getSourceDBRef ?? | |
937 | // DBRefEntry dbref = new DBRefEntry(mapFrom.getDatasetSequence() | |
938 | // .getSourceDBRef()); | |
939 | // dbref.setMap(new Mapping(mapFrom.getDatasetSequence(), mapping | |
940 | // .getInverse())); | |
941 | // mapTo.addDBRef(dbref); | |
942 | } | |
943 | ||
944 | 0 | if (fromDna) |
945 | { | |
946 | // AlignmentUtils.computeProteinFeatures(mapFrom, mapTo, mapping); | |
947 | 0 | mappings.addMap(mapFrom, mapTo, mapping); |
948 | } | |
949 | else | |
950 | { | |
951 | 0 | mappings.addMap(mapTo, mapFrom, mapping.getInverse()); |
952 | } | |
953 | ||
954 | 0 | return true; |
955 | } | |
956 | ||
957 | /** | |
958 | * find references to lrfs in the cross-reference set of each sequence in | |
959 | * dataset (that is not equal to sequenceI) Identifies matching DBRefEntry | |
960 | * based on source and accession string only - Map and Version are nulled. | |
961 | * | |
962 | * @param fromDna | |
963 | * - true if context was searching from Dna sequences, false if | |
964 | * context was searching from Protein sequences | |
965 | * @param sequenceI | |
966 | * @param lrfs | |
967 | * @param foundSeqs | |
968 | * @return true if matches were found. | |
969 | */ | |
970 | 6702 | private boolean searchDatasetXrefs(boolean fromDna, SequenceI sequenceI, |
971 | List<DBRefEntry> lrfs, List<SequenceI> foundSeqs, | |
972 | AlignedCodonFrame cf) | |
973 | { | |
974 | 6702 | boolean found = false; |
975 | 6702 | if (lrfs == null) |
976 | { | |
977 | 4981 | return false; |
978 | } | |
979 | 3659 | for (int i = 0, n = lrfs.size(); i < n; i++) |
980 | { | |
981 | // DBRefEntry xref = new DBRefEntry(lrfs.get(i)); | |
982 | // // add in wildcards | |
983 | // xref.setVersion(null); | |
984 | // xref.setMap(null); | |
985 | 1938 | found |= searchDataset(fromDna, sequenceI, lrfs.get(i), foundSeqs, cf, |
986 | false, DBRefUtils.SEARCH_MODE_NO_MAP_NO_VERSION); | |
987 | } | |
988 | 1721 | return found; |
989 | } | |
990 | ||
991 | /** | |
992 | * Searches dataset for DBRefEntrys matching the given one (xrf) and adds the | |
993 | * associated sequence to rseqs | |
994 | * | |
995 | * @param fromDna | |
996 | * true if context was searching for refs *from* dna sequence, false | |
997 | * if context was searching for refs *from* protein sequence | |
998 | * @param fromSeq | |
999 | * a sequence to ignore (start point of search) | |
1000 | * @param xrf | |
1001 | * a cross-reference to try to match | |
1002 | * @param foundSeqs | |
1003 | * result list to add to | |
1004 | * @param mappings | |
1005 | * a set of sequence mappings to add to | |
1006 | * @param direct | |
1007 | * - indicates the type of relationship between returned sequences, | |
1008 | * xrf, and sequenceI that is required. | |
1009 | * <ul> | |
1010 | * <li>direct implies xrf is a primary reference for sequenceI AND | |
1011 | * the sequences to be located (eg a uniprot ID for a protein | |
1012 | * sequence, and a uniprot ref on a transcript sequence).</li> | |
1013 | * <li>indirect means xrf is a cross reference with respect to | |
1014 | * sequenceI or all the returned sequences (eg a genomic reference | |
1015 | * associated with a locus and one or more transcripts)</li> | |
1016 | * </ul> | |
1017 | * @param mode | |
1018 | * SEARCH_MODE_FULL for all; SEARCH_MODE_NO_MAP_NO_VERSION optional | |
1019 | * @return true if relationship found and sequence added. | |
1020 | */ | |
1021 | 1963 | boolean searchDataset(boolean fromDna, SequenceI fromSeq, DBRefEntry xrf, |
1022 | List<SequenceI> foundSeqs, AlignedCodonFrame mappings, | |
1023 | boolean direct, int mode) | |
1024 | { | |
1025 | 1963 | boolean found = false; |
1026 | 1963 | if (dataset == null) |
1027 | { | |
1028 | 0 | return false; |
1029 | } | |
1030 | 1963 | if (dataset.getSequences() == null) |
1031 | { | |
1032 | 0 | jalview.bin.Console |
1033 | .errPrintln("Empty dataset sequence set - NO VECTOR"); | |
1034 | 0 | return false; |
1035 | } | |
1036 | 1963 | List<SequenceI> ds = dataset.getSequences(); |
1037 | 1963 | synchronized (ds) |
1038 | { | |
1039 | 1963 | for (SequenceI nxt : ds) |
1040 | { | |
1041 | 43625 | if (nxt != null) |
1042 | { | |
1043 | 43625 | if (nxt.getDatasetSequence() != null) |
1044 | { | |
1045 | 0 | jalview.bin.Console.errPrintln( |
1046 | "Implementation warning: CrossRef initialised with a dataset alignment with non-dataset sequences in it! (" | |
1047 | + nxt.getDisplayId(true) + " has ds reference " | |
1048 | + nxt.getDatasetSequence().getDisplayId(true) | |
1049 | + ")"); | |
1050 | } | |
1051 | 43625 | if (nxt == fromSeq || nxt == fromSeq.getDatasetSequence()) |
1052 | { | |
1053 | 1960 | continue; |
1054 | } | |
1055 | /* | |
1056 | * only look at same molecule type if 'direct', or | |
1057 | * complementary type if !direct | |
1058 | */ | |
1059 | { | |
1060 | 41665 | boolean isDna = !nxt.isProtein(); |
1061 | 41665 | if (direct ? (isDna != fromDna) : (isDna == fromDna)) |
1062 | { | |
1063 | // skip this sequence because it is wrong molecule type | |
1064 | 36622 | continue; |
1065 | } | |
1066 | } | |
1067 | ||
1068 | // look for direct or indirect references in common | |
1069 | 5043 | List<DBRefEntry> poss = nxt.getDBRefs(); |
1070 | 5043 | List<DBRefEntry> cands = null; |
1071 | ||
1072 | // todo: indirect specifies we select either direct references to nxt | |
1073 | // that match xrf which is indirect to sequenceI, or indirect | |
1074 | // references to nxt that match xrf which is direct to sequenceI | |
1075 | 5043 | cands = DBRefUtils.searchRefs(poss, xrf, mode); |
1076 | // else | |
1077 | // { | |
1078 | // poss = DBRefUtils.selectDbRefs(nxt.isProtein()!fromDna, poss); | |
1079 | // cands = DBRefUtils.searchRefs(poss, xrf); | |
1080 | // } | |
1081 | 5043 | if (!cands.isEmpty()) |
1082 | { | |
1083 | 880 | if (foundSeqs.contains(nxt)) |
1084 | { | |
1085 | 582 | continue; |
1086 | } | |
1087 | 298 | found = true; |
1088 | 298 | foundSeqs.add(nxt); |
1089 | 298 | if (mappings != null && !direct) |
1090 | { | |
1091 | /* | |
1092 | * if the matched sequence has mapped dbrefs to | |
1093 | * protein product / cdna, add equivalent mappings to | |
1094 | * our source sequence | |
1095 | */ | |
1096 | 26 | for (DBRefEntry candidate : cands) |
1097 | { | |
1098 | 26 | Mapping mapping = candidate.getMap(); |
1099 | 26 | if (mapping != null) |
1100 | { | |
1101 | 1 | MapList map = mapping.getMap(); |
1102 | 1 | if (mapping.getTo() != null |
1103 | && map.getFromRatio() != map.getToRatio()) | |
1104 | { | |
1105 | /* | |
1106 | * add a mapping, as from dna to peptide sequence | |
1107 | */ | |
1108 | 1 | if (map.getFromRatio() == 3) |
1109 | { | |
1110 | 1 | mappings.addMap(nxt, fromSeq, map); |
1111 | } | |
1112 | else | |
1113 | { | |
1114 | 0 | mappings.addMap(nxt, fromSeq, map.getInverse()); |
1115 | } | |
1116 | } | |
1117 | } | |
1118 | } | |
1119 | } | |
1120 | } | |
1121 | } | |
1122 | } | |
1123 | } | |
1124 | 1963 | return found; |
1125 | } | |
1126 | } |