Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
EnsemblSeqProxy | 61 | 219 | 85 | ||
EnsemblSeqProxy.EnsemblSeqType | 68 | 2 | 2 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.ext.ensembl; | |
22 | ||
23 | import java.io.IOException; | |
24 | import java.net.MalformedURLException; | |
25 | import java.net.URL; | |
26 | import java.util.ArrayList; | |
27 | import java.util.Arrays; | |
28 | import java.util.Collections; | |
29 | import java.util.List; | |
30 | import java.util.Map; | |
31 | ||
32 | import org.json.simple.parser.ParseException; | |
33 | ||
34 | import jalview.analysis.AlignmentUtils; | |
35 | import jalview.analysis.Dna; | |
36 | import jalview.bin.Console; | |
37 | import jalview.datamodel.Alignment; | |
38 | import jalview.datamodel.AlignmentI; | |
39 | import jalview.datamodel.DBRefEntry; | |
40 | import jalview.datamodel.DBRefSource; | |
41 | import jalview.datamodel.Mapping; | |
42 | import jalview.datamodel.Sequence; | |
43 | import jalview.datamodel.SequenceFeature; | |
44 | import jalview.datamodel.SequenceI; | |
45 | import jalview.datamodel.features.SequenceFeatures; | |
46 | import jalview.exceptions.JalviewException; | |
47 | import jalview.io.gff.Gff3Helper; | |
48 | import jalview.io.gff.SequenceOntologyFactory; | |
49 | import jalview.io.gff.SequenceOntologyI; | |
50 | import jalview.util.Comparison; | |
51 | import jalview.util.DBRefUtils; | |
52 | import jalview.util.IntRangeComparator; | |
53 | import jalview.util.MapList; | |
54 | ||
55 | /** | |
56 | * Base class for Ensembl sequence fetchers | |
57 | * | |
58 | * @see http://rest.ensembl.org/documentation/info/sequence_id | |
59 | * @author gmcarstairs | |
60 | */ | |
61 | public abstract class EnsemblSeqProxy extends EnsemblRestClient | |
62 | { | |
63 | protected static final String DESCRIPTION = "description"; | |
64 | ||
65 | /* | |
66 | * enum for 'type' parameter to the /sequence REST service | |
67 | */ | |
68 | public enum EnsemblSeqType | |
69 | { | |
70 | /** | |
71 | * type=genomic to fetch full dna including introns | |
72 | */ | |
73 | GENOMIC("genomic"), | |
74 | ||
75 | /** | |
76 | * type=cdna to fetch coding dna including UTRs | |
77 | */ | |
78 | CDNA("cdna"), | |
79 | ||
80 | /** | |
81 | * type=cds to fetch coding dna excluding UTRs | |
82 | */ | |
83 | CDS("cds"), | |
84 | ||
85 | /** | |
86 | * type=protein to fetch peptide product sequence | |
87 | */ | |
88 | PROTEIN("protein"); | |
89 | ||
90 | /* | |
91 | * the value of the 'type' parameter to fetch this version of | |
92 | * an Ensembl sequence | |
93 | */ | |
94 | private String type; | |
95 | ||
96 | 0 | EnsemblSeqType(String t) |
97 | { | |
98 | 0 | type = t; |
99 | } | |
100 | ||
101 | 0 | public String getType() |
102 | { | |
103 | 0 | return type; |
104 | } | |
105 | ||
106 | } | |
107 | ||
108 | /** | |
109 | * Default constructor (to use rest.ensembl.org) | |
110 | */ | |
111 | 49 | public EnsemblSeqProxy() |
112 | { | |
113 | 49 | super(); |
114 | } | |
115 | ||
116 | /** | |
117 | * Constructor given the target domain to fetch data from | |
118 | */ | |
119 | 0 | public EnsemblSeqProxy(String d) |
120 | { | |
121 | 0 | super(d); |
122 | } | |
123 | ||
124 | /** | |
125 | * Makes the sequence queries to Ensembl's REST service and returns an | |
126 | * alignment consisting of the returned sequences. | |
127 | */ | |
128 | 0 | @Override |
129 | public AlignmentI getSequenceRecords(String query) throws Exception | |
130 | { | |
131 | // TODO use a String... query vararg instead? | |
132 | ||
133 | // danger: accession separator used as a regex here, a string elsewhere | |
134 | // in this case it is ok (it is just a space), but (e.g.) '\' would not be | |
135 | 0 | List<String> allIds = Arrays |
136 | .asList(query.split(getAccessionSeparator())); | |
137 | 0 | AlignmentI alignment = null; |
138 | 0 | inProgress = true; |
139 | ||
140 | /* | |
141 | * execute queries, if necessary in batches of the | |
142 | * maximum allowed number of ids | |
143 | */ | |
144 | 0 | int maxQueryCount = getMaximumQueryCount(); |
145 | 0 | for (int v = 0, vSize = allIds.size(); v < vSize; v += maxQueryCount) |
146 | { | |
147 | 0 | int p = Math.min(vSize, v + maxQueryCount); |
148 | 0 | List<String> ids = allIds.subList(v, p); |
149 | 0 | try |
150 | { | |
151 | 0 | alignment = fetchSequences(ids, alignment); |
152 | } catch (Throwable r) | |
153 | { | |
154 | 0 | inProgress = false; |
155 | 0 | String msg = "Aborting ID retrieval after " + v |
156 | + " chunks. Unexpected problem (" + r.getLocalizedMessage() | |
157 | + ")"; | |
158 | 0 | jalview.bin.Console.errPrintln(msg); |
159 | 0 | r.printStackTrace(); |
160 | 0 | break; |
161 | } | |
162 | } | |
163 | ||
164 | 0 | if (alignment == null) |
165 | { | |
166 | 0 | return null; |
167 | } | |
168 | ||
169 | /* | |
170 | * fetch and transfer genomic sequence features, | |
171 | * fetch protein product and add as cross-reference | |
172 | */ | |
173 | 0 | for (int i = 0, n = allIds.size(); i < n; i++) |
174 | { | |
175 | 0 | addFeaturesAndProduct(allIds.get(i), alignment); |
176 | } | |
177 | ||
178 | 0 | List<SequenceI> seqs = alignment.getSequences(); |
179 | 0 | for (int i = 0, n = seqs.size(); i < n; i++) |
180 | { | |
181 | 0 | getCrossReferences(seqs.get(i)); |
182 | } | |
183 | ||
184 | 0 | return alignment; |
185 | } | |
186 | ||
187 | /** | |
188 | * Fetches Ensembl features using the /overlap REST endpoint, and adds them to | |
189 | * the sequence in the alignment. Also fetches the protein product, maps it | |
190 | * from the CDS features of the sequence, and saves it as a cross-reference of | |
191 | * the dna sequence. | |
192 | * | |
193 | * @param accId | |
194 | * @param alignment | |
195 | */ | |
196 | 0 | protected void addFeaturesAndProduct(String accId, AlignmentI alignment) |
197 | { | |
198 | 0 | if (alignment == null) |
199 | { | |
200 | 0 | return; |
201 | } | |
202 | ||
203 | 0 | try |
204 | { | |
205 | /* | |
206 | * get 'dummy' genomic sequence with gene, transcript, | |
207 | * exon, cds and variation features | |
208 | */ | |
209 | 0 | SequenceI genomicSequence = null; |
210 | 0 | EnsemblFeatures gffFetcher = new EnsemblFeatures(getDomain()); |
211 | 0 | EnsemblFeatureType[] features = getFeaturesToFetch(); |
212 | ||
213 | // Platform.timeCheck("ESP.getsequencerec1", Platform.TIME_MARK); | |
214 | ||
215 | 0 | AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, |
216 | features); | |
217 | 0 | if (geneFeatures != null && geneFeatures.getHeight() > 0) |
218 | { | |
219 | 0 | genomicSequence = geneFeatures.getSequenceAt(0); |
220 | } | |
221 | ||
222 | // Platform.timeCheck("ESP.getsequencerec2", Platform.TIME_MARK); | |
223 | ||
224 | 0 | if (genomicSequence != null) |
225 | { | |
226 | /* | |
227 | * transfer features to the query sequence | |
228 | */ | |
229 | 0 | SequenceI querySeq = alignment.findName(accId, true); |
230 | 0 | if (transferFeatures(accId, genomicSequence, querySeq)) |
231 | { | |
232 | ||
233 | /* | |
234 | * fetch and map protein product, and add it as a cross-reference | |
235 | * of the retrieved sequence | |
236 | */ | |
237 | // Platform.timeCheck("ESP.transferFeatures", Platform.TIME_MARK); | |
238 | 0 | addProteinProduct(querySeq); |
239 | } | |
240 | } | |
241 | } catch (IOException e) | |
242 | { | |
243 | 0 | jalview.bin.Console.errPrintln( |
244 | "Error transferring Ensembl features: " + e.getMessage()); | |
245 | } | |
246 | // Platform.timeCheck("ESP.addfeat done", Platform.TIME_MARK); | |
247 | } | |
248 | ||
249 | /** | |
250 | * Returns those sequence feature types to fetch from Ensembl. We may want | |
251 | * features either because they are of interest to the user, or as means to | |
252 | * identify the locations of the sequence on the genomic sequence (CDS | |
253 | * features identify CDS, exon features identify cDNA etc). | |
254 | * | |
255 | * @return | |
256 | */ | |
257 | protected abstract EnsemblFeatureType[] getFeaturesToFetch(); | |
258 | ||
259 | /** | |
260 | * Fetches and maps the protein product, and adds it as a cross-reference of | |
261 | * the retrieved sequence | |
262 | */ | |
263 | 0 | protected void addProteinProduct(SequenceI querySeq) |
264 | { | |
265 | 0 | String accId = querySeq.getName(); |
266 | 0 | try |
267 | { | |
268 | 0 | jalview.bin.Console.outPrintln("Adding protein product for " + accId); |
269 | 0 | AlignmentI protein = new EnsemblProtein(getDomain()) |
270 | .getSequenceRecords(accId); | |
271 | 0 | if (protein == null || protein.getHeight() == 0) |
272 | { | |
273 | 0 | jalview.bin.Console |
274 | .outPrintln("No protein product found for " + accId); | |
275 | 0 | return; |
276 | } | |
277 | 0 | SequenceI proteinSeq = protein.getSequenceAt(0); |
278 | ||
279 | /* | |
280 | * need dataset sequences (to be the subject of mappings) | |
281 | */ | |
282 | 0 | proteinSeq.createDatasetSequence(); |
283 | 0 | querySeq.createDatasetSequence(); |
284 | ||
285 | 0 | MapList mapList = AlignmentUtils.mapCdsToProtein(querySeq, |
286 | proteinSeq); | |
287 | 0 | if (mapList != null) |
288 | { | |
289 | // clunky: ensure Uniprot xref if we have one is on mapped sequence | |
290 | 0 | SequenceI ds = proteinSeq.getDatasetSequence(); |
291 | // TODO: Verify ensp primary ref is on proteinSeq.getDatasetSequence() | |
292 | 0 | Mapping map = new Mapping(ds, mapList); |
293 | 0 | DBRefEntry dbr = new DBRefEntry(getDbSource(), |
294 | getEnsemblDataVersion(), proteinSeq.getName(), map); | |
295 | 0 | querySeq.getDatasetSequence().addDBRef(dbr); |
296 | 0 | List<DBRefEntry> uprots = DBRefUtils.selectRefs(ds.getDBRefs(), |
297 | new String[] | |
298 | { DBRefSource.UNIPROT }); | |
299 | 0 | List<DBRefEntry> upxrefs = DBRefUtils |
300 | .selectRefs(querySeq.getDBRefs(), new String[] | |
301 | { DBRefSource.UNIPROT }); | |
302 | 0 | if (uprots != null) |
303 | { | |
304 | 0 | for (DBRefEntry up : uprots) |
305 | { | |
306 | // locate local uniprot ref and map | |
307 | 0 | List<DBRefEntry> upx = DBRefUtils.searchRefs(upxrefs, |
308 | up.getAccessionId()); | |
309 | 0 | DBRefEntry upxref; |
310 | 0 | if (upx.size() != 0) |
311 | { | |
312 | 0 | upxref = upx.get(0); |
313 | ||
314 | 0 | if (upx.size() > 1) |
315 | { | |
316 | 0 | Console.warn( |
317 | "Implementation issue - multiple uniprot acc on product sequence."); | |
318 | } | |
319 | } | |
320 | else | |
321 | { | |
322 | 0 | upxref = new DBRefEntry(DBRefSource.UNIPROT, |
323 | getEnsemblDataVersion(), up.getAccessionId()); | |
324 | } | |
325 | ||
326 | 0 | Mapping newMap = new Mapping(ds, mapList); |
327 | 0 | upxref.setVersion(getEnsemblDataVersion()); |
328 | 0 | upxref.setMap(newMap); |
329 | 0 | if (upx.size() == 0) |
330 | { | |
331 | // add the new uniprot ref | |
332 | 0 | querySeq.getDatasetSequence().addDBRef(upxref); |
333 | } | |
334 | ||
335 | } | |
336 | } | |
337 | ||
338 | /* | |
339 | * copy exon features to protein, compute peptide variants from dna | |
340 | * variants and add as features on the protein sequence ta-da | |
341 | */ | |
342 | // JAL-3187 render on the fly instead | |
343 | // AlignmentUtils.computeProteinFeatures(querySeq, proteinSeq, mapList); | |
344 | } | |
345 | } catch (Exception e) | |
346 | { | |
347 | 0 | System.err |
348 | .println(String.format("Error retrieving protein for %s: %s", | |
349 | accId, e.getMessage())); | |
350 | } | |
351 | } | |
352 | ||
353 | /** | |
354 | * Get database xrefs from Ensembl, and attach them to the sequence | |
355 | * | |
356 | * @param seq | |
357 | */ | |
358 | 0 | protected void getCrossReferences(SequenceI seq) |
359 | { | |
360 | ||
361 | // Platform.timeCheck("ESP. getdataseq ", Platform.TIME_MARK); | |
362 | ||
363 | 0 | while (seq.getDatasetSequence() != null) |
364 | { | |
365 | 0 | seq = seq.getDatasetSequence(); |
366 | } | |
367 | ||
368 | // Platform.timeCheck("ESP. getxref ", Platform.TIME_MARK); | |
369 | ||
370 | 0 | EnsemblXref xrefFetcher = new EnsemblXref(getDomain(), getDbSource(), |
371 | getEnsemblDataVersion()); | |
372 | 0 | List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName()); |
373 | ||
374 | 0 | for (int i = 0, n = xrefs.size(); i < n; i++) |
375 | { | |
376 | // Platform.timeCheck("ESP. getxref + " + (i) + "/" + n, | |
377 | // Platform.TIME_MARK); | |
378 | // BH 2019.01.25 this next method was taking 174 ms PER addition for a | |
379 | // 266-reference example. | |
380 | // DBRefUtils.ensurePrimaries(seq) | |
381 | // was at the end of seq.addDBRef, so executed after ever addition! | |
382 | // This method was moved to seq.getPrimaryDBRefs() | |
383 | 0 | seq.addDBRef(xrefs.get(i)); |
384 | } | |
385 | ||
386 | // jalview.bin.Console.outPrintln("primaries are " + | |
387 | // seq.getPrimaryDBRefs().toString()); | |
388 | /* | |
389 | * and add a reference to itself | |
390 | */ | |
391 | ||
392 | // Platform.timeCheck("ESP. getxref self ", Platform.TIME_MARK); | |
393 | ||
394 | 0 | DBRefEntry self = new DBRefEntry(getDbSource(), getEnsemblDataVersion(), |
395 | seq.getName()); | |
396 | ||
397 | // Platform.timeCheck("ESP. getxref self add ", Platform.TIME_MARK); | |
398 | ||
399 | 0 | seq.addDBRef(self); |
400 | ||
401 | // Platform.timeCheck("ESP. seqprox done ", Platform.TIME_MARK); | |
402 | } | |
403 | ||
404 | /** | |
405 | * Fetches sequences for the list of accession ids and adds them to the | |
406 | * alignment. Returns the extended (or created) alignment. | |
407 | * | |
408 | * @param ids | |
409 | * @param alignment | |
410 | * @return | |
411 | * @throws JalviewException | |
412 | * @throws IOException | |
413 | */ | |
414 | 0 | protected AlignmentI fetchSequences(List<String> ids, |
415 | AlignmentI alignment) throws JalviewException, IOException | |
416 | { | |
417 | 0 | if (!isEnsemblAvailable()) |
418 | { | |
419 | 0 | inProgress = false; |
420 | 0 | throw new JalviewException("ENSEMBL Rest API not available."); |
421 | } | |
422 | // Platform.timeCheck("EnsemblSeqProx.fetchSeq ", Platform.TIME_MARK); | |
423 | ||
424 | 0 | List<SequenceI> seqs = parseSequenceJson(ids); |
425 | 0 | if (seqs == null) |
426 | 0 | return alignment; |
427 | ||
428 | 0 | if (seqs.isEmpty()) |
429 | { | |
430 | 0 | throw new IOException("No data returned for " + ids); |
431 | } | |
432 | ||
433 | 0 | if (seqs.size() != ids.size()) |
434 | { | |
435 | 0 | jalview.bin.Console.outPrintln(String.format( |
436 | "Only retrieved %d sequences for %d query strings", | |
437 | seqs.size(), ids.size())); | |
438 | } | |
439 | ||
440 | 0 | if (!seqs.isEmpty()) |
441 | { | |
442 | 0 | AlignmentI seqal = new Alignment( |
443 | seqs.toArray(new SequenceI[seqs.size()])); | |
444 | 0 | for (SequenceI seq : seqs) |
445 | { | |
446 | 0 | if (seq.getDescription() == null) |
447 | { | |
448 | 0 | seq.setDescription(getDbName()); |
449 | } | |
450 | 0 | String name = seq.getName(); |
451 | 0 | if (ids.contains(name) |
452 | || ids.contains(name.replace("ENSP", "ENST"))) | |
453 | { | |
454 | // TODO JAL-3077 use true accession version in dbref | |
455 | 0 | DBRefEntry dbref = DBRefUtils.parseToDbRef(seq, getDbSource(), |
456 | getEnsemblDataVersion(), name); | |
457 | 0 | seq.addDBRef(dbref); |
458 | } | |
459 | } | |
460 | 0 | if (alignment == null) |
461 | { | |
462 | 0 | alignment = seqal; |
463 | } | |
464 | else | |
465 | { | |
466 | 0 | alignment.append(seqal); |
467 | } | |
468 | } | |
469 | 0 | return alignment; |
470 | } | |
471 | ||
472 | /** | |
473 | * Parses a JSON response for a single sequence ID query | |
474 | * | |
475 | * @param br | |
476 | * @return a single jalview.datamodel.Sequence | |
477 | * @see http://rest.ensembl.org/documentation/info/sequence_id | |
478 | */ | |
479 | 0 | @SuppressWarnings("unchecked") |
480 | protected List<SequenceI> parseSequenceJson(List<String> ids) | |
481 | { | |
482 | 0 | List<SequenceI> result = new ArrayList<>(); |
483 | 0 | try |
484 | { | |
485 | /* | |
486 | * for now, assumes only one sequence returned; refactor if needed | |
487 | * in future to handle a JSONArray with more than one | |
488 | */ | |
489 | // Platform.timeCheck("ENS seqproxy", Platform.TIME_MARK); | |
490 | 0 | Map<String, Object> val = (Map<String, Object>) getJSON(null, ids, -1, |
491 | MODE_MAP, null); | |
492 | 0 | if (val == null) |
493 | 0 | return null; |
494 | 0 | Object s = val.get("desc"); |
495 | 0 | String desc = s == null ? null : s.toString(); |
496 | 0 | s = val.get("id"); |
497 | 0 | String id = s == null ? null : s.toString(); |
498 | 0 | s = val.get("seq"); |
499 | 0 | String seq = s == null ? null : s.toString(); |
500 | 0 | Sequence sequence = new Sequence(id, seq); |
501 | 0 | if (desc != null) |
502 | { | |
503 | 0 | sequence.setDescription(desc); |
504 | } | |
505 | // todo JAL-3077 make a DBRefEntry with true accession version | |
506 | // s = val.get("version"); | |
507 | // String version = s == null ? "0" : s.toString(); | |
508 | // DBRefEntry dbref = new DBRefEntry(getDbSource(), version, id); | |
509 | // sequence.addDBRef(dbref); | |
510 | 0 | result.add(sequence); |
511 | } catch (ParseException | IOException e) | |
512 | { | |
513 | 0 | jalview.bin.Console.errPrintln( |
514 | "Error processing JSON response: " + e.toString()); | |
515 | // ignore | |
516 | } | |
517 | // Platform.timeCheck("ENS seqproxy2", Platform.TIME_MARK); | |
518 | 0 | return result; |
519 | } | |
520 | ||
521 | /** | |
522 | * Returns the URL for the REST call | |
523 | * | |
524 | * @return | |
525 | * @throws MalformedURLException | |
526 | */ | |
527 | 0 | @Override |
528 | protected URL getUrl(List<String> ids) throws MalformedURLException | |
529 | { | |
530 | /* | |
531 | * a single id is included in the URL path | |
532 | * multiple ids go in the POST body instead | |
533 | */ | |
534 | 0 | StringBuffer urlstring = new StringBuffer(128); |
535 | 0 | urlstring.append(getDomain() + "/sequence/id"); |
536 | 0 | if (ids.size() == 1) |
537 | { | |
538 | 0 | urlstring.append("/").append(ids.get(0)); |
539 | } | |
540 | // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats | |
541 | 0 | urlstring.append("?type=").append(getSourceEnsemblType().getType()); |
542 | 0 | urlstring.append(("&Accept=application/json")); |
543 | 0 | urlstring.append(("&content-type=application/json")); |
544 | ||
545 | 0 | String objectType = getObjectType(); |
546 | 0 | if (objectType != null) |
547 | { | |
548 | 0 | urlstring.append("&").append(OBJECT_TYPE).append("=") |
549 | .append(objectType); | |
550 | } | |
551 | ||
552 | 0 | URL url = new URL(urlstring.toString()); |
553 | 0 | return url; |
554 | } | |
555 | ||
556 | /** | |
557 | * Override this method to specify object_type request parameter | |
558 | * | |
559 | * @return | |
560 | */ | |
561 | 0 | protected String getObjectType() |
562 | { | |
563 | 0 | return null; |
564 | } | |
565 | ||
566 | /** | |
567 | * A sequence/id POST request currently allows up to 50 queries | |
568 | * | |
569 | * @see http://rest.ensembl.org/documentation/info/sequence_id_post | |
570 | */ | |
571 | 0 | @Override |
572 | public int getMaximumQueryCount() | |
573 | { | |
574 | 0 | return 50; |
575 | } | |
576 | ||
577 | 0 | @Override |
578 | protected boolean useGetRequest() | |
579 | { | |
580 | 0 | return false; |
581 | } | |
582 | ||
583 | /** | |
584 | * | |
585 | * @return the configured sequence return type for this source | |
586 | */ | |
587 | protected abstract EnsemblSeqType getSourceEnsemblType(); | |
588 | ||
589 | /** | |
590 | * Returns a list of [start, end] genomic ranges corresponding to the sequence | |
591 | * being retrieved. | |
592 | * | |
593 | * The correspondence between the frames of reference is made by locating | |
594 | * those features on the genomic sequence which identify the retrieved | |
595 | * sequence. Specifically | |
596 | * <ul> | |
597 | * <li>genomic sequence is identified by "transcript" features with | |
598 | * ID=transcript:transcriptId</li> | |
599 | * <li>cdna sequence is identified by "exon" features with | |
600 | * Parent=transcript:transcriptId</li> | |
601 | * <li>cds sequence is identified by "CDS" features with | |
602 | * Parent=transcript:transcriptId</li> | |
603 | * </ul> | |
604 | * | |
605 | * The returned ranges are sorted to run forwards (for positive strand) or | |
606 | * backwards (for negative strand). Aborts and returns null if both positive | |
607 | * and negative strand are found (this should not normally happen). | |
608 | * | |
609 | * @param sourceSequence | |
610 | * @param accId | |
611 | * @param start | |
612 | * the start position of the sequence we are mapping to | |
613 | * @return | |
614 | */ | |
615 | 7 | protected MapList getGenomicRangesFromFeatures(SequenceI sourceSequence, |
616 | String accId, int start) | |
617 | { | |
618 | 7 | List<SequenceFeature> sfs = getIdentifyingFeatures(sourceSequence, |
619 | accId); | |
620 | 7 | if (sfs.isEmpty()) |
621 | { | |
622 | 1 | return null; |
623 | } | |
624 | ||
625 | /* | |
626 | * generously initial size for number of cds regions | |
627 | * (worst case titin Q8WZ42 has c. 313 exons) | |
628 | */ | |
629 | 6 | List<int[]> regions = new ArrayList<>(100); |
630 | 6 | int mappedLength = 0; |
631 | 6 | int direction = 1; // forward |
632 | 6 | boolean directionSet = false; |
633 | ||
634 | 6 | for (SequenceFeature sf : sfs) |
635 | { | |
636 | 11 | int strand = sf.getStrand(); |
637 | 11 | strand = strand == 0 ? 1 : strand; // treat unknown as forward |
638 | ||
639 | 11 | if (directionSet && strand != direction) |
640 | { | |
641 | // abort - mix of forward and backward | |
642 | 0 | System.err |
643 | .println("Error: forward and backward strand for " + accId); | |
644 | 0 | return null; |
645 | } | |
646 | 11 | direction = strand; |
647 | 11 | directionSet = true; |
648 | ||
649 | /* | |
650 | * add to CDS ranges, semi-sorted forwards/backwards | |
651 | */ | |
652 | 11 | if (strand < 0) |
653 | { | |
654 | 2 | regions.add(0, new int[] { sf.getEnd(), sf.getBegin() }); |
655 | } | |
656 | else | |
657 | { | |
658 | 9 | regions.add(new int[] { sf.getBegin(), sf.getEnd() }); |
659 | } | |
660 | 11 | mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); |
661 | } | |
662 | ||
663 | 6 | if (regions.isEmpty()) |
664 | { | |
665 | 0 | jalview.bin.Console |
666 | .outPrintln("Failed to identify target sequence for " + accId | |
667 | + " from genomic features"); | |
668 | 0 | return null; |
669 | } | |
670 | ||
671 | /* | |
672 | * a final sort is needed since Ensembl returns CDS sorted within source | |
673 | * (havana / ensembl_havana) | |
674 | */ | |
675 | 6 | Collections.sort(regions, direction == 1 ? IntRangeComparator.ASCENDING |
676 | : IntRangeComparator.DESCENDING); | |
677 | ||
678 | 6 | List<int[]> to = Arrays |
679 | .asList(new int[] | |
680 | { start, start + mappedLength - 1 }); | |
681 | ||
682 | 6 | return new MapList(regions, to, 1, 1); |
683 | } | |
684 | ||
685 | /** | |
686 | * Answers a list of sequence features that mark positions of the genomic | |
687 | * sequence feature which are within the sequence being retrieved. For | |
688 | * example, an 'exon' feature whose parent is the target transcript marks the | |
689 | * cdna positions of the transcript. For a gene sequence, this is trivially | |
690 | * just the 'gene' feature with matching gene id. | |
691 | * | |
692 | * @param seq | |
693 | * @param accId | |
694 | * @return | |
695 | */ | |
696 | protected abstract List<SequenceFeature> getIdentifyingFeatures( | |
697 | SequenceI seq, String accId); | |
698 | ||
699 | int bhtest = 0; | |
700 | ||
701 | /** | |
702 | * Transfers the sequence feature to the target sequence, locating its start | |
703 | * and end range based on the mapping. Features which do not overlap the | |
704 | * target sequence are ignored. | |
705 | * | |
706 | * @param sf | |
707 | * @param targetSequence | |
708 | * @param mapping | |
709 | * mapping from the sequence feature's coordinates to the target | |
710 | * sequence | |
711 | * @param forwardStrand | |
712 | */ | |
713 | 0 | protected void transferFeature(SequenceFeature sf, |
714 | SequenceI targetSequence, MapList mapping, boolean forwardStrand) | |
715 | { | |
716 | 0 | int start = sf.getBegin(); |
717 | 0 | int end = sf.getEnd(); |
718 | 0 | int[] mappedRange = mapping.locateInTo(start, end); |
719 | ||
720 | 0 | if (mappedRange != null) |
721 | { | |
722 | // Platform.timeCheck(null, Platform.TIME_SET); | |
723 | 0 | String group = sf.getFeatureGroup(); |
724 | 0 | if (".".equals(group)) |
725 | { | |
726 | 0 | group = getDbSource(); |
727 | } | |
728 | 0 | int newBegin = Math.min(mappedRange[0], mappedRange[1]); |
729 | 0 | int newEnd = Math.max(mappedRange[0], mappedRange[1]); |
730 | // Platform.timeCheck(null, Platform.TIME_MARK); | |
731 | 0 | bhtest++; |
732 | // 280 ms/1000 here: | |
733 | 0 | SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, |
734 | group, sf.getScore()); | |
735 | // 0.175 ms here: | |
736 | 0 | targetSequence.addSequenceFeature(copy); |
737 | ||
738 | /* | |
739 | * for sequence_variant on reverse strand, have to convert the allele | |
740 | * values to their complements | |
741 | */ | |
742 | 0 | if (!forwardStrand && SequenceOntologyFactory.getInstance() |
743 | .isA(sf.getType(), SequenceOntologyI.SEQUENCE_VARIANT)) | |
744 | { | |
745 | 0 | reverseComplementAlleles(copy); |
746 | } | |
747 | } | |
748 | } | |
749 | ||
750 | /** | |
751 | * Change the 'alleles' value of a feature by converting to complementary | |
752 | * bases, and also update the feature description to match | |
753 | * | |
754 | * @param sf | |
755 | */ | |
756 | 1 | static void reverseComplementAlleles(SequenceFeature sf) |
757 | { | |
758 | 1 | final String alleles = (String) sf.getValue(Gff3Helper.ALLELES); |
759 | 1 | if (alleles == null) |
760 | { | |
761 | 0 | return; |
762 | } | |
763 | 1 | StringBuilder complement = new StringBuilder(alleles.length()); |
764 | 1 | for (String allele : alleles.split(",")) |
765 | { | |
766 | 5 | reverseComplementAllele(complement, allele); |
767 | } | |
768 | 1 | String comp = complement.toString(); |
769 | 1 | sf.setValue(Gff3Helper.ALLELES, comp); |
770 | 1 | sf.setDescription(comp); |
771 | } | |
772 | ||
773 | /** | |
774 | * Makes the 'reverse complement' of the given allele and appends it to the | |
775 | * buffer, after a comma separator if not the first | |
776 | * | |
777 | * @param complement | |
778 | * @param allele | |
779 | */ | |
780 | 13 | static void reverseComplementAllele(StringBuilder complement, |
781 | String allele) | |
782 | { | |
783 | 13 | if (complement.length() > 0) |
784 | { | |
785 | 10 | complement.append(","); |
786 | } | |
787 | ||
788 | /* | |
789 | * some 'alleles' are actually descriptive terms | |
790 | * e.g. HGMD_MUTATION, PhenCode_variation | |
791 | * - we don't want to 'reverse complement' these | |
792 | */ | |
793 | 13 | if (!Comparison.isNucleotideSequence(allele, true)) |
794 | { | |
795 | 3 | complement.append(allele); |
796 | } | |
797 | else | |
798 | { | |
799 | 29 | for (int i = allele.length() - 1; i >= 0; i--) |
800 | { | |
801 | 19 | complement.append(Dna.getComplement(allele.charAt(i))); |
802 | } | |
803 | } | |
804 | } | |
805 | ||
806 | /** | |
807 | * Transfers features from sourceSequence to targetSequence | |
808 | * | |
809 | * @param accessionId | |
810 | * @param sourceSequence | |
811 | * @param targetSequence | |
812 | * @return true if any features were transferred, else false | |
813 | */ | |
814 | 0 | protected boolean transferFeatures(String accessionId, |
815 | SequenceI sourceSequence, SequenceI targetSequence) | |
816 | { | |
817 | 0 | if (sourceSequence == null || targetSequence == null) |
818 | { | |
819 | 0 | return false; |
820 | } | |
821 | ||
822 | // long start = System.currentTimeMillis(); | |
823 | 0 | List<SequenceFeature> sfs = sourceSequence.getFeatures() |
824 | .getPositionalFeatures(); | |
825 | 0 | MapList mapping = getGenomicRangesFromFeatures(sourceSequence, |
826 | accessionId, targetSequence.getStart()); | |
827 | 0 | if (mapping == null) |
828 | { | |
829 | 0 | return false; |
830 | } | |
831 | ||
832 | // Platform.timeCheck("ESP. xfer " + sfs.size(), Platform.TIME_MARK); | |
833 | ||
834 | 0 | boolean result = transferFeatures(sfs, targetSequence, mapping, |
835 | accessionId); | |
836 | // jalview.bin.Console.outPrintln("transferFeatures (" + (sfs.size()) + " | |
837 | // --> " | |
838 | // + targetSequence.getFeatures().getFeatureCount(true) + ") to " | |
839 | // + targetSequence.getName() + " took " | |
840 | // + (System.currentTimeMillis() - start) + "ms"); | |
841 | 0 | return result; |
842 | } | |
843 | ||
844 | /** | |
845 | * Transfer features to the target sequence. The start/end positions are | |
846 | * converted using the mapping. Features which do not overlap are ignored. | |
847 | * Features whose parent is not the specified identifier are also ignored. | |
848 | * | |
849 | * @param sfs | |
850 | * @param targetSequence | |
851 | * @param mapping | |
852 | * @param parentId | |
853 | * @return | |
854 | */ | |
855 | 0 | protected boolean transferFeatures(List<SequenceFeature> sfs, |
856 | SequenceI targetSequence, MapList mapping, String parentId) | |
857 | { | |
858 | 0 | final boolean forwardStrand = mapping.isFromForwardStrand(); |
859 | ||
860 | /* | |
861 | * sort features by start position (which corresponds to end | |
862 | * position descending if reverse strand) so as to add them in | |
863 | * 'forwards' order to the target sequence | |
864 | */ | |
865 | 0 | SequenceFeatures.sortFeatures(sfs, forwardStrand); |
866 | ||
867 | 0 | boolean transferred = false; |
868 | ||
869 | 0 | for (int i = 0, n = sfs.size(); i < n; i++) |
870 | { | |
871 | ||
872 | // if ((i%1000) == 0) { | |
873 | //// Platform.timeCheck("Feature " + bhtest, Platform.TIME_GET); | |
874 | // Platform.timeCheck("ESP. xferFeature + " + (i) + "/" + n, | |
875 | // Platform.TIME_MARK); | |
876 | // } | |
877 | ||
878 | 0 | SequenceFeature sf = sfs.get(i); |
879 | 0 | if (retainFeature(sf, parentId)) |
880 | { | |
881 | 0 | transferFeature(sf, targetSequence, mapping, forwardStrand); |
882 | 0 | transferred = true; |
883 | } | |
884 | } | |
885 | ||
886 | 0 | return transferred; |
887 | } | |
888 | ||
889 | /** | |
890 | * Answers true if the feature type is one we want to keep for the sequence. | |
891 | * Some features are only retrieved in order to identify the sequence range, | |
892 | * and may then be discarded as redundant information (e.g. "CDS" feature for | |
893 | * a CDS sequence). | |
894 | */ | |
895 | 0 | @SuppressWarnings("unused") |
896 | protected boolean retainFeature(SequenceFeature sf, String accessionId) | |
897 | { | |
898 | 0 | return true; // override as required |
899 | } | |
900 | ||
901 | /** | |
902 | * Answers true if the feature has a Parent which refers to the given | |
903 | * accession id, or if the feature has no parent. Answers false if the | |
904 | * feature's Parent is for a different accession id. | |
905 | * | |
906 | * @param sf | |
907 | * @param identifier | |
908 | * @return | |
909 | */ | |
910 | 10 | protected boolean featureMayBelong(SequenceFeature sf, String identifier) |
911 | { | |
912 | 10 | String parent = (String) sf.getValue(PARENT); |
913 | 10 | if (parent != null && !parent.equalsIgnoreCase(identifier)) |
914 | { | |
915 | // this genomic feature belongs to a different transcript | |
916 | 3 | return false; |
917 | } | |
918 | 7 | return true; |
919 | } | |
920 | ||
921 | /** | |
922 | * Answers a short description of the sequence fetcher | |
923 | */ | |
924 | 0 | @Override |
925 | public String getDescription() | |
926 | { | |
927 | 0 | return "Ensembl " + getSourceEnsemblType().getType() |
928 | + " sequence with variant features"; | |
929 | } | |
930 | ||
931 | /** | |
932 | * Returns a (possibly empty) list of features on the sequence which have the | |
933 | * specified sequence ontology term (or a sub-type of it), and the given | |
934 | * identifier as parent | |
935 | * | |
936 | * @param sequence | |
937 | * @param term | |
938 | * @param parentId | |
939 | * @return | |
940 | */ | |
941 | 0 | protected List<SequenceFeature> findFeatures(SequenceI sequence, |
942 | String term, String parentId) | |
943 | { | |
944 | 0 | List<SequenceFeature> result = new ArrayList<>(); |
945 | ||
946 | 0 | List<SequenceFeature> sfs = sequence.getFeatures() |
947 | .getFeaturesByOntology(term); | |
948 | 0 | for (SequenceFeature sf : sfs) |
949 | { | |
950 | 0 | String parent = (String) sf.getValue(PARENT); |
951 | 0 | if (parent != null && parent.equalsIgnoreCase(parentId)) |
952 | { | |
953 | 0 | result.add(sf); |
954 | } | |
955 | } | |
956 | ||
957 | 0 | return result; |
958 | } | |
959 | ||
960 | /** | |
961 | * Answers true if the feature type is either 'NMD_transcript_variant' or | |
962 | * 'transcript' (or one of its sub-types in the Sequence Ontology). This is | |
963 | * because NMD_transcript_variant behaves like 'transcript' in Ensembl | |
964 | * although strictly speaking it is not (it is a sub-type of | |
965 | * sequence_variant). | |
966 | * <p> | |
967 | * (This test was needed when fetching transcript features as GFF. As we are | |
968 | * now fetching as JSON, all features have type 'transcript' so the check for | |
969 | * NMD_transcript_variant is redundant. Left in for any future case arising.) | |
970 | * | |
971 | * @param featureType | |
972 | * @return | |
973 | */ | |
974 | 18 | public static boolean isTranscript(String featureType) |
975 | { | |
976 | 18 | return SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(featureType) |
977 | || SequenceOntologyFactory.getInstance().isA(featureType, | |
978 | SequenceOntologyI.TRANSCRIPT); | |
979 | } | |
980 | } |