Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
EMBLLikeFlatFile | 68 | 240 | 81 | ||
CdsData | 835 | 0 | 0 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import java.io.IOException; | |
24 | import java.text.ParseException; | |
25 | import java.util.ArrayList; | |
26 | import java.util.Arrays; | |
27 | import java.util.HashMap; | |
28 | import java.util.Hashtable; | |
29 | import java.util.List; | |
30 | import java.util.Locale; | |
31 | import java.util.Map; | |
32 | import java.util.Map.Entry; | |
33 | import java.util.TreeMap; | |
34 | ||
35 | import jalview.bin.Console; | |
36 | import jalview.datamodel.DBRefEntry; | |
37 | import jalview.datamodel.DBRefSource; | |
38 | import jalview.datamodel.FeatureProperties; | |
39 | import jalview.datamodel.Mapping; | |
40 | import jalview.datamodel.Sequence; | |
41 | import jalview.datamodel.SequenceFeature; | |
42 | import jalview.datamodel.SequenceI; | |
43 | import jalview.util.DBRefUtils; | |
44 | import jalview.util.DnaUtils; | |
45 | import jalview.util.MapList; | |
46 | import jalview.util.MappingUtils; | |
47 | ||
48 | /** | |
49 | * A base class to support parsing of GenBank, EMBL or DDBJ flat file format | |
50 | * data. Example files (rather than formal specifications) are provided at | |
51 | * | |
52 | * <pre> | |
53 | * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html | |
54 | * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html | |
55 | * </pre> | |
56 | * | |
57 | * or to compare the same entry, see | |
58 | * | |
59 | * <pre> | |
60 | * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1 | |
61 | * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1 | |
62 | * </pre> | |
63 | * | |
64 | * The feature table part of the file has a common definition, only the start of | |
65 | * each line is formatted differently in GenBank and EMBL. See | |
66 | * http://www.insdc.org/files/feature_table.html#7.1. | |
67 | */ | |
68 | public abstract class EMBLLikeFlatFile extends AlignFile | |
69 | { | |
70 | protected static final String LOCATION = "location"; | |
71 | ||
72 | protected static final String QUOTE = "\""; | |
73 | ||
74 | protected static final String DOUBLED_QUOTE = QUOTE + QUOTE; | |
75 | ||
76 | protected static final String WHITESPACE = "\\s+"; | |
77 | ||
78 | /** | |
79 | * Removes leading or trailing double quotes (") unless doubled, and changes | |
80 | * any 'escaped' (doubled) double quotes to single characters. As per the | |
81 | * Feature Table specification for Qualifiers, Free Text. | |
82 | * | |
83 | * @param value | |
84 | * @return | |
85 | */ | |
86 | 327 | protected static String removeQuotes(String value) |
87 | { | |
88 | 327 | if (value == null) |
89 | { | |
90 | 1 | return null; |
91 | } | |
92 | 326 | if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE)) |
93 | { | |
94 | 164 | value = value.substring(1); |
95 | } | |
96 | 326 | if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE)) |
97 | { | |
98 | 160 | value = value.substring(0, value.length() - 1); |
99 | } | |
100 | 326 | value = value.replace(DOUBLED_QUOTE, QUOTE); |
101 | 326 | return value; |
102 | } | |
103 | ||
104 | /** | |
105 | * Truncates (if necessary) the exon intervals to match 3 times the length of | |
106 | * the protein(including truncation for stop codon included in exon) | |
107 | * | |
108 | * @param proteinLength | |
109 | * @param exon | |
110 | * an array of [start, end, start, end...] intervals | |
111 | * @return the same array (if unchanged) or a truncated copy | |
112 | */ | |
113 | 31 | protected static int[] adjustForProteinLength(int proteinLength, |
114 | int[] exon) | |
115 | { | |
116 | 31 | if (proteinLength <= 0 || exon == null) |
117 | { | |
118 | 0 | return exon; |
119 | } | |
120 | 31 | int expectedCdsLength = proteinLength * 3; |
121 | 31 | int exonLength = MappingUtils.getLength(Arrays.asList(exon)); |
122 | ||
123 | /* | |
124 | * if exon length matches protein, or is shorter, then leave it unchanged | |
125 | */ | |
126 | 31 | if (expectedCdsLength >= exonLength) |
127 | { | |
128 | 3 | return exon; |
129 | } | |
130 | ||
131 | 28 | int origxon[]; |
132 | 28 | int sxpos = -1; |
133 | 28 | int endxon = 0; |
134 | 28 | origxon = new int[exon.length]; |
135 | 28 | System.arraycopy(exon, 0, origxon, 0, exon.length); |
136 | 28 | int cdspos = 0; |
137 | 37 | for (int x = 0; x < exon.length; x += 2) |
138 | { | |
139 | 37 | cdspos += Math.abs(exon[x + 1] - exon[x]) + 1; |
140 | 37 | if (expectedCdsLength <= cdspos) |
141 | { | |
142 | // advanced beyond last codon. | |
143 | 28 | sxpos = x; |
144 | 28 | if (expectedCdsLength != cdspos) |
145 | { | |
146 | // System.err | |
147 | // .println("Truncating final exon interval on region by " | |
148 | // + (cdspos - cdslength)); | |
149 | } | |
150 | ||
151 | /* | |
152 | * shrink the final exon - reduce end position if forward | |
153 | * strand, increase it if reverse | |
154 | */ | |
155 | 28 | if (exon[x + 1] >= exon[x]) |
156 | { | |
157 | 25 | endxon = exon[x + 1] - cdspos + expectedCdsLength; |
158 | } | |
159 | else | |
160 | { | |
161 | 3 | endxon = exon[x + 1] + cdspos - expectedCdsLength; |
162 | } | |
163 | 28 | break; |
164 | } | |
165 | } | |
166 | ||
167 | 28 | if (sxpos != -1) |
168 | { | |
169 | // and trim the exon interval set if necessary | |
170 | 28 | int[] nxon = new int[sxpos + 2]; |
171 | 28 | System.arraycopy(exon, 0, nxon, 0, sxpos + 2); |
172 | 28 | nxon[sxpos + 1] = endxon; // update the end boundary for the new exon |
173 | // set | |
174 | 28 | exon = nxon; |
175 | } | |
176 | 28 | return exon; |
177 | } | |
178 | ||
179 | /* | |
180 | * when true, interpret the mol_type 'source' feature attribute | |
181 | * and generate an RNA sequence from the DNA record | |
182 | */ | |
183 | protected boolean produceRna = true; | |
184 | ||
185 | /* | |
186 | * values parsed from the data file | |
187 | */ | |
188 | protected String sourceDb; | |
189 | ||
190 | protected String accession; | |
191 | ||
192 | protected String version; | |
193 | ||
194 | protected String description; | |
195 | ||
196 | protected int length = 128; | |
197 | ||
198 | protected List<DBRefEntry> dbrefs; | |
199 | ||
200 | protected boolean sequenceStringIsRNA = false; | |
201 | ||
202 | protected String sequenceString; | |
203 | ||
204 | protected Map<String, CdsData> cds; | |
205 | ||
206 | /** | |
207 | * Constructor | |
208 | * | |
209 | * @param fp | |
210 | * @param sourceId | |
211 | * @throws IOException | |
212 | */ | |
213 | 4 | public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException |
214 | { | |
215 | 4 | super(false, fp); // don't parse immediately |
216 | 4 | this.sourceDb = sourceId; |
217 | 4 | dbrefs = new ArrayList<>(); |
218 | ||
219 | /* | |
220 | * using TreeMap gives CDS sequences in alphabetical, so readable, order | |
221 | */ | |
222 | 4 | cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); |
223 | ||
224 | 4 | parse(); |
225 | } | |
226 | ||
227 | /** | |
228 | * process attributes for 'source' until the next FT feature entry only | |
229 | * interested in 'mol_type' | |
230 | * | |
231 | * @param tokens | |
232 | * @return | |
233 | * @throws IOException | |
234 | */ | |
235 | 3 | private String parseSourceQualifiers(String[] tokens) throws IOException |
236 | { | |
237 | 3 | if (!"source".equals(tokens[0])) |
238 | { | |
239 | 0 | throw (new RuntimeException("Not given a 'source' qualifier line")); |
240 | } | |
241 | // search for mol_type attribute | |
242 | ||
243 | 3 | StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of |
244 | // sequence | |
245 | ||
246 | 3 | String line = parseFeatureQualifier(sb, false); |
247 | 17 | while (line != null) |
248 | { | |
249 | 17 | if (!line.startsWith("FT ")) // four spaces, end of this feature table |
250 | // entry | |
251 | { | |
252 | 3 | return line; |
253 | } | |
254 | ||
255 | // case sensitive ? | |
256 | 14 | int p = line.indexOf("\\mol_type"); |
257 | 14 | int qs = line.indexOf("\"", p); |
258 | 14 | int qe = line.indexOf("\"", qs + 1); |
259 | 14 | String qualifier = line.substring(qs, qe).toLowerCase(Locale.ROOT); |
260 | 14 | if (qualifier.indexOf("rna") > -1) |
261 | { | |
262 | 1 | sequenceStringIsRNA = true; |
263 | } | |
264 | 14 | if (qualifier.indexOf("dna") > -1) |
265 | { | |
266 | 1 | sequenceStringIsRNA = false; |
267 | } | |
268 | 14 | line = parseFeatureQualifier(sb, false); |
269 | } | |
270 | 0 | return line; |
271 | } | |
272 | ||
273 | /** | |
274 | * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and | |
275 | * returns the next line | |
276 | * | |
277 | * @param location | |
278 | * @return | |
279 | * @throws IOException | |
280 | */ | |
281 | 25 | protected String parseCDSFeature(String location) throws IOException |
282 | { | |
283 | 25 | String line; |
284 | ||
285 | /* | |
286 | * parse location, which can be over >1 line e.g. EAW51554 | |
287 | */ | |
288 | 25 | CdsData data = new CdsData(); |
289 | 25 | StringBuilder sb = new StringBuilder().append(location); |
290 | 25 | line = parseFeatureQualifier(sb, false); |
291 | 25 | data.cdsLocation = sb.toString(); |
292 | ||
293 | 234 | while (line != null) |
294 | { | |
295 | 234 | if (!isFeatureContinuationLine(line)) |
296 | { | |
297 | // e.g. start of next feature "FT source..." | |
298 | 25 | break; |
299 | } | |
300 | ||
301 | /* | |
302 | * extract qualifier, e.g. FT /protein_id="CAA37824.1" | |
303 | * - the value may extend over more than one line | |
304 | * - if the value has enclosing quotes, these are removed | |
305 | * - escaped double quotes ("") are reduced to a single character | |
306 | */ | |
307 | 209 | int slashPos = line.indexOf('/'); |
308 | 209 | if (slashPos == -1) |
309 | { | |
310 | 0 | Console.error("Unexpected EMBL line ignored: " + line); |
311 | 0 | line = nextLine(); |
312 | 0 | continue; |
313 | } | |
314 | 209 | int eqPos = line.indexOf('=', slashPos + 1); |
315 | 209 | if (eqPos == -1) |
316 | { | |
317 | // can happen, e.g. /ribosomal_slippage | |
318 | 0 | line = nextLine(); |
319 | 0 | continue; |
320 | } | |
321 | 209 | String qualifier = line.substring(slashPos + 1, eqPos); |
322 | 209 | String value = line.substring(eqPos + 1); |
323 | 209 | value = removeQuotes(value); |
324 | 209 | sb = new StringBuilder().append(value); |
325 | 209 | boolean asText = !"translation".equals(qualifier); |
326 | 209 | line = parseFeatureQualifier(sb, asText); |
327 | 209 | String featureValue = sb.toString(); |
328 | ||
329 | 209 | if ("protein_id".equals(qualifier)) |
330 | { | |
331 | 25 | data.proteinId = featureValue; |
332 | } | |
333 | 184 | else if ("codon_start".equals(qualifier)) |
334 | { | |
335 | 24 | try |
336 | { | |
337 | 24 | data.codonStart = Integer.parseInt(featureValue.trim()); |
338 | } catch (NumberFormatException e) | |
339 | { | |
340 | 0 | Console.error("Invalid codon_start in XML for " + this.accession |
341 | + ": " + e.getMessage()); | |
342 | } | |
343 | } | |
344 | 160 | else if ("db_xref".equals(qualifier)) |
345 | { | |
346 | 62 | String[] parts = featureValue.split(":"); |
347 | 62 | if (parts.length == 2) |
348 | { | |
349 | 62 | String db = parts[0].trim(); |
350 | 62 | db = DBRefUtils.getCanonicalName(db); |
351 | 62 | DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim()); |
352 | 62 | data.xrefs.add(dbref); |
353 | } | |
354 | } | |
355 | 98 | else if ("product".equals(qualifier)) |
356 | { | |
357 | 25 | data.proteinName = featureValue; |
358 | } | |
359 | 73 | else if ("translation".equals(qualifier)) |
360 | { | |
361 | 25 | data.translation = featureValue; |
362 | } | |
363 | 48 | else if (!"".equals(featureValue)) |
364 | { | |
365 | // throw anything else into the additional properties hash | |
366 | 48 | data.cdsProps.put(qualifier, featureValue); |
367 | } | |
368 | } | |
369 | ||
370 | 25 | if (data.proteinId != null) |
371 | { | |
372 | 25 | this.cds.put(data.proteinId, data); |
373 | } | |
374 | else | |
375 | { | |
376 | 0 | Console.error("Ignoring CDS feature with no protein_id for " |
377 | + sourceDb + ":" + accession); | |
378 | } | |
379 | ||
380 | 25 | return line; |
381 | } | |
382 | ||
383 | protected abstract boolean isFeatureContinuationLine(String line); | |
384 | ||
385 | /** | |
386 | * Output (print) is not (yet) implemented for flat file format | |
387 | */ | |
388 | 0 | @Override |
389 | public String print(SequenceI[] seqs, boolean jvsuffix) | |
390 | { | |
391 | 0 | return null; |
392 | } | |
393 | ||
394 | /** | |
395 | * Constructs and saves the sequence from parsed components | |
396 | */ | |
397 | 4 | protected void buildSequence() |
398 | { | |
399 | 4 | if (this.accession == null || this.sequenceString == null) |
400 | { | |
401 | 0 | Console.error("Failed to parse data from EMBL"); |
402 | 0 | return; |
403 | } | |
404 | ||
405 | 4 | String name = this.accession; |
406 | 4 | if (this.sourceDb != null) |
407 | { | |
408 | 4 | name = this.sourceDb + "|" + name; |
409 | } | |
410 | ||
411 | 4 | if (produceRna && sequenceStringIsRNA) |
412 | { | |
413 | 1 | sequenceString = sequenceString.replace('T', 'U').replace('t', 'u'); |
414 | } | |
415 | ||
416 | 4 | SequenceI seq = new Sequence(name, this.sequenceString); |
417 | 4 | seq.setDescription(this.description); |
418 | ||
419 | /* | |
420 | * add a DBRef to itself | |
421 | */ | |
422 | 4 | DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession); |
423 | 4 | int[] startEnd = new int[] { 1, seq.getLength() }; |
424 | 4 | selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1)); |
425 | 4 | seq.addDBRef(selfRef); |
426 | ||
427 | 4 | for (DBRefEntry dbref : this.dbrefs) |
428 | { | |
429 | 8 | seq.addDBRef(dbref); |
430 | } | |
431 | ||
432 | 4 | processCDSFeatures(seq); |
433 | ||
434 | 4 | seq.deriveSequence(); |
435 | ||
436 | 4 | addSequence(seq); |
437 | } | |
438 | ||
439 | /** | |
440 | * Process the CDS features, including generation of cross-references and | |
441 | * mappings to the protein products (translation) | |
442 | * | |
443 | * @param seq | |
444 | */ | |
445 | 4 | protected void processCDSFeatures(SequenceI seq) |
446 | { | |
447 | /* | |
448 | * record protein products found to avoid duplication i.e. >1 CDS with | |
449 | * the same /protein_id [though not sure I can find an example of this] | |
450 | */ | |
451 | 4 | Map<String, SequenceI> proteins = new HashMap<>(); |
452 | 4 | for (CdsData data : cds.values()) |
453 | { | |
454 | 25 | processCDSFeature(seq, data, proteins); |
455 | } | |
456 | } | |
457 | ||
458 | /** | |
459 | * Processes data for one parsed CDS feature to | |
460 | * <ul> | |
461 | * <li>create a protein product sequence for the translation</li> | |
462 | * <li>create a cross-reference to protein with mapping from dna</li> | |
463 | * <li>add a CDS feature to the sequence for each CDS start-end range</li> | |
464 | * <li>add any CDS dbrefs to the sequence and to the protein product</li> | |
465 | * </ul> | |
466 | * | |
467 | * @param SequenceI | |
468 | * dna | |
469 | * @param proteins | |
470 | * map of protein products so far derived from CDS data | |
471 | */ | |
472 | 25 | void processCDSFeature(SequenceI dna, CdsData data, |
473 | Map<String, SequenceI> proteins) | |
474 | { | |
475 | /* | |
476 | * parse location into a list of [start, end, start, end] positions | |
477 | */ | |
478 | 25 | int[] exons = getCdsRanges(this.accession, data.cdsLocation); |
479 | ||
480 | 25 | MapList maplist = buildMappingToProtein(dna, exons, data); |
481 | ||
482 | 25 | int exonNumber = 0; |
483 | ||
484 | 53 | for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2) |
485 | { | |
486 | 28 | int exonStart = exons[xint]; |
487 | 28 | int exonEnd = exons[xint + 1]; |
488 | 28 | int begin = Math.min(exonStart, exonEnd); |
489 | 28 | int end = Math.max(exonStart, exonEnd); |
490 | 28 | exonNumber++; |
491 | 28 | String desc = String.format("Exon %d for protein EMBLCDS:%s", |
492 | exonNumber, data.proteinId); | |
493 | ||
494 | 28 | SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end, |
495 | this.sourceDb); | |
496 | 28 | for (Entry<String, String> val : data.cdsProps.entrySet()) |
497 | { | |
498 | 54 | sf.setValue(val.getKey(), val.getValue()); |
499 | } | |
500 | ||
501 | 28 | sf.setEnaLocation(data.cdsLocation); |
502 | 28 | boolean forwardStrand = exonStart <= exonEnd; |
503 | 28 | sf.setStrand(forwardStrand ? "+" : "-"); |
504 | 28 | sf.setPhase(String.valueOf(data.codonStart - 1)); |
505 | 28 | sf.setValue(FeatureProperties.EXONPOS, exonNumber); |
506 | 28 | sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName); |
507 | ||
508 | 28 | dna.addSequenceFeature(sf); |
509 | } | |
510 | ||
511 | 25 | boolean hasUniprotDbref = false; |
512 | 25 | for (DBRefEntry xref : data.xrefs) |
513 | { | |
514 | 62 | dna.addDBRef(xref); |
515 | 62 | if (xref.getSource().equals(DBRefSource.UNIPROT)) |
516 | { | |
517 | /* | |
518 | * construct (or find) the sequence for (data.protein_id, data.translation) | |
519 | */ | |
520 | 16 | SequenceI protein = buildProteinProduct(dna, xref, data, proteins); |
521 | 16 | Mapping map = new Mapping(protein, maplist); |
522 | 16 | map.setMappedFromId(data.proteinId); |
523 | 16 | xref.setMap(map); |
524 | ||
525 | /* | |
526 | * add DBRefs with mappings from dna to protein and the inverse | |
527 | */ | |
528 | 16 | DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession); |
529 | 16 | db1.setMap(new Mapping(dna, maplist.getInverse())); |
530 | 16 | protein.addDBRef(db1); |
531 | ||
532 | 16 | hasUniprotDbref = true; |
533 | } | |
534 | } | |
535 | ||
536 | /* | |
537 | * if we have a product (translation) but no explicit Uniprot dbref | |
538 | * (example: EMBL M19487 protein_id AAB02592.1) | |
539 | * then construct mappings to an assumed EMBLCDSPROTEIN accession | |
540 | */ | |
541 | 25 | if (!hasUniprotDbref) |
542 | { | |
543 | 9 | SequenceI protein = proteins.get(data.proteinId); |
544 | 9 | if (protein == null) |
545 | { | |
546 | 9 | protein = new Sequence(data.proteinId, data.translation); |
547 | 9 | protein.setDescription(data.proteinName); |
548 | 9 | proteins.put(data.proteinId, protein); |
549 | } | |
550 | // assuming CDSPROTEIN sequence version = dna version (?!) | |
551 | 9 | DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct, |
552 | this.version, data.proteinId); | |
553 | 9 | protein.addDBRef(db1); |
554 | ||
555 | 9 | DBRefEntry dnaToEmblProteinRef = new DBRefEntry( |
556 | DBRefSource.EMBLCDSProduct, this.version, data.proteinId); | |
557 | 9 | Mapping map = new Mapping(protein, maplist); |
558 | 9 | map.setMappedFromId(data.proteinId); |
559 | 9 | dnaToEmblProteinRef.setMap(map); |
560 | 9 | dna.addDBRef(dnaToEmblProteinRef); |
561 | } | |
562 | ||
563 | /* | |
564 | * comment brought forward from EmblXmlSource, lines 447-451: | |
565 | * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL | |
566 | * sequence with the exon map; if given a dataset reference, search | |
567 | * dataset for parent EMBL sequence if it exists and set its map; | |
568 | * make a new feature annotating the coding contig | |
569 | */ | |
570 | } | |
571 | ||
572 | /** | |
573 | * Computes a mapping from CDS positions in DNA sequence to protein product | |
574 | * positions, with allowance for stop codon or incomplete start codon | |
575 | * | |
576 | * @param dna | |
577 | * @param exons | |
578 | * @param data | |
579 | * @return | |
580 | */ | |
581 | 25 | MapList buildMappingToProtein(final SequenceI dna, final int[] exons, |
582 | final CdsData data) | |
583 | { | |
584 | 25 | MapList dnaToProteinMapping = null; |
585 | 25 | int peptideLength = data.translation.length(); |
586 | ||
587 | 25 | int[] proteinRange = new int[] { 1, peptideLength }; |
588 | 25 | if (exons != null && exons.length > 0) |
589 | { | |
590 | /* | |
591 | * We were able to parse 'location'; do a final | |
592 | * product length truncation check | |
593 | */ | |
594 | 25 | int[] cdsRanges = adjustForProteinLength(peptideLength, exons); |
595 | 25 | dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); |
596 | } | |
597 | else | |
598 | { | |
599 | /* | |
600 | * workaround until we handle all 'location' formats fully | |
601 | * e.g. X53828.1:60..1058 or <123..>289 | |
602 | */ | |
603 | 0 | Console.error(String.format( |
604 | "Implementation Notice: EMBLCDS location '%s'not properly supported yet" | |
605 | + " - Making up the CDNA region of (%s:%s)... may be incorrect", | |
606 | data.cdsLocation, sourceDb, this.accession)); | |
607 | ||
608 | 0 | int completeCodonsLength = 1 - data.codonStart + dna.getLength(); |
609 | 0 | int mappedDnaEnd = dna.getEnd(); |
610 | 0 | if (peptideLength * 3 == completeCodonsLength) |
611 | { | |
612 | // this might occur for CDS sequences where no features are marked | |
613 | 0 | Console.warn("Assuming no stop codon at end of cDNA fragment"); |
614 | 0 | mappedDnaEnd = dna.getEnd(); |
615 | } | |
616 | 0 | else if ((peptideLength + 1) * 3 == completeCodonsLength) |
617 | { | |
618 | 0 | Console.warn("Assuming stop codon at end of cDNA fragment"); |
619 | 0 | mappedDnaEnd = dna.getEnd() - 3; |
620 | } | |
621 | ||
622 | 0 | if (mappedDnaEnd != -1) |
623 | { | |
624 | 0 | int[] cdsRanges = new int[] { |
625 | dna.getStart() + (data.codonStart - 1), mappedDnaEnd }; | |
626 | 0 | dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1); |
627 | } | |
628 | } | |
629 | ||
630 | 25 | return dnaToProteinMapping; |
631 | } | |
632 | ||
633 | /** | |
634 | * Constructs a sequence for the protein product for the CDS data (if there is | |
635 | * one), and dbrefs with mappings from CDS to protein and the reverse | |
636 | * | |
637 | * @param dna | |
638 | * @param xref | |
639 | * @param data | |
640 | * @param proteins | |
641 | * @return | |
642 | */ | |
643 | 16 | SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref, |
644 | CdsData data, Map<String, SequenceI> proteins) | |
645 | { | |
646 | /* | |
647 | * check we have some data to work with | |
648 | */ | |
649 | 16 | if (data.proteinId == null || data.translation == null) |
650 | { | |
651 | 0 | return null; |
652 | } | |
653 | ||
654 | /* | |
655 | * Construct the protein sequence (if not already seen) | |
656 | */ | |
657 | 16 | String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId(); |
658 | 16 | SequenceI protein = proteins.get(proteinSeqName); |
659 | 16 | if (protein == null) |
660 | { | |
661 | 16 | protein = new Sequence(proteinSeqName, data.translation, 1, |
662 | data.translation.length()); | |
663 | 16 | protein.setDescription(data.proteinName != null ? data.proteinName |
664 | : "Protein Product from " + sourceDb); | |
665 | 16 | proteins.put(proteinSeqName, protein); |
666 | } | |
667 | ||
668 | 16 | return protein; |
669 | } | |
670 | ||
671 | /** | |
672 | * Returns the CDS location as a single array of [start, end, start, end...] | |
673 | * positions. If on the reverse strand, these will be in descending order. | |
674 | * | |
675 | * @param accession | |
676 | * @param location | |
677 | * @return | |
678 | */ | |
679 | 25 | protected int[] getCdsRanges(String accession, String location) |
680 | { | |
681 | 25 | if (location == null) |
682 | { | |
683 | 0 | return new int[] {}; |
684 | } | |
685 | ||
686 | 25 | try |
687 | { | |
688 | 25 | List<int[]> ranges = DnaUtils.parseLocation(location); |
689 | 25 | return MappingUtils.rangeListToArray(ranges); |
690 | } catch (ParseException e) | |
691 | { | |
692 | 0 | Console.warn( |
693 | String.format("Not parsing inexact CDS location %s in ENA %s", | |
694 | location, accession)); | |
695 | 0 | return new int[] {}; |
696 | } | |
697 | } | |
698 | ||
699 | /** | |
700 | * Reads the value of a feature (FT) qualifier from one or more lines of the | |
701 | * file, and returns the next line after that. Values are appended to the | |
702 | * string buffer, which should be already primed with the value read from the | |
703 | * first line for the qualifier (with any leading double quote removed). | |
704 | * Enclosing double quotes are removed, and escaped (repeated) double quotes | |
705 | * reduced to one only. For example for | |
706 | * | |
707 | * <pre> | |
708 | * FT /note="gene_id=hCG28070.3 | |
709 | * FT ""foobar"" isoform=CRA_b" | |
710 | * the returned value is | |
711 | * gene_id=hCG28070.3 "foobar" isoform=CRA_b | |
712 | * </pre> | |
713 | * | |
714 | * Note the side-effect of this method, to advance data reading to the next | |
715 | * line after the feature qualifier (which could be another qualifier, a | |
716 | * different feature, a non-feature line, or null at end of file). | |
717 | * | |
718 | * @param sb | |
719 | * a string buffer primed with the first line of the value | |
720 | * @param asText | |
721 | * @return | |
722 | * @throws IOException | |
723 | */ | |
724 | 251 | String parseFeatureQualifier(StringBuilder sb, boolean asText) |
725 | throws IOException | |
726 | { | |
727 | 251 | String line; |
728 | ? | while ((line = nextLine()) != null) |
729 | { | |
730 | 365 | if (!isFeatureContinuationLine(line)) |
731 | { | |
732 | 27 | break; // reached next feature or other input line |
733 | } | |
734 | 338 | String[] tokens = line.split(WHITESPACE); |
735 | 338 | if (tokens.length < 2) |
736 | { | |
737 | 0 | Console.error("Ignoring bad EMBL line for " + this.accession + ": " |
738 | + line); | |
739 | 0 | break; |
740 | } | |
741 | 338 | if (tokens[1].startsWith("/")) |
742 | { | |
743 | 224 | break; // next feature qualifier |
744 | } | |
745 | ||
746 | /* | |
747 | * if text (e.g. /product), add a word separator for a new line, | |
748 | * else (e.g. /translation) don't | |
749 | */ | |
750 | 114 | if (asText) |
751 | { | |
752 | 1 | sb.append(" "); |
753 | } | |
754 | ||
755 | /* | |
756 | * remove trailing " and unescape doubled "" | |
757 | */ | |
758 | 114 | String data = removeQuotes(tokens[1]); |
759 | 114 | sb.append(data); |
760 | } | |
761 | ||
762 | 251 | return line; |
763 | } | |
764 | ||
765 | /** | |
766 | * Reads and saves the sequence, read from the lines following the ORIGIN | |
767 | * (GenBank) or SQ (EMBL) line. Whitespace and position counters are | |
768 | * discarded. Returns the next line following the sequence data (the next line | |
769 | * that doesn't start with whitespace). | |
770 | * | |
771 | * @throws IOException | |
772 | */ | |
773 | 4 | protected String parseSequence() throws IOException |
774 | { | |
775 | 4 | StringBuilder sb = new StringBuilder(this.length); |
776 | 4 | String line = nextLine(); |
777 | 383 | while (line != null && line.startsWith(" ")) |
778 | { | |
779 | 379 | line = line.trim(); |
780 | 379 | String[] blocks = line.split(WHITESPACE); |
781 | ||
782 | /* | |
783 | * the first or last block on each line might be a position count - omit | |
784 | */ | |
785 | 3015 | for (int i = 0; i < blocks.length; i++) |
786 | { | |
787 | 2636 | try |
788 | { | |
789 | 2636 | Long.parseLong(blocks[i]); |
790 | // position counter - ignore it | |
791 | } catch (NumberFormatException e) | |
792 | { | |
793 | // sequence data - append it | |
794 | 2257 | sb.append(blocks[i]); |
795 | } | |
796 | } | |
797 | 379 | line = nextLine(); |
798 | } | |
799 | 4 | this.sequenceString = sb.toString(); |
800 | ||
801 | 4 | return line; |
802 | } | |
803 | ||
804 | /** | |
805 | * Processes a feature line. If it declares a feature type of interest | |
806 | * (currently, only CDS is processed), processes all of the associated lines | |
807 | * (feature qualifiers), and returns the next line after that, otherwise | |
808 | * simply returns the next line. | |
809 | * | |
810 | * @param line | |
811 | * the first line for the feature (with initial FT omitted for EMBL | |
812 | * format) | |
813 | * @return | |
814 | * @throws IOException | |
815 | */ | |
816 | 41 | protected String parseFeature(String line) throws IOException |
817 | { | |
818 | 41 | String[] tokens = line.trim().split(WHITESPACE); |
819 | 41 | if (tokens.length < 2 |
820 | || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0])))) | |
821 | { | |
822 | 13 | return nextLine(); |
823 | } | |
824 | 28 | if (tokens[0].equals("source")) |
825 | { | |
826 | 3 | return parseSourceQualifiers(tokens); |
827 | } | |
828 | 25 | return parseCDSFeature(tokens[1]); |
829 | } | |
830 | } | |
831 | ||
832 | /** | |
833 | * A data bean class to hold values parsed from one CDS Feature | |
834 | */ | |
835 | class CdsData | |
836 | { | |
837 | String translation; // from /translation qualifier | |
838 | ||
839 | String cdsLocation; // the raw value e.g. join(1..1234,2012..2837) | |
840 | ||
841 | int codonStart = 1; // from /codon_start qualifier | |
842 | ||
843 | String proteinName; // from /product qualifier; used for protein description | |
844 | ||
845 | String proteinId; // from /protein_id qualifier | |
846 | ||
847 | List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers | |
848 | ||
849 | Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers | |
850 | } |