Clover icon

Coverage Report

  1. Project Clover database Mon Nov 18 2024 09:38:20 GMT
  2. Package jalview.io

File EMBLLikeFlatFile.java

 

Coverage histogram

../../img/srcFileCovDistChart7.png
21% of files have more coverage

Code metrics

104
240
15
2
850
488
81
0.34
16
7.5
5.4

Classes

Class Line # Actions
EMBLLikeFlatFile 68 240 81
0.6740947467.4%
CdsData 835 0 0
-1.0 -
 

Contributing tests

This file is covered by 1 test. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.IOException;
24    import java.text.ParseException;
25    import java.util.ArrayList;
26    import java.util.Arrays;
27    import java.util.HashMap;
28    import java.util.Hashtable;
29    import java.util.List;
30    import java.util.Locale;
31    import java.util.Map;
32    import java.util.Map.Entry;
33    import java.util.TreeMap;
34   
35    import jalview.bin.Console;
36    import jalview.datamodel.DBRefEntry;
37    import jalview.datamodel.DBRefSource;
38    import jalview.datamodel.FeatureProperties;
39    import jalview.datamodel.Mapping;
40    import jalview.datamodel.Sequence;
41    import jalview.datamodel.SequenceFeature;
42    import jalview.datamodel.SequenceI;
43    import jalview.util.DBRefUtils;
44    import jalview.util.DnaUtils;
45    import jalview.util.MapList;
46    import jalview.util.MappingUtils;
47   
48    /**
49    * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
50    * data. Example files (rather than formal specifications) are provided at
51    *
52    * <pre>
53    * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
54    * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
55    * </pre>
56    *
57    * or to compare the same entry, see
58    *
59    * <pre>
60    * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
61    * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
62    * </pre>
63    *
64    * The feature table part of the file has a common definition, only the start of
65    * each line is formatted differently in GenBank and EMBL. See
66    * http://www.insdc.org/files/feature_table.html#7.1.
67    */
 
68    public abstract class EMBLLikeFlatFile extends AlignFile
69    {
70    protected static final String LOCATION = "location";
71   
72    protected static final String QUOTE = "\"";
73   
74    protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;
75   
76    protected static final String WHITESPACE = "\\s+";
77   
78    /**
79    * Removes leading or trailing double quotes (") unless doubled, and changes
80    * any 'escaped' (doubled) double quotes to single characters. As per the
81    * Feature Table specification for Qualifiers, Free Text.
82    *
83    * @param value
84    * @return
85    */
 
86  87 toggle protected static String removeQuotes(String value)
87    {
88  87 if (value == null)
89    {
90  0 return null;
91    }
92  87 if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
93    {
94  33 value = value.substring(1);
95    }
96  87 if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
97    {
98  31 value = value.substring(0, value.length() - 1);
99    }
100  87 value = value.replace(DOUBLED_QUOTE, QUOTE);
101  87 return value;
102    }
103   
104    /**
105    * Truncates (if necessary) the exon intervals to match 3 times the length of
106    * the protein(including truncation for stop codon included in exon)
107    *
108    * @param proteinLength
109    * @param exon
110    * an array of [start, end, start, end...] intervals
111    * @return the same array (if unchanged) or a truncated copy
112    */
 
113  8 toggle protected static int[] adjustForProteinLength(int proteinLength,
114    int[] exon)
115    {
116  8 if (proteinLength <= 0 || exon == null)
117    {
118  0 return exon;
119    }
120  8 int expectedCdsLength = proteinLength * 3;
121  8 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
122   
123    /*
124    * if exon length matches protein, or is shorter, then leave it unchanged
125    */
126  8 if (expectedCdsLength >= exonLength)
127    {
128  0 return exon;
129    }
130   
131  8 int origxon[];
132  8 int sxpos = -1;
133  8 int endxon = 0;
134  8 origxon = new int[exon.length];
135  8 System.arraycopy(exon, 0, origxon, 0, exon.length);
136  8 int cdspos = 0;
137  9 for (int x = 0; x < exon.length; x += 2)
138    {
139  9 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
140  9 if (expectedCdsLength <= cdspos)
141    {
142    // advanced beyond last codon.
143  8 sxpos = x;
144  8 if (expectedCdsLength != cdspos)
145    {
146    // System.err
147    // .println("Truncating final exon interval on region by "
148    // + (cdspos - cdslength));
149    }
150   
151    /*
152    * shrink the final exon - reduce end position if forward
153    * strand, increase it if reverse
154    */
155  8 if (exon[x + 1] >= exon[x])
156    {
157  7 endxon = exon[x + 1] - cdspos + expectedCdsLength;
158    }
159    else
160    {
161  1 endxon = exon[x + 1] + cdspos - expectedCdsLength;
162    }
163  8 break;
164    }
165    }
166   
167  8 if (sxpos != -1)
168    {
169    // and trim the exon interval set if necessary
170  8 int[] nxon = new int[sxpos + 2];
171  8 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
172  8 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
173    // set
174  8 exon = nxon;
175    }
176  8 return exon;
177    }
178   
179    /*
180    * when true, interpret the mol_type 'source' feature attribute
181    * and generate an RNA sequence from the DNA record
182    */
183    protected boolean produceRna = true;
184   
185    /*
186    * values parsed from the data file
187    */
188    protected String sourceDb;
189   
190    protected String accession;
191   
192    protected String version;
193   
194    protected String description;
195   
196    protected int length = 128;
197   
198    protected List<DBRefEntry> dbrefs;
199   
200    protected boolean sequenceStringIsRNA = false;
201   
202    protected String sequenceString;
203   
204    protected Map<String, CdsData> cds;
205   
206    /**
207    * Constructor
208    *
209    * @param fp
210    * @param sourceId
211    * @throws IOException
212    */
 
213  1 toggle public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException
214    {
215  1 super(false, fp); // don't parse immediately
216  1 this.sourceDb = sourceId;
217  1 dbrefs = new ArrayList<>();
218   
219    /*
220    * using TreeMap gives CDS sequences in alphabetical, so readable, order
221    */
222  1 cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
223   
224  1 parse();
225    }
226   
227    /**
228    * process attributes for 'source' until the next FT feature entry only
229    * interested in 'mol_type'
230    *
231    * @param tokens
232    * @return
233    * @throws IOException
234    */
 
235  1 toggle private String parseSourceQualifiers(String[] tokens) throws IOException
236    {
237  1 if (!"source".equals(tokens[0]))
238    {
239  0 throw (new RuntimeException("Not given a 'source' qualifier line"));
240    }
241    // search for mol_type attribute
242   
243  1 StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of
244    // sequence
245   
246  1 String line = parseFeatureQualifier(sb, false);
247  1 while (line != null)
248    {
249  1 if (!line.startsWith("FT ")) // four spaces, end of this feature table
250    // entry
251    {
252  1 return line;
253    }
254   
255    // case sensitive ?
256  0 int p = line.indexOf("\\mol_type");
257  0 int qs = line.indexOf("\"", p);
258  0 int qe = line.indexOf("\"", qs + 1);
259  0 String qualifier = line.substring(qs, qe).toLowerCase(Locale.ROOT);
260  0 if (qualifier.indexOf("rna") > -1)
261    {
262  0 sequenceStringIsRNA = true;
263    }
264  0 if (qualifier.indexOf("dna") > -1)
265    {
266  0 sequenceStringIsRNA = false;
267    }
268  0 line = parseFeatureQualifier(sb, false);
269    }
270  0 return line;
271    }
272   
273    /**
274    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
275    * returns the next line
276    *
277    * @param location
278    * @return
279    * @throws IOException
280    */
 
281  8 toggle protected String parseCDSFeature(String location) throws IOException
282    {
283  8 String line;
284   
285    /*
286    * parse location, which can be over >1 line e.g. EAW51554
287    */
288  8 CdsData data = new CdsData();
289  8 StringBuilder sb = new StringBuilder().append(location);
290  8 line = parseFeatureQualifier(sb, false);
291  8 data.cdsLocation = sb.toString();
292   
293  56 while (line != null)
294    {
295  56 if (!isFeatureContinuationLine(line))
296    {
297    // e.g. start of next feature "FT source..."
298  8 break;
299    }
300   
301    /*
302    * extract qualifier, e.g. FT /protein_id="CAA37824.1"
303    * - the value may extend over more than one line
304    * - if the value has enclosing quotes, these are removed
305    * - escaped double quotes ("") are reduced to a single character
306    */
307  48 int slashPos = line.indexOf('/');
308  48 if (slashPos == -1)
309    {
310  0 Console.error("Unexpected EMBL line ignored: " + line);
311  0 line = nextLine();
312  0 continue;
313    }
314  48 int eqPos = line.indexOf('=', slashPos + 1);
315  48 if (eqPos == -1)
316    {
317    // can happen, e.g. /ribosomal_slippage
318  0 line = nextLine();
319  0 continue;
320    }
321  48 String qualifier = line.substring(slashPos + 1, eqPos);
322  48 String value = line.substring(eqPos + 1);
323  48 value = removeQuotes(value);
324  48 sb = new StringBuilder().append(value);
325  48 boolean asText = !"translation".equals(qualifier);
326  48 line = parseFeatureQualifier(sb, asText);
327  48 String featureValue = sb.toString();
328   
329  48 if ("protein_id".equals(qualifier))
330    {
331  8 data.proteinId = featureValue;
332    }
333  40 else if ("codon_start".equals(qualifier))
334    {
335  8 try
336    {
337  8 data.codonStart = Integer.parseInt(featureValue.trim());
338    } catch (NumberFormatException e)
339    {
340  0 Console.error("Invalid codon_start in XML for " + this.accession
341    + ": " + e.getMessage());
342    }
343    }
344  32 else if ("db_xref".equals(qualifier))
345    {
346  0 String[] parts = featureValue.split(":");
347  0 if (parts.length == 2)
348    {
349  0 String db = parts[0].trim();
350  0 db = DBRefUtils.getCanonicalName(db);
351  0 DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
352  0 data.xrefs.add(dbref);
353    }
354    }
355  32 else if ("product".equals(qualifier))
356    {
357  8 data.proteinName = featureValue;
358    }
359  24 else if ("translation".equals(qualifier))
360    {
361  8 data.translation = featureValue;
362    }
363  16 else if (!"".equals(featureValue))
364    {
365    // throw anything else into the additional properties hash
366  16 data.cdsProps.put(qualifier, featureValue);
367    }
368    }
369   
370  8 if (data.proteinId != null)
371    {
372  8 this.cds.put(data.proteinId, data);
373    }
374    else
375    {
376  0 Console.error("Ignoring CDS feature with no protein_id for "
377    + sourceDb + ":" + accession);
378    }
379   
380  8 return line;
381    }
382   
383    protected abstract boolean isFeatureContinuationLine(String line);
384   
385    /**
386    * Output (print) is not (yet) implemented for flat file format
387    */
 
388  0 toggle @Override
389    public String print(SequenceI[] seqs, boolean jvsuffix)
390    {
391  0 return null;
392    }
393   
394    /**
395    * Constructs and saves the sequence from parsed components
396    */
 
397  1 toggle protected void buildSequence()
398    {
399  1 if (this.accession == null || this.sequenceString == null)
400    {
401  0 Console.error("Failed to parse data from EMBL");
402  0 return;
403    }
404   
405  1 String name = this.accession;
406  1 if (this.sourceDb != null)
407    {
408  1 name = this.sourceDb + "|" + name;
409    }
410   
411  1 if (produceRna && sequenceStringIsRNA)
412    {
413  0 sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
414    }
415   
416  1 SequenceI seq = new Sequence(name, this.sequenceString);
417  1 seq.setDescription(this.description);
418   
419    /*
420    * add a DBRef to itself
421    */
422  1 DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
423  1 int[] startEnd = new int[] { 1, seq.getLength() };
424  1 selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
425  1 seq.addDBRef(selfRef);
426   
427  1 for (DBRefEntry dbref : this.dbrefs)
428    {
429  0 seq.addDBRef(dbref);
430    }
431   
432  1 processCDSFeatures(seq);
433   
434  1 seq.deriveSequence();
435   
436  1 addSequence(seq);
437    }
438   
439    /**
440    * Process the CDS features, including generation of cross-references and
441    * mappings to the protein products (translation)
442    *
443    * @param seq
444    */
 
445  1 toggle protected void processCDSFeatures(SequenceI seq)
446    {
447    /*
448    * record protein products found to avoid duplication i.e. >1 CDS with
449    * the same /protein_id [though not sure I can find an example of this]
450    */
451  1 Map<String, SequenceI> proteins = new HashMap<>();
452  1 for (CdsData data : cds.values())
453    {
454  8 processCDSFeature(seq, data, proteins);
455    }
456    }
457   
458    /**
459    * Processes data for one parsed CDS feature to
460    * <ul>
461    * <li>create a protein product sequence for the translation</li>
462    * <li>create a cross-reference to protein with mapping from dna</li>
463    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
464    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
465    * </ul>
466    *
467    * @param SequenceI
468    * dna
469    * @param proteins
470    * map of protein products so far derived from CDS data
471    */
 
472  8 toggle void processCDSFeature(SequenceI dna, CdsData data,
473    Map<String, SequenceI> proteins)
474    {
475    /*
476    * parse location into a list of [start, end, start, end] positions
477    */
478  8 int[] exons = getCdsRanges(this.accession, data.cdsLocation);
479   
480  8 MapList maplist = buildMappingToProtein(dna, exons, data);
481   
482  8 int exonNumber = 0;
483   
484  17 for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
485    {
486  9 int exonStart = exons[xint];
487  9 int exonEnd = exons[xint + 1];
488  9 int begin = Math.min(exonStart, exonEnd);
489  9 int end = Math.max(exonStart, exonEnd);
490  9 exonNumber++;
491  9 String desc = String.format("Exon %d for protein EMBLCDS:%s",
492    exonNumber, data.proteinId);
493   
494  9 SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
495    this.sourceDb);
496  9 for (Entry<String, String> val : data.cdsProps.entrySet())
497    {
498  18 sf.setValue(val.getKey(), val.getValue());
499    }
500   
501  9 sf.setEnaLocation(data.cdsLocation);
502  9 boolean forwardStrand = exonStart <= exonEnd;
503  9 sf.setStrand(forwardStrand ? "+" : "-");
504  9 sf.setPhase(String.valueOf(data.codonStart - 1));
505  9 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
506  9 sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
507   
508  9 dna.addSequenceFeature(sf);
509    }
510   
511  8 boolean hasUniprotDbref = false;
512  8 for (DBRefEntry xref : data.xrefs)
513    {
514  0 dna.addDBRef(xref);
515  0 if (xref.getSource().equals(DBRefSource.UNIPROT))
516    {
517    /*
518    * construct (or find) the sequence for (data.protein_id, data.translation)
519    */
520  0 SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
521  0 Mapping map = new Mapping(protein, maplist);
522  0 map.setMappedFromId(data.proteinId);
523  0 xref.setMap(map);
524   
525    /*
526    * add DBRefs with mappings from dna to protein and the inverse
527    */
528  0 DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
529  0 db1.setMap(new Mapping(dna, maplist.getInverse()));
530  0 protein.addDBRef(db1);
531   
532  0 hasUniprotDbref = true;
533    }
534    }
535   
536    /*
537    * if we have a product (translation) but no explicit Uniprot dbref
538    * (example: EMBL M19487 protein_id AAB02592.1)
539    * then construct mappings to an assumed EMBLCDSPROTEIN accession
540    */
541  8 if (!hasUniprotDbref)
542    {
543  8 SequenceI protein = proteins.get(data.proteinId);
544  8 if (protein == null)
545    {
546  8 protein = new Sequence(data.proteinId, data.translation);
547  8 protein.setDescription(data.proteinName);
548  8 proteins.put(data.proteinId, protein);
549    }
550    // assuming CDSPROTEIN sequence version = dna version (?!)
551  8 DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
552    this.version, data.proteinId);
553  8 protein.addDBRef(db1);
554   
555  8 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
556    DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
557  8 Mapping map = new Mapping(protein, maplist);
558  8 map.setMappedFromId(data.proteinId);
559  8 dnaToEmblProteinRef.setMap(map);
560  8 dna.addDBRef(dnaToEmblProteinRef);
561    }
562   
563    /*
564    * comment brought forward from EmblXmlSource, lines 447-451:
565    * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
566    * sequence with the exon map; if given a dataset reference, search
567    * dataset for parent EMBL sequence if it exists and set its map;
568    * make a new feature annotating the coding contig
569    */
570    }
571   
572    /**
573    * Computes a mapping from CDS positions in DNA sequence to protein product
574    * positions, with allowance for stop codon or incomplete start codon
575    *
576    * @param dna
577    * @param exons
578    * @param data
579    * @return
580    */
 
581  8 toggle MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
582    final CdsData data)
583    {
584  8 MapList dnaToProteinMapping = null;
585  8 int peptideLength = data.translation.length();
586   
587  8 int[] proteinRange = new int[] { 1, peptideLength };
588  8 if (exons != null && exons.length > 0)
589    {
590    /*
591    * We were able to parse 'location'; do a final
592    * product length truncation check
593    */
594  8 int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
595  8 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
596    }
597    else
598    {
599    /*
600    * workaround until we handle all 'location' formats fully
601    * e.g. X53828.1:60..1058 or <123..>289
602    */
603  0 Console.error(String.format(
604    "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
605    + " - Making up the CDNA region of (%s:%s)... may be incorrect",
606    data.cdsLocation, sourceDb, this.accession));
607   
608  0 int completeCodonsLength = 1 - data.codonStart + dna.getLength();
609  0 int mappedDnaEnd = dna.getEnd();
610  0 if (peptideLength * 3 == completeCodonsLength)
611    {
612    // this might occur for CDS sequences where no features are marked
613  0 Console.warn("Assuming no stop codon at end of cDNA fragment");
614  0 mappedDnaEnd = dna.getEnd();
615    }
616  0 else if ((peptideLength + 1) * 3 == completeCodonsLength)
617    {
618  0 Console.warn("Assuming stop codon at end of cDNA fragment");
619  0 mappedDnaEnd = dna.getEnd() - 3;
620    }
621   
622  0 if (mappedDnaEnd != -1)
623    {
624  0 int[] cdsRanges = new int[] {
625    dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
626  0 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
627    }
628    }
629   
630  8 return dnaToProteinMapping;
631    }
632   
633    /**
634    * Constructs a sequence for the protein product for the CDS data (if there is
635    * one), and dbrefs with mappings from CDS to protein and the reverse
636    *
637    * @param dna
638    * @param xref
639    * @param data
640    * @param proteins
641    * @return
642    */
 
643  0 toggle SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
644    CdsData data, Map<String, SequenceI> proteins)
645    {
646    /*
647    * check we have some data to work with
648    */
649  0 if (data.proteinId == null || data.translation == null)
650    {
651  0 return null;
652    }
653   
654    /*
655    * Construct the protein sequence (if not already seen)
656    */
657  0 String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
658  0 SequenceI protein = proteins.get(proteinSeqName);
659  0 if (protein == null)
660    {
661  0 protein = new Sequence(proteinSeqName, data.translation, 1,
662    data.translation.length());
663  0 protein.setDescription(data.proteinName != null ? data.proteinName
664    : "Protein Product from " + sourceDb);
665  0 proteins.put(proteinSeqName, protein);
666    }
667   
668  0 return protein;
669    }
670   
671    /**
672    * Returns the CDS location as a single array of [start, end, start, end...]
673    * positions. If on the reverse strand, these will be in descending order.
674    *
675    * @param accession
676    * @param location
677    * @return
678    */
 
679  8 toggle protected int[] getCdsRanges(String accession, String location)
680    {
681  8 if (location == null)
682    {
683  0 return new int[] {};
684    }
685   
686  8 try
687    {
688  8 List<int[]> ranges = DnaUtils.parseLocation(location);
689  8 return MappingUtils.rangeListToArray(ranges);
690    } catch (ParseException e)
691    {
692  0 Console.warn(
693    String.format("Not parsing inexact CDS location %s in ENA %s",
694    location, accession));
695  0 return new int[] {};
696    }
697    }
698   
699    /**
700    * Reads the value of a feature (FT) qualifier from one or more lines of the
701    * file, and returns the next line after that. Values are appended to the
702    * string buffer, which should be already primed with the value read from the
703    * first line for the qualifier (with any leading double quote removed).
704    * Enclosing double quotes are removed, and escaped (repeated) double quotes
705    * reduced to one only. For example for
706    *
707    * <pre>
708    * FT /note="gene_id=hCG28070.3
709    * FT ""foobar"" isoform=CRA_b"
710    * the returned value is
711    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
712    * </pre>
713    *
714    * Note the side-effect of this method, to advance data reading to the next
715    * line after the feature qualifier (which could be another qualifier, a
716    * different feature, a non-feature line, or null at end of file).
717    *
718    * @param sb
719    * a string buffer primed with the first line of the value
720    * @param asText
721    * @return
722    * @throws IOException
723    */
 
724  57 toggle String parseFeatureQualifier(StringBuilder sb, boolean asText)
725    throws IOException
726    {
727  57 String line;
728  ? while ((line = nextLine()) != null)
729    {
730  96 if (!isFeatureContinuationLine(line))
731    {
732  8 break; // reached next feature or other input line
733    }
734  88 String[] tokens = line.split(WHITESPACE);
735  88 if (tokens.length < 2)
736    {
737  0 Console.error("Ignoring bad EMBL line for " + this.accession + ": "
738    + line);
739  0 break;
740    }
741  88 if (tokens[1].startsWith("/"))
742    {
743  49 break; // next feature qualifier
744    }
745   
746    /*
747    * if text (e.g. /product), add a word separator for a new line,
748    * else (e.g. /translation) don't
749    */
750  39 if (asText)
751    {
752  0 sb.append(" ");
753    }
754   
755    /*
756    * remove trailing " and unescape doubled ""
757    */
758  39 String data = removeQuotes(tokens[1]);
759  39 sb.append(data);
760    }
761   
762  57 return line;
763    }
764   
765    /**
766    * Reads and saves the sequence, read from the lines following the ORIGIN
767    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
768    * discarded. Returns the next line following the sequence data (the next line
769    * that doesn't start with whitespace).
770    *
771    * @throws IOException
772    */
 
773  1 toggle protected String parseSequence() throws IOException
774    {
775  1 StringBuilder sb = new StringBuilder(this.length);
776  1 String line = nextLine();
777  127 while (line != null && line.startsWith(" "))
778    {
779  126 line = line.trim();
780  126 String[] blocks = line.split(WHITESPACE);
781   
782    /*
783    * the first or last block on each line might be a position count - omit
784    */
785  1003 for (int i = 0; i < blocks.length; i++)
786    {
787  877 try
788    {
789  877 Long.parseLong(blocks[i]);
790    // position counter - ignore it
791    } catch (NumberFormatException e)
792    {
793    // sequence data - append it
794  751 sb.append(blocks[i]);
795    }
796    }
797  126 line = nextLine();
798    }
799  1 this.sequenceString = sb.toString();
800   
801  1 return line;
802    }
803   
804    /**
805    * Processes a feature line. If it declares a feature type of interest
806    * (currently, only CDS is processed), processes all of the associated lines
807    * (feature qualifiers), and returns the next line after that, otherwise
808    * simply returns the next line.
809    *
810    * @param line
811    * the first line for the feature (with initial FT omitted for EMBL
812    * format)
813    * @return
814    * @throws IOException
815    */
 
816  18 toggle protected String parseFeature(String line) throws IOException
817    {
818  18 String[] tokens = line.trim().split(WHITESPACE);
819  18 if (tokens.length < 2
820    || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0]))))
821    {
822  9 return nextLine();
823    }
824  9 if (tokens[0].equals("source"))
825    {
826  1 return parseSourceQualifiers(tokens);
827    }
828  8 return parseCDSFeature(tokens[1]);
829    }
830    }
831   
832    /**
833    * A data bean class to hold values parsed from one CDS Feature
834    */
 
835    class CdsData
836    {
837    String translation; // from /translation qualifier
838   
839    String cdsLocation; // the raw value e.g. join(1..1234,2012..2837)
840   
841    int codonStart = 1; // from /codon_start qualifier
842   
843    String proteinName; // from /product qualifier; used for protein description
844   
845    String proteinId; // from /protein_id qualifier
846   
847    List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers
848   
849    Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers
850    }