Clover icon

Coverage Report

  1. Project Clover database Mon Sep 2 2024 17:57:51 BST
  2. Package jalview.io

File EMBLLikeFlatFile.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

104
240
15
2
850
488
81
0.34
16
7.5
5.4

Classes

Class Line # Actions
EMBLLikeFlatFile 68 240 81
0.8328690583.3%
CdsData 835 0 0
-1.0 -
 

Contributing tests

This file is covered by 6 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.IOException;
24    import java.text.ParseException;
25    import java.util.ArrayList;
26    import java.util.Arrays;
27    import java.util.HashMap;
28    import java.util.Hashtable;
29    import java.util.List;
30    import java.util.Locale;
31    import java.util.Map;
32    import java.util.Map.Entry;
33    import java.util.TreeMap;
34   
35    import jalview.bin.Console;
36    import jalview.datamodel.DBRefEntry;
37    import jalview.datamodel.DBRefSource;
38    import jalview.datamodel.FeatureProperties;
39    import jalview.datamodel.Mapping;
40    import jalview.datamodel.Sequence;
41    import jalview.datamodel.SequenceFeature;
42    import jalview.datamodel.SequenceI;
43    import jalview.util.DBRefUtils;
44    import jalview.util.DnaUtils;
45    import jalview.util.MapList;
46    import jalview.util.MappingUtils;
47   
48    /**
49    * A base class to support parsing of GenBank, EMBL or DDBJ flat file format
50    * data. Example files (rather than formal specifications) are provided at
51    *
52    * <pre>
53    * https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html
54    * https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
55    * </pre>
56    *
57    * or to compare the same entry, see
58    *
59    * <pre>
60    * https://www.ebi.ac.uk/ena/browser/api/embl/X81322.1
61    * https://www.ncbi.nlm.nih.gov/nuccore/X81322.1
62    * </pre>
63    *
64    * The feature table part of the file has a common definition, only the start of
65    * each line is formatted differently in GenBank and EMBL. See
66    * http://www.insdc.org/files/feature_table.html#7.1.
67    */
 
68    public abstract class EMBLLikeFlatFile extends AlignFile
69    {
70    protected static final String LOCATION = "location";
71   
72    protected static final String QUOTE = "\"";
73   
74    protected static final String DOUBLED_QUOTE = QUOTE + QUOTE;
75   
76    protected static final String WHITESPACE = "\\s+";
77   
78    /**
79    * Removes leading or trailing double quotes (") unless doubled, and changes
80    * any 'escaped' (doubled) double quotes to single characters. As per the
81    * Feature Table specification for Qualifiers, Free Text.
82    *
83    * @param value
84    * @return
85    */
 
86  327 toggle protected static String removeQuotes(String value)
87    {
88  327 if (value == null)
89    {
90  1 return null;
91    }
92  326 if (value.startsWith(QUOTE) && !value.startsWith(DOUBLED_QUOTE))
93    {
94  164 value = value.substring(1);
95    }
96  326 if (value.endsWith(QUOTE) && !value.endsWith(DOUBLED_QUOTE))
97    {
98  160 value = value.substring(0, value.length() - 1);
99    }
100  326 value = value.replace(DOUBLED_QUOTE, QUOTE);
101  326 return value;
102    }
103   
104    /**
105    * Truncates (if necessary) the exon intervals to match 3 times the length of
106    * the protein(including truncation for stop codon included in exon)
107    *
108    * @param proteinLength
109    * @param exon
110    * an array of [start, end, start, end...] intervals
111    * @return the same array (if unchanged) or a truncated copy
112    */
 
113  31 toggle protected static int[] adjustForProteinLength(int proteinLength,
114    int[] exon)
115    {
116  31 if (proteinLength <= 0 || exon == null)
117    {
118  0 return exon;
119    }
120  31 int expectedCdsLength = proteinLength * 3;
121  31 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
122   
123    /*
124    * if exon length matches protein, or is shorter, then leave it unchanged
125    */
126  31 if (expectedCdsLength >= exonLength)
127    {
128  3 return exon;
129    }
130   
131  28 int origxon[];
132  28 int sxpos = -1;
133  28 int endxon = 0;
134  28 origxon = new int[exon.length];
135  28 System.arraycopy(exon, 0, origxon, 0, exon.length);
136  28 int cdspos = 0;
137  37 for (int x = 0; x < exon.length; x += 2)
138    {
139  37 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
140  37 if (expectedCdsLength <= cdspos)
141    {
142    // advanced beyond last codon.
143  28 sxpos = x;
144  28 if (expectedCdsLength != cdspos)
145    {
146    // System.err
147    // .println("Truncating final exon interval on region by "
148    // + (cdspos - cdslength));
149    }
150   
151    /*
152    * shrink the final exon - reduce end position if forward
153    * strand, increase it if reverse
154    */
155  28 if (exon[x + 1] >= exon[x])
156    {
157  25 endxon = exon[x + 1] - cdspos + expectedCdsLength;
158    }
159    else
160    {
161  3 endxon = exon[x + 1] + cdspos - expectedCdsLength;
162    }
163  28 break;
164    }
165    }
166   
167  28 if (sxpos != -1)
168    {
169    // and trim the exon interval set if necessary
170  28 int[] nxon = new int[sxpos + 2];
171  28 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
172  28 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
173    // set
174  28 exon = nxon;
175    }
176  28 return exon;
177    }
178   
179    /*
180    * when true, interpret the mol_type 'source' feature attribute
181    * and generate an RNA sequence from the DNA record
182    */
183    protected boolean produceRna = true;
184   
185    /*
186    * values parsed from the data file
187    */
188    protected String sourceDb;
189   
190    protected String accession;
191   
192    protected String version;
193   
194    protected String description;
195   
196    protected int length = 128;
197   
198    protected List<DBRefEntry> dbrefs;
199   
200    protected boolean sequenceStringIsRNA = false;
201   
202    protected String sequenceString;
203   
204    protected Map<String, CdsData> cds;
205   
206    /**
207    * Constructor
208    *
209    * @param fp
210    * @param sourceId
211    * @throws IOException
212    */
 
213  4 toggle public EMBLLikeFlatFile(FileParse fp, String sourceId) throws IOException
214    {
215  4 super(false, fp); // don't parse immediately
216  4 this.sourceDb = sourceId;
217  4 dbrefs = new ArrayList<>();
218   
219    /*
220    * using TreeMap gives CDS sequences in alphabetical, so readable, order
221    */
222  4 cds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
223   
224  4 parse();
225    }
226   
227    /**
228    * process attributes for 'source' until the next FT feature entry only
229    * interested in 'mol_type'
230    *
231    * @param tokens
232    * @return
233    * @throws IOException
234    */
 
235  3 toggle private String parseSourceQualifiers(String[] tokens) throws IOException
236    {
237  3 if (!"source".equals(tokens[0]))
238    {
239  0 throw (new RuntimeException("Not given a 'source' qualifier line"));
240    }
241    // search for mol_type attribute
242   
243  3 StringBuilder sb = new StringBuilder().append(tokens[1]); // extent of
244    // sequence
245   
246  3 String line = parseFeatureQualifier(sb, false);
247  17 while (line != null)
248    {
249  17 if (!line.startsWith("FT ")) // four spaces, end of this feature table
250    // entry
251    {
252  3 return line;
253    }
254   
255    // case sensitive ?
256  14 int p = line.indexOf("\\mol_type");
257  14 int qs = line.indexOf("\"", p);
258  14 int qe = line.indexOf("\"", qs + 1);
259  14 String qualifier = line.substring(qs, qe).toLowerCase(Locale.ROOT);
260  14 if (qualifier.indexOf("rna") > -1)
261    {
262  1 sequenceStringIsRNA = true;
263    }
264  14 if (qualifier.indexOf("dna") > -1)
265    {
266  1 sequenceStringIsRNA = false;
267    }
268  14 line = parseFeatureQualifier(sb, false);
269    }
270  0 return line;
271    }
272   
273    /**
274    * Parses one (GenBank or EMBL format) CDS feature, saves the parsed data, and
275    * returns the next line
276    *
277    * @param location
278    * @return
279    * @throws IOException
280    */
 
281  25 toggle protected String parseCDSFeature(String location) throws IOException
282    {
283  25 String line;
284   
285    /*
286    * parse location, which can be over >1 line e.g. EAW51554
287    */
288  25 CdsData data = new CdsData();
289  25 StringBuilder sb = new StringBuilder().append(location);
290  25 line = parseFeatureQualifier(sb, false);
291  25 data.cdsLocation = sb.toString();
292   
293  234 while (line != null)
294    {
295  234 if (!isFeatureContinuationLine(line))
296    {
297    // e.g. start of next feature "FT source..."
298  25 break;
299    }
300   
301    /*
302    * extract qualifier, e.g. FT /protein_id="CAA37824.1"
303    * - the value may extend over more than one line
304    * - if the value has enclosing quotes, these are removed
305    * - escaped double quotes ("") are reduced to a single character
306    */
307  209 int slashPos = line.indexOf('/');
308  209 if (slashPos == -1)
309    {
310  0 Console.error("Unexpected EMBL line ignored: " + line);
311  0 line = nextLine();
312  0 continue;
313    }
314  209 int eqPos = line.indexOf('=', slashPos + 1);
315  209 if (eqPos == -1)
316    {
317    // can happen, e.g. /ribosomal_slippage
318  0 line = nextLine();
319  0 continue;
320    }
321  209 String qualifier = line.substring(slashPos + 1, eqPos);
322  209 String value = line.substring(eqPos + 1);
323  209 value = removeQuotes(value);
324  209 sb = new StringBuilder().append(value);
325  209 boolean asText = !"translation".equals(qualifier);
326  209 line = parseFeatureQualifier(sb, asText);
327  209 String featureValue = sb.toString();
328   
329  209 if ("protein_id".equals(qualifier))
330    {
331  25 data.proteinId = featureValue;
332    }
333  184 else if ("codon_start".equals(qualifier))
334    {
335  24 try
336    {
337  24 data.codonStart = Integer.parseInt(featureValue.trim());
338    } catch (NumberFormatException e)
339    {
340  0 Console.error("Invalid codon_start in XML for " + this.accession
341    + ": " + e.getMessage());
342    }
343    }
344  160 else if ("db_xref".equals(qualifier))
345    {
346  62 String[] parts = featureValue.split(":");
347  62 if (parts.length == 2)
348    {
349  62 String db = parts[0].trim();
350  62 db = DBRefUtils.getCanonicalName(db);
351  62 DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
352  62 data.xrefs.add(dbref);
353    }
354    }
355  98 else if ("product".equals(qualifier))
356    {
357  25 data.proteinName = featureValue;
358    }
359  73 else if ("translation".equals(qualifier))
360    {
361  25 data.translation = featureValue;
362    }
363  48 else if (!"".equals(featureValue))
364    {
365    // throw anything else into the additional properties hash
366  48 data.cdsProps.put(qualifier, featureValue);
367    }
368    }
369   
370  25 if (data.proteinId != null)
371    {
372  25 this.cds.put(data.proteinId, data);
373    }
374    else
375    {
376  0 Console.error("Ignoring CDS feature with no protein_id for "
377    + sourceDb + ":" + accession);
378    }
379   
380  25 return line;
381    }
382   
383    protected abstract boolean isFeatureContinuationLine(String line);
384   
385    /**
386    * Output (print) is not (yet) implemented for flat file format
387    */
 
388  0 toggle @Override
389    public String print(SequenceI[] seqs, boolean jvsuffix)
390    {
391  0 return null;
392    }
393   
394    /**
395    * Constructs and saves the sequence from parsed components
396    */
 
397  4 toggle protected void buildSequence()
398    {
399  4 if (this.accession == null || this.sequenceString == null)
400    {
401  0 Console.error("Failed to parse data from EMBL");
402  0 return;
403    }
404   
405  4 String name = this.accession;
406  4 if (this.sourceDb != null)
407    {
408  4 name = this.sourceDb + "|" + name;
409    }
410   
411  4 if (produceRna && sequenceStringIsRNA)
412    {
413  1 sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
414    }
415   
416  4 SequenceI seq = new Sequence(name, this.sequenceString);
417  4 seq.setDescription(this.description);
418   
419    /*
420    * add a DBRef to itself
421    */
422  4 DBRefEntry selfRef = new DBRefEntry(sourceDb, version, accession);
423  4 int[] startEnd = new int[] { 1, seq.getLength() };
424  4 selfRef.setMap(new Mapping(null, startEnd, startEnd, 1, 1));
425  4 seq.addDBRef(selfRef);
426   
427  4 for (DBRefEntry dbref : this.dbrefs)
428    {
429  8 seq.addDBRef(dbref);
430    }
431   
432  4 processCDSFeatures(seq);
433   
434  4 seq.deriveSequence();
435   
436  4 addSequence(seq);
437    }
438   
439    /**
440    * Process the CDS features, including generation of cross-references and
441    * mappings to the protein products (translation)
442    *
443    * @param seq
444    */
 
445  4 toggle protected void processCDSFeatures(SequenceI seq)
446    {
447    /*
448    * record protein products found to avoid duplication i.e. >1 CDS with
449    * the same /protein_id [though not sure I can find an example of this]
450    */
451  4 Map<String, SequenceI> proteins = new HashMap<>();
452  4 for (CdsData data : cds.values())
453    {
454  25 processCDSFeature(seq, data, proteins);
455    }
456    }
457   
458    /**
459    * Processes data for one parsed CDS feature to
460    * <ul>
461    * <li>create a protein product sequence for the translation</li>
462    * <li>create a cross-reference to protein with mapping from dna</li>
463    * <li>add a CDS feature to the sequence for each CDS start-end range</li>
464    * <li>add any CDS dbrefs to the sequence and to the protein product</li>
465    * </ul>
466    *
467    * @param SequenceI
468    * dna
469    * @param proteins
470    * map of protein products so far derived from CDS data
471    */
 
472  25 toggle void processCDSFeature(SequenceI dna, CdsData data,
473    Map<String, SequenceI> proteins)
474    {
475    /*
476    * parse location into a list of [start, end, start, end] positions
477    */
478  25 int[] exons = getCdsRanges(this.accession, data.cdsLocation);
479   
480  25 MapList maplist = buildMappingToProtein(dna, exons, data);
481   
482  25 int exonNumber = 0;
483   
484  53 for (int xint = 0; exons != null && xint < exons.length - 1; xint += 2)
485    {
486  28 int exonStart = exons[xint];
487  28 int exonEnd = exons[xint + 1];
488  28 int begin = Math.min(exonStart, exonEnd);
489  28 int end = Math.max(exonStart, exonEnd);
490  28 exonNumber++;
491  28 String desc = String.format("Exon %d for protein EMBLCDS:%s",
492    exonNumber, data.proteinId);
493   
494  28 SequenceFeature sf = new SequenceFeature("CDS", desc, begin, end,
495    this.sourceDb);
496  28 for (Entry<String, String> val : data.cdsProps.entrySet())
497    {
498  54 sf.setValue(val.getKey(), val.getValue());
499    }
500   
501  28 sf.setEnaLocation(data.cdsLocation);
502  28 boolean forwardStrand = exonStart <= exonEnd;
503  28 sf.setStrand(forwardStrand ? "+" : "-");
504  28 sf.setPhase(String.valueOf(data.codonStart - 1));
505  28 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
506  28 sf.setValue(FeatureProperties.EXONPRODUCT, data.proteinName);
507   
508  28 dna.addSequenceFeature(sf);
509    }
510   
511  25 boolean hasUniprotDbref = false;
512  25 for (DBRefEntry xref : data.xrefs)
513    {
514  62 dna.addDBRef(xref);
515  62 if (xref.getSource().equals(DBRefSource.UNIPROT))
516    {
517    /*
518    * construct (or find) the sequence for (data.protein_id, data.translation)
519    */
520  16 SequenceI protein = buildProteinProduct(dna, xref, data, proteins);
521  16 Mapping map = new Mapping(protein, maplist);
522  16 map.setMappedFromId(data.proteinId);
523  16 xref.setMap(map);
524   
525    /*
526    * add DBRefs with mappings from dna to protein and the inverse
527    */
528  16 DBRefEntry db1 = new DBRefEntry(sourceDb, version, accession);
529  16 db1.setMap(new Mapping(dna, maplist.getInverse()));
530  16 protein.addDBRef(db1);
531   
532  16 hasUniprotDbref = true;
533    }
534    }
535   
536    /*
537    * if we have a product (translation) but no explicit Uniprot dbref
538    * (example: EMBL M19487 protein_id AAB02592.1)
539    * then construct mappings to an assumed EMBLCDSPROTEIN accession
540    */
541  25 if (!hasUniprotDbref)
542    {
543  9 SequenceI protein = proteins.get(data.proteinId);
544  9 if (protein == null)
545    {
546  9 protein = new Sequence(data.proteinId, data.translation);
547  9 protein.setDescription(data.proteinName);
548  9 proteins.put(data.proteinId, protein);
549    }
550    // assuming CDSPROTEIN sequence version = dna version (?!)
551  9 DBRefEntry db1 = new DBRefEntry(DBRefSource.EMBLCDSProduct,
552    this.version, data.proteinId);
553  9 protein.addDBRef(db1);
554   
555  9 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
556    DBRefSource.EMBLCDSProduct, this.version, data.proteinId);
557  9 Mapping map = new Mapping(protein, maplist);
558  9 map.setMappedFromId(data.proteinId);
559  9 dnaToEmblProteinRef.setMap(map);
560  9 dna.addDBRef(dnaToEmblProteinRef);
561    }
562   
563    /*
564    * comment brought forward from EmblXmlSource, lines 447-451:
565    * TODO: if retrieved from EMBLCDS, add a DBRef back to the parent EMBL
566    * sequence with the exon map; if given a dataset reference, search
567    * dataset for parent EMBL sequence if it exists and set its map;
568    * make a new feature annotating the coding contig
569    */
570    }
571   
572    /**
573    * Computes a mapping from CDS positions in DNA sequence to protein product
574    * positions, with allowance for stop codon or incomplete start codon
575    *
576    * @param dna
577    * @param exons
578    * @param data
579    * @return
580    */
 
581  25 toggle MapList buildMappingToProtein(final SequenceI dna, final int[] exons,
582    final CdsData data)
583    {
584  25 MapList dnaToProteinMapping = null;
585  25 int peptideLength = data.translation.length();
586   
587  25 int[] proteinRange = new int[] { 1, peptideLength };
588  25 if (exons != null && exons.length > 0)
589    {
590    /*
591    * We were able to parse 'location'; do a final
592    * product length truncation check
593    */
594  25 int[] cdsRanges = adjustForProteinLength(peptideLength, exons);
595  25 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
596    }
597    else
598    {
599    /*
600    * workaround until we handle all 'location' formats fully
601    * e.g. X53828.1:60..1058 or <123..>289
602    */
603  0 Console.error(String.format(
604    "Implementation Notice: EMBLCDS location '%s'not properly supported yet"
605    + " - Making up the CDNA region of (%s:%s)... may be incorrect",
606    data.cdsLocation, sourceDb, this.accession));
607   
608  0 int completeCodonsLength = 1 - data.codonStart + dna.getLength();
609  0 int mappedDnaEnd = dna.getEnd();
610  0 if (peptideLength * 3 == completeCodonsLength)
611    {
612    // this might occur for CDS sequences where no features are marked
613  0 Console.warn("Assuming no stop codon at end of cDNA fragment");
614  0 mappedDnaEnd = dna.getEnd();
615    }
616  0 else if ((peptideLength + 1) * 3 == completeCodonsLength)
617    {
618  0 Console.warn("Assuming stop codon at end of cDNA fragment");
619  0 mappedDnaEnd = dna.getEnd() - 3;
620    }
621   
622  0 if (mappedDnaEnd != -1)
623    {
624  0 int[] cdsRanges = new int[] {
625    dna.getStart() + (data.codonStart - 1), mappedDnaEnd };
626  0 dnaToProteinMapping = new MapList(cdsRanges, proteinRange, 3, 1);
627    }
628    }
629   
630  25 return dnaToProteinMapping;
631    }
632   
633    /**
634    * Constructs a sequence for the protein product for the CDS data (if there is
635    * one), and dbrefs with mappings from CDS to protein and the reverse
636    *
637    * @param dna
638    * @param xref
639    * @param data
640    * @param proteins
641    * @return
642    */
 
643  16 toggle SequenceI buildProteinProduct(SequenceI dna, DBRefEntry xref,
644    CdsData data, Map<String, SequenceI> proteins)
645    {
646    /*
647    * check we have some data to work with
648    */
649  16 if (data.proteinId == null || data.translation == null)
650    {
651  0 return null;
652    }
653   
654    /*
655    * Construct the protein sequence (if not already seen)
656    */
657  16 String proteinSeqName = xref.getSource() + "|" + xref.getAccessionId();
658  16 SequenceI protein = proteins.get(proteinSeqName);
659  16 if (protein == null)
660    {
661  16 protein = new Sequence(proteinSeqName, data.translation, 1,
662    data.translation.length());
663  16 protein.setDescription(data.proteinName != null ? data.proteinName
664    : "Protein Product from " + sourceDb);
665  16 proteins.put(proteinSeqName, protein);
666    }
667   
668  16 return protein;
669    }
670   
671    /**
672    * Returns the CDS location as a single array of [start, end, start, end...]
673    * positions. If on the reverse strand, these will be in descending order.
674    *
675    * @param accession
676    * @param location
677    * @return
678    */
 
679  25 toggle protected int[] getCdsRanges(String accession, String location)
680    {
681  25 if (location == null)
682    {
683  0 return new int[] {};
684    }
685   
686  25 try
687    {
688  25 List<int[]> ranges = DnaUtils.parseLocation(location);
689  25 return MappingUtils.rangeListToArray(ranges);
690    } catch (ParseException e)
691    {
692  0 Console.warn(
693    String.format("Not parsing inexact CDS location %s in ENA %s",
694    location, accession));
695  0 return new int[] {};
696    }
697    }
698   
699    /**
700    * Reads the value of a feature (FT) qualifier from one or more lines of the
701    * file, and returns the next line after that. Values are appended to the
702    * string buffer, which should be already primed with the value read from the
703    * first line for the qualifier (with any leading double quote removed).
704    * Enclosing double quotes are removed, and escaped (repeated) double quotes
705    * reduced to one only. For example for
706    *
707    * <pre>
708    * FT /note="gene_id=hCG28070.3
709    * FT ""foobar"" isoform=CRA_b"
710    * the returned value is
711    * gene_id=hCG28070.3 "foobar" isoform=CRA_b
712    * </pre>
713    *
714    * Note the side-effect of this method, to advance data reading to the next
715    * line after the feature qualifier (which could be another qualifier, a
716    * different feature, a non-feature line, or null at end of file).
717    *
718    * @param sb
719    * a string buffer primed with the first line of the value
720    * @param asText
721    * @return
722    * @throws IOException
723    */
 
724  251 toggle String parseFeatureQualifier(StringBuilder sb, boolean asText)
725    throws IOException
726    {
727  251 String line;
728  ? while ((line = nextLine()) != null)
729    {
730  365 if (!isFeatureContinuationLine(line))
731    {
732  27 break; // reached next feature or other input line
733    }
734  338 String[] tokens = line.split(WHITESPACE);
735  338 if (tokens.length < 2)
736    {
737  0 Console.error("Ignoring bad EMBL line for " + this.accession + ": "
738    + line);
739  0 break;
740    }
741  338 if (tokens[1].startsWith("/"))
742    {
743  224 break; // next feature qualifier
744    }
745   
746    /*
747    * if text (e.g. /product), add a word separator for a new line,
748    * else (e.g. /translation) don't
749    */
750  114 if (asText)
751    {
752  1 sb.append(" ");
753    }
754   
755    /*
756    * remove trailing " and unescape doubled ""
757    */
758  114 String data = removeQuotes(tokens[1]);
759  114 sb.append(data);
760    }
761   
762  251 return line;
763    }
764   
765    /**
766    * Reads and saves the sequence, read from the lines following the ORIGIN
767    * (GenBank) or SQ (EMBL) line. Whitespace and position counters are
768    * discarded. Returns the next line following the sequence data (the next line
769    * that doesn't start with whitespace).
770    *
771    * @throws IOException
772    */
 
773  4 toggle protected String parseSequence() throws IOException
774    {
775  4 StringBuilder sb = new StringBuilder(this.length);
776  4 String line = nextLine();
777  383 while (line != null && line.startsWith(" "))
778    {
779  379 line = line.trim();
780  379 String[] blocks = line.split(WHITESPACE);
781   
782    /*
783    * the first or last block on each line might be a position count - omit
784    */
785  3015 for (int i = 0; i < blocks.length; i++)
786    {
787  2636 try
788    {
789  2636 Long.parseLong(blocks[i]);
790    // position counter - ignore it
791    } catch (NumberFormatException e)
792    {
793    // sequence data - append it
794  2257 sb.append(blocks[i]);
795    }
796    }
797  379 line = nextLine();
798    }
799  4 this.sequenceString = sb.toString();
800   
801  4 return line;
802    }
803   
804    /**
805    * Processes a feature line. If it declares a feature type of interest
806    * (currently, only CDS is processed), processes all of the associated lines
807    * (feature qualifiers), and returns the next line after that, otherwise
808    * simply returns the next line.
809    *
810    * @param line
811    * the first line for the feature (with initial FT omitted for EMBL
812    * format)
813    * @return
814    * @throws IOException
815    */
 
816  41 toggle protected String parseFeature(String line) throws IOException
817    {
818  41 String[] tokens = line.trim().split(WHITESPACE);
819  41 if (tokens.length < 2
820    || (!"CDS".equals(tokens[0]) && (!"source".equals(tokens[0]))))
821    {
822  13 return nextLine();
823    }
824  28 if (tokens[0].equals("source"))
825    {
826  3 return parseSourceQualifiers(tokens);
827    }
828  25 return parseCDSFeature(tokens[1]);
829    }
830    }
831   
832    /**
833    * A data bean class to hold values parsed from one CDS Feature
834    */
 
835    class CdsData
836    {
837    String translation; // from /translation qualifier
838   
839    String cdsLocation; // the raw value e.g. join(1..1234,2012..2837)
840   
841    int codonStart = 1; // from /codon_start qualifier
842   
843    String proteinName; // from /product qualifier; used for protein description
844   
845    String proteinId; // from /protein_id qualifier
846   
847    List<DBRefEntry> xrefs = new ArrayList<>(); // from /db_xref qualifiers
848   
849    Map<String, String> cdsProps = new Hashtable<>(); // other qualifiers
850    }