Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 17:27:16 GMT
  2. Package jalview.ws.dbsources

File EmblXmlSource.java

 

Coverage histogram

../../../img/srcFileCovDistChart7.png
29% of files have more coverage

Code metrics

104
256
16
1
827
568
85
0.33
16
16
5.31

Classes

Class Line # Actions
EmblXmlSource 67 256 85
0.6808510468.1%
 

Contributing tests

No tests hitting this source file were found.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.ws.dbsources;
22   
23    import java.io.File;
24    import java.io.FileInputStream;
25    import java.io.InputStream;
26    import java.text.ParseException;
27    import java.util.ArrayList;
28    import java.util.Arrays;
29    import java.util.Hashtable;
30    import java.util.List;
31    import java.util.Locale;
32    import java.util.Map;
33    import java.util.Map.Entry;
34   
35    import javax.xml.bind.JAXBContext;
36    import javax.xml.bind.JAXBElement;
37    import javax.xml.bind.JAXBException;
38    import javax.xml.stream.FactoryConfigurationError;
39    import javax.xml.stream.XMLInputFactory;
40    import javax.xml.stream.XMLStreamException;
41    import javax.xml.stream.XMLStreamReader;
42   
43    import com.stevesoft.pat.Regex;
44   
45    import jalview.analysis.SequenceIdMatcher;
46    import jalview.bin.Console;
47    import jalview.datamodel.Alignment;
48    import jalview.datamodel.AlignmentI;
49    import jalview.datamodel.DBRefEntry;
50    import jalview.datamodel.DBRefSource;
51    import jalview.datamodel.FeatureProperties;
52    import jalview.datamodel.Mapping;
53    import jalview.datamodel.Sequence;
54    import jalview.datamodel.SequenceFeature;
55    import jalview.datamodel.SequenceI;
56    import jalview.util.DBRefUtils;
57    import jalview.util.DnaUtils;
58    import jalview.util.MapList;
59    import jalview.util.MappingUtils;
60    import jalview.ws.ebi.EBIFetchClient;
61    import jalview.xml.binding.embl.EntryType;
62    import jalview.xml.binding.embl.EntryType.Feature;
63    import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
64    import jalview.xml.binding.embl.ROOT;
65    import jalview.xml.binding.embl.XrefType;
66   
 
67    public abstract class EmblXmlSource extends EbiFileRetrievedProxy
68    {
69    private static final Regex ACCESSION_REGEX = new Regex("^[A-Z]+[0-9]+");
70   
71    /*
72    * JAL-1856 Embl returns this text for query not found
73    */
74    private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
75   
 
76  1 toggle public EmblXmlSource()
77    {
78  1 super();
79    }
80   
81    /**
82    * Retrieves and parses an emblxml file, and returns an alignment containing
83    * the parsed sequences, or null if none were found
84    *
85    * @param emprefx
86    * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
87    * @param query
88    * @return
89    * @throws Exception
90    */
 
91  0 toggle protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
92    throws Exception
93    {
94  0 startQuery();
95  0 EBIFetchClient dbFetch = new EBIFetchClient();
96  0 File reply;
97  0 try
98    {
99  0 reply = dbFetch.fetchDataAsFile(
100    emprefx.toLowerCase(Locale.ROOT) + ":" + query.trim(),
101    "display=xml", "xml");
102    } catch (Exception e)
103    {
104  0 stopQuery();
105  0 throw new Exception(
106    String.format("EBI EMBL XML retrieval failed for %s:%s",
107    emprefx.toLowerCase(Locale.ROOT), query.trim()),
108    e);
109    }
110  0 return getEmblSequenceRecords(emprefx, query, reply);
111    }
112   
113    /**
114    * parse an emblxml file stored locally
115    *
116    * @param emprefx
117    * either EMBL or EMBLCDS strings are allowed - anything else will
118    * not retrieve emblxml
119    * @param query
120    * @param file
121    * the EMBL XML file containing the results of a query
122    * @return
123    * @throws Exception
124    */
 
125  0 toggle protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
126    File reply) throws Exception
127    {
128  0 List<EntryType> entries = null;
129  0 if (reply != null && reply.exists())
130    {
131  0 file = reply.getAbsolutePath();
132  0 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
133    {
134  0 InputStream is = new FileInputStream(reply);
135  0 entries = getEmblEntries(is);
136    }
137    }
138   
139    /*
140    * invalid accession gets a reply with no <entry> elements, text content of
141    * EmbFile reads something like (e.g.) this ungrammatical phrase
142    * Entry: <acc> display type is either not supported or entry is not found.
143    */
144  0 AlignmentI al = null;
145  0 List<SequenceI> seqs = new ArrayList<>();
146  0 List<SequenceI> peptides = new ArrayList<>();
147  0 if (entries != null)
148    {
149  0 for (EntryType entry : entries)
150    {
151  0 SequenceI seq = getSequence(emprefx, entry, peptides);
152  0 if (seq != null)
153    {
154  0 seqs.add(seq.deriveSequence());
155    // place DBReferences on dataset and refer
156    }
157    }
158  0 if (!seqs.isEmpty())
159    {
160  0 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
161    }
162    else
163    {
164  0 jalview.bin.Console.outPrintln(
165    "No record found for '" + emprefx + ":" + query + "'");
166    }
167    }
168   
169  0 stopQuery();
170  0 return al;
171    }
172   
173    /**
174    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
175    * (possibly empty) list of <code>EntryType</code> objects.
176    *
177    * is
178    *
179    * @return
180    */
 
181  2 toggle List<EntryType> getEmblEntries(InputStream is)
182    {
183  2 List<EntryType> entries = new ArrayList<>();
184  2 try
185    {
186  2 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
187  2 XMLStreamReader streamReader = XMLInputFactory.newInstance()
188    .createXMLStreamReader(is);
189  2 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
190  2 JAXBElement<ROOT> rootElement = um.unmarshal(streamReader,
191    ROOT.class);
192  2 ROOT root = rootElement.getValue();
193   
194    /*
195    * document root contains either "entry" or "entrySet"
196    */
197  2 if (root == null)
198    {
199  0 return entries;
200    }
201  2 if (root.getEntrySet() != null)
202    {
203  0 entries = root.getEntrySet().getEntry();
204    }
205  2 else if (root.getEntry() != null)
206    {
207  2 entries.add(root.getEntry());
208    }
209    } catch (JAXBException | XMLStreamException
210    | FactoryConfigurationError e)
211    {
212  0 e.printStackTrace();
213    }
214  2 return entries;
215    }
216   
217    /**
218    * A helper method to parse XML data and construct a sequence, with any
219    * available database references and features
220    *
221    * @param emprefx
222    * @param entry
223    * @param peptides
224    * @return
225    */
 
226  1 toggle SequenceI getSequence(String sourceDb, EntryType entry,
227    List<SequenceI> peptides)
228    {
229  1 String seqString = entry.getSequence();
230  1 if (seqString == null)
231    {
232  0 return null;
233    }
234  1 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
235    "");
236  1 String accession = entry.getAccession();
237  1 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
238   
239  1 dna.setDescription(entry.getDescription());
240  1 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
241  1 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
242    accession);
243  1 dna.addDBRef(selfRref);
244  1 selfRref.setMap(
245    new Mapping(null, new int[]
246    { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
247    1));
248   
249    /*
250    * add db references
251    */
252  1 List<XrefType> xrefs = entry.getXref();
253  1 if (xrefs != null)
254    {
255  1 for (XrefType xref : xrefs)
256    {
257  2 String acc = xref.getId();
258  2 String source = DBRefUtils.getCanonicalName(xref.getDb());
259  2 String version = xref.getSecondaryId();
260  2 if (version == null || "".equals(version))
261    {
262  1 version = "0";
263    }
264  2 dna.addDBRef(new DBRefEntry(source, version, acc));
265    }
266    }
267   
268  1 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
269  1 try
270    {
271  1 List<Feature> features = entry.getFeature();
272  1 if (features != null)
273    {
274  1 for (Feature feature : features)
275    {
276  3 if (FeatureProperties.isCodingFeature(sourceDb,
277    feature.getName()))
278    {
279  3 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
280    matcher);
281    }
282    }
283    }
284    } catch (Exception e)
285    {
286  0 jalview.bin.Console.errPrintln("EMBL Record Features parsing error!");
287  0 System.err
288    .println("Please report the following to help@jalview.org :");
289  0 jalview.bin.Console.errPrintln("EMBL Record " + accession);
290  0 jalview.bin.Console
291    .errPrintln("Resulted in exception: " + e.getMessage());
292  0 e.printStackTrace(System.err);
293    }
294   
295  1 return dna;
296    }
297   
298    /**
299    * Extracts coding region and product from a CDS feature and decorates it with
300    * annotations
301    *
302    * @param entry
303    * @param feature
304    * @param sourceDb
305    * @param dna
306    * @param peptides
307    * @param matcher
308    */
 
309  3 toggle void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
310    SequenceI dna, List<SequenceI> peptides,
311    SequenceIdMatcher matcher)
312    {
313  3 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
314  3 final String accession = entry.getAccession();
315  3 final String sequenceVersion = entry.getVersion().toString();
316   
317  3 int[] exons = getCdsRanges(entry.getAccession(), feature);
318   
319  3 String translation = null;
320  3 String proteinName = "";
321  3 String proteinId = null;
322  3 Map<String, String> vals = new Hashtable<>();
323   
324    /*
325    * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
326    * (phase is required for CDS features in GFF3 format)
327    */
328  3 int codonStart = 1;
329   
330    /*
331    * parse qualifiers, saving protein translation, protein id,
332    * codon start position, product (name), and 'other values'
333    */
334  3 if (feature.getQualifier() != null)
335    {
336  3 for (Qualifier q : feature.getQualifier())
337    {
338  7 String qname = q.getName();
339  7 String value = q.getValue();
340  7 value = value == null ? ""
341    : value.trim().replace(" ", "").replace("\n", "")
342    .replace("\t", "");
343  7 if (qname.equals("translation"))
344    {
345  3 translation = value;
346    }
347  4 else if (qname.equals("protein_id"))
348    {
349  3 proteinId = value;
350    }
351  1 else if (qname.equals("codon_start"))
352    {
353  0 try
354    {
355  0 codonStart = Integer.parseInt(value.trim());
356    } catch (NumberFormatException e)
357    {
358  0 jalview.bin.Console.errPrintln("Invalid codon_start in XML for "
359    + entry.getAccession() + ": " + e.getMessage());
360    }
361    }
362  1 else if (qname.equals("product"))
363    {
364    // sometimes name is returned e.g. for V00488
365  0 proteinName = value;
366    }
367    else
368    {
369    // throw anything else into the additional properties hash
370  1 if (!"".equals(value))
371    {
372  1 vals.put(qname, value);
373    }
374    }
375    }
376    }
377   
378  3 DBRefEntry proteinToEmblProteinRef = null;
379  3 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
380   
381  3 SequenceI product = null;
382  3 Mapping dnaToProteinMapping = null;
383  3 if (translation != null && proteinName != null && proteinId != null)
384    {
385  3 int translationLength = translation.length();
386   
387    /*
388    * look for product in peptides list, if not found, add it
389    */
390  3 product = matcher.findIdMatch(proteinId);
391  3 if (product == null)
392    {
393  3 product = new Sequence(proteinId, translation, 1,
394    translationLength);
395  3 product.setDescription(((proteinName.length() == 0)
396    ? "Protein Product from " + sourceDb
397    : proteinName));
398  3 peptides.add(product);
399  3 matcher.add(product);
400    }
401   
402    // we have everything - create the mapping and perhaps the protein
403    // sequence
404  3 if (exons == null || exons.length == 0)
405    {
406    /*
407    * workaround until we handle dna location for CDS sequence
408    * e.g. location="X53828.1:60..1058" correctly
409    */
410  0 jalview.bin.Console.errPrintln(
411    "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
412    + sourceDb + ":" + entry.getAccession() + ")");
413  0 int dnaLength = dna.getLength();
414  0 if (translationLength * 3 == (1 - codonStart + dnaLength))
415    {
416  0 jalview.bin.Console.errPrintln(
417    "Not allowing for additional stop codon at end of cDNA fragment... !");
418    // this might occur for CDS sequences where no features are marked
419  0 exons = new int[] { dna.getStart() + (codonStart - 1),
420    dna.getEnd() };
421  0 dnaToProteinMapping = new Mapping(product, exons,
422    new int[]
423    { 1, translationLength }, 3, 1);
424    }
425  0 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
426    {
427  0 jalview.bin.Console.errPrintln(
428    "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
429  0 exons = new int[] { dna.getStart() + (codonStart - 1),
430    dna.getEnd() - 3 };
431  0 dnaToProteinMapping = new Mapping(product, exons,
432    new int[]
433    { 1, translationLength }, 3, 1);
434    }
435    }
436    else
437    {
438    // Trim the exon mapping if necessary - the given product may only be a
439    // fragment of a larger protein. (EMBL:AY043181 is an example)
440   
441  3 if (isEmblCdna)
442    {
443    // TODO: Add a DbRef back to the parent EMBL sequence with the exon
444    // map
445    // if given a dataset reference, search dataset for parent EMBL
446    // sequence if it exists and set its map
447    // make a new feature annotating the coding contig
448    }
449    else
450    {
451    // final product length truncation check
452  3 int[] exons2 = adjustForProteinLength(translationLength, exons);
453  3 dnaToProteinMapping = new Mapping(product, exons2,
454    new int[]
455    { 1, translationLength }, 3, 1);
456  3 if (product != null)
457    {
458    /*
459    * make xref with mapping from protein to EMBL dna
460    */
461  3 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
462    sequenceVersion, proteinId,
463    new Mapping(dnaToProteinMapping.getMap().getInverse()));
464  3 product.addDBRef(proteinToEmblRef);
465   
466    /*
467    * make xref from protein to EMBLCDS; we assume here that the
468    * CDS sequence version is same as dna sequence (?!)
469    */
470  3 MapList proteinToCdsMapList = new MapList(
471    new int[]
472    { 1, translationLength },
473    new int[]
474    { 1 + (codonStart - 1),
475    (codonStart - 1) + 3 * translationLength },
476    1, 3);
477  3 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
478    DBRefSource.EMBLCDS, sequenceVersion, proteinId,
479    new Mapping(proteinToCdsMapList));
480  3 product.addDBRef(proteinToEmblCdsRef);
481   
482    /*
483    * make 'direct' xref from protein to EMBLCDSPROTEIN
484    */
485  3 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
486  3 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
487  3 proteinToEmblProteinRef.setMap(null);
488  3 product.addDBRef(proteinToEmblProteinRef);
489    }
490    }
491    }
492   
493    /*
494    * add cds features to dna sequence
495    */
496  3 String cds = feature.getName(); // "CDS"
497  7 for (int xint = 0; exons != null
498    && xint < exons.length - 1; xint += 2)
499    {
500  4 int exonStart = exons[xint];
501  4 int exonEnd = exons[xint + 1];
502  4 int begin = Math.min(exonStart, exonEnd);
503  4 int end = Math.max(exonStart, exonEnd);
504  4 int exonNumber = xint / 2 + 1;
505  4 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
506    exonNumber, proteinName, proteinId);
507   
508  4 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
509    vals);
510   
511  4 sf.setEnaLocation(feature.getLocation());
512  4 boolean forwardStrand = exonStart <= exonEnd;
513  4 sf.setStrand(forwardStrand ? "+" : "-");
514  4 sf.setPhase(String.valueOf(codonStart - 1));
515  4 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
516  4 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
517   
518  4 dna.addSequenceFeature(sf);
519    }
520    }
521   
522    /*
523    * add feature dbRefs to sequence, and mappings for Uniprot xrefs
524    */
525  3 boolean hasUniprotDbref = false;
526  3 List<XrefType> xrefs = feature.getXref();
527  3 if (xrefs != null)
528    {
529  3 boolean mappingUsed = false;
530  3 for (XrefType xref : xrefs)
531    {
532    /*
533    * ensure UniProtKB/Swiss-Prot converted to UNIPROT
534    */
535  3 String source = DBRefUtils.getCanonicalName(xref.getDb());
536  3 String version = xref.getSecondaryId();
537  3 if (version == null || "".equals(version))
538    {
539  2 version = "0";
540    }
541  3 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
542  3 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
543    dbref.getAccessionId());
544  3 if (source.equals(DBRefSource.UNIPROT))
545    {
546  3 String proteinSeqName = DBRefSource.UNIPROT + "|"
547    + dbref.getAccessionId();
548  3 if (dnaToProteinMapping != null
549    && dnaToProteinMapping.getTo() != null)
550    {
551  3 if (mappingUsed)
552    {
553    /*
554    * two or more Uniprot xrefs for the same CDS -
555    * each needs a distinct Mapping (as to a different sequence)
556    */
557  1 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
558    }
559  3 mappingUsed = true;
560   
561    /*
562    * try to locate the protein mapped to (possibly by a
563    * previous CDS feature); if not found, construct it from
564    * the EMBL translation
565    */
566  3 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
567  3 if (proteinSeq == null)
568    {
569  3 proteinSeq = new Sequence(proteinSeqName,
570    product.getSequenceAsString());
571  3 matcher.add(proteinSeq);
572  3 proteinSeq.setDescription(product.getDescription());
573  3 peptides.add(proteinSeq);
574    }
575  3 dnaToProteinMapping.setTo(proteinSeq);
576  3 dnaToProteinMapping.setMappedFromId(proteinId);
577  3 proteinSeq.addDBRef(proteinDbRef);
578  3 dbref.setMap(dnaToProteinMapping);
579    }
580  3 hasUniprotDbref = true;
581    }
582  3 if (product != null)
583    {
584    /*
585    * copy feature dbref to our protein product
586    */
587  3 DBRefEntry pref = proteinDbRef;
588  3 pref.setMap(null); // reference is direct
589  3 product.addDBRef(pref);
590    // Add converse mapping reference
591  3 if (dnaToProteinMapping != null)
592    {
593  3 Mapping pmap = new Mapping(dna,
594    dnaToProteinMapping.getMap().getInverse());
595  3 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
596  3 pref.setMap(pmap);
597  3 if (dnaToProteinMapping.getTo() != null)
598    {
599  3 dnaToProteinMapping.getTo().addDBRef(pref);
600    }
601    }
602    }
603  3 dna.addDBRef(dbref);
604    }
605    }
606   
607    /*
608    * if we have a product (translation) but no explicit Uniprot dbref
609    * (example: EMBL AAFI02000057 protein_id EAL65544.1)
610    * then construct mappings to an assumed EMBLCDSPROTEIN accession
611    */
612  3 if (!hasUniprotDbref && product != null)
613    {
614  1 if (proteinToEmblProteinRef == null)
615    {
616    // assuming CDSPROTEIN sequence version = dna version (?!)
617  0 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
618    sequenceVersion, proteinId);
619    }
620  1 product.addDBRef(proteinToEmblProteinRef);
621   
622  1 if (dnaToProteinMapping != null
623    && dnaToProteinMapping.getTo() != null)
624    {
625  1 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
626    DBRefSource.EMBLCDSProduct, sequenceVersion, proteinId);
627  1 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
628  1 dnaToProteinMapping.setMappedFromId(proteinId);
629  1 dna.addDBRef(dnaToEmblProteinRef);
630    }
631    }
632    }
633   
 
634  0 toggle @Override
635    public boolean isDnaCoding()
636    {
637  0 return true;
638    }
639   
640    /**
641    * Returns the CDS positions as a single array of [start, end, start, end...]
642    * positions. If on the reverse strand, these will be in descending order.
643    *
644    * @param accession
645    * @param feature
646    * @return
647    */
 
648  4 toggle protected int[] getCdsRanges(String accession, Feature feature)
649    {
650  4 String location = feature.getLocation();
651  4 if (location == null)
652    {
653  0 return new int[] {};
654    }
655   
656  4 try
657    {
658  4 List<int[]> ranges = DnaUtils.parseLocation(location);
659  4 return listToArray(ranges);
660    } catch (ParseException e)
661    {
662  0 Console.warn(
663    String.format("Not parsing inexact CDS location %s in ENA %s",
664    location, accession));
665  0 return new int[] {};
666    }
667    }
668   
669    /**
670    * Converts a list of [start, end] ranges to a single array of [start, end,
671    * start, end ...]
672    *
673    * @param ranges
674    * @return
675    */
 
676  4 toggle int[] listToArray(List<int[]> ranges)
677    {
678  4 int[] result = new int[ranges.size() * 2];
679  4 int i = 0;
680  4 for (int[] range : ranges)
681    {
682  9 result[i++] = range[0];
683  9 result[i++] = range[1];
684    }
685  4 return result;
686    }
687   
688    /**
689    * Helper method to construct a SequenceFeature for one cds range
690    *
691    * @param type
692    * feature type ("CDS")
693    * @param desc
694    * description
695    * @param begin
696    * start position
697    * @param end
698    * end position
699    * @param group
700    * feature group
701    * @param vals
702    * map of 'miscellaneous values' for feature
703    * @return
704    */
 
705  4 toggle protected SequenceFeature makeCdsFeature(String type, String desc,
706    int begin, int end, String group, Map<String, String> vals)
707    {
708  4 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
709  4 if (!vals.isEmpty())
710    {
711  1 for (Entry<String, String> val : vals.entrySet())
712    {
713  1 sf.setValue(val.getKey(), val.getValue());
714    }
715    }
716  4 return sf;
717    }
718   
 
719  0 toggle @Override
720    public String getAccessionSeparator()
721    {
722  0 return null;
723    }
724   
 
725  0 toggle @Override
726    public Regex getAccessionValidator()
727    {
728  0 return ACCESSION_REGEX;
729    }
730   
 
731  0 toggle @Override
732    public String getDbVersion()
733    {
734  0 return "0";
735    }
736   
 
737  0 toggle @Override
738    public int getTier()
739    {
740  0 return 0;
741    }
742   
 
743  0 toggle @Override
744    public boolean isValidReference(String accession)
745    {
746  0 if (accession == null || accession.length() < 2)
747    {
748  0 return false;
749    }
750  0 return getAccessionValidator().search(accession);
751    }
752   
753    /**
754    * Truncates (if necessary) the exon intervals to match 3 times the length of
755    * the protein (including truncation for stop codon included in exon)
756    *
757    * @param proteinLength
758    * @param exon
759    * an array of [start, end, start, end...] intervals
760    * @return the same array (if unchanged) or a truncated copy
761    */
 
762  9 toggle static int[] adjustForProteinLength(int proteinLength, int[] exon)
763    {
764  9 if (proteinLength <= 0 || exon == null)
765    {
766  0 return exon;
767    }
768  9 int expectedCdsLength = proteinLength * 3;
769  9 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
770   
771    /*
772    * if exon length matches protein, or is shorter, then leave it unchanged
773    */
774  9 if (expectedCdsLength >= exonLength)
775    {
776  5 return exon;
777    }
778   
779  4 int origxon[];
780  4 int sxpos = -1;
781  4 int endxon = 0;
782  4 origxon = new int[exon.length];
783  4 System.arraycopy(exon, 0, origxon, 0, exon.length);
784  4 int cdspos = 0;
785  10 for (int x = 0; x < exon.length; x += 2)
786    {
787  10 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
788  10 if (expectedCdsLength <= cdspos)
789    {
790    // advanced beyond last codon.
791  4 sxpos = x;
792  4 if (expectedCdsLength != cdspos)
793    {
794    // System.err
795    // .println("Truncating final exon interval on region by "
796    // + (cdspos - cdslength));
797    }
798   
799    /*
800    * shrink the final exon - reduce end position if forward
801    * strand, increase it if reverse
802    */
803  4 if (exon[x + 1] >= exon[x])
804    {
805  4 endxon = exon[x + 1] - cdspos + expectedCdsLength;
806    }
807    else
808    {
809  0 endxon = exon[x + 1] + cdspos - expectedCdsLength;
810    }
811  4 break;
812    }
813    }
814   
815  4 if (sxpos != -1)
816    {
817    // and trim the exon interval set if necessary
818  4 int[] nxon = new int[sxpos + 2];
819  4 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
820  4 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
821    // set
822  4 exon = nxon;
823    }
824  4 return exon;
825    }
826   
827    }