Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.ws.dbsources

File EmblXmlSource.java

 

Coverage histogram

../../../img/srcFileCovDistChart7.png
27% of files have more coverage

Code metrics

102
248
11
1
791
537
79
0.32
22.55
11
7.18

Classes

Class Line # Actions
EmblXmlSource 66 248 79
0.7063712 70.6%
 

Contributing tests

This file is covered by 4 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.ws.dbsources;
22   
23    import jalview.analysis.SequenceIdMatcher;
24    import jalview.bin.Cache;
25    import jalview.datamodel.Alignment;
26    import jalview.datamodel.AlignmentI;
27    import jalview.datamodel.DBRefEntry;
28    import jalview.datamodel.DBRefSource;
29    import jalview.datamodel.FeatureProperties;
30    import jalview.datamodel.Mapping;
31    import jalview.datamodel.Sequence;
32    import jalview.datamodel.SequenceFeature;
33    import jalview.datamodel.SequenceI;
34    import jalview.util.DBRefUtils;
35    import jalview.util.DnaUtils;
36    import jalview.util.MapList;
37    import jalview.util.MappingUtils;
38    import jalview.util.MessageManager;
39    import jalview.ws.ebi.EBIFetchClient;
40    import jalview.xml.binding.embl.EntryType;
41    import jalview.xml.binding.embl.EntryType.Feature;
42    import jalview.xml.binding.embl.EntryType.Feature.Qualifier;
43    import jalview.xml.binding.jalview.JalviewModel;
44    import jalview.xml.binding.embl.ROOT;
45    import jalview.xml.binding.embl.XrefType;
46   
47    import java.io.File;
48    import java.io.FileInputStream;
49    import java.io.InputStream;
50    import java.text.ParseException;
51    import java.util.ArrayList;
52    import java.util.Arrays;
53    import java.util.Hashtable;
54    import java.util.List;
55    import java.util.Map;
56    import java.util.Map.Entry;
57   
58    import javax.xml.bind.JAXBContext;
59    import javax.xml.bind.JAXBElement;
60    import javax.xml.bind.JAXBException;
61    import javax.xml.stream.FactoryConfigurationError;
62    import javax.xml.stream.XMLInputFactory;
63    import javax.xml.stream.XMLStreamException;
64    import javax.xml.stream.XMLStreamReader;
65   
 
66    public abstract class EmblXmlSource extends EbiFileRetrievedProxy
67    {
68    /*
     * JAL-1856 Embl returns this text for query not found. In this class only
     * the length of the text is used: a reply file that is no longer than this
     * message is not parsed for entries.
     */
71    private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found.";
72   
 
73  20 toggle public EmblXmlSource()
74    {
75  20 super();
76    }
77   
78    /**
79    * Retrieves and parses an emblxml file, and returns an alignment containing
80    * the parsed sequences, or null if none were found
81    *
82    * @param emprefx
83    * "EMBL" or "EMBLCDS" - anything else will not retrieve emblxml
84    * @param query
85    * @return
86    * @throws Exception
87    */
 
88  0 toggle protected AlignmentI getEmblSequenceRecords(String emprefx, String query)
89    throws Exception
90    {
91  0 startQuery();
92  0 EBIFetchClient dbFetch = new EBIFetchClient();
93  0 File reply;
94  0 try
95    {
96  0 reply = dbFetch.fetchDataAsFile(
97    emprefx.toLowerCase() + ":" + query.trim(), "display=xml",
98    "xml");
99    } catch (Exception e)
100    {
101  0 stopQuery();
102  0 throw new Exception(MessageManager.formatMessage(
103    "exception.ebiembl_retrieval_failed_on", new String[]
104    { emprefx.toLowerCase(), query.trim() }), e);
105    }
106  0 return getEmblSequenceRecords(emprefx, query, reply);
107    }
108   
109    /**
110    * parse an emblxml file stored locally
111    *
112    * @param emprefx
113    * either EMBL or EMBLCDS strings are allowed - anything else will
114    * not retrieve emblxml
115    * @param query
116    * @param file
117    * the EMBL XML file containing the results of a query
118    * @return
119    * @throws Exception
120    */
 
121  0 toggle protected AlignmentI getEmblSequenceRecords(String emprefx, String query,
122    File reply) throws Exception
123    {
124  0 List<EntryType> entries = null;
125  0 if (reply != null && reply.exists())
126    {
127  0 file = reply.getAbsolutePath();
128  0 if (reply.length() > EMBL_NOT_FOUND_REPLY.length())
129    {
130  0 InputStream is = new FileInputStream(reply);
131  0 entries = getEmblEntries(is);
132    }
133    }
134   
135    /*
136    * invalid accession gets a reply with no <entry> elements, text content of
137    * EmbFile reads something like (e.g.) this ungrammatical phrase
138    * Entry: <acc> display type is either not supported or entry is not found.
139    */
140  0 AlignmentI al = null;
141  0 List<SequenceI> seqs = new ArrayList<>();
142  0 List<SequenceI> peptides = new ArrayList<>();
143  0 if (entries != null)
144    {
145  0 for (EntryType entry : entries)
146    {
147  0 SequenceI seq = getSequence(emprefx, entry, peptides);
148  0 if (seq != null)
149    {
150  0 seqs.add(seq.deriveSequence());
151    // place DBReferences on dataset and refer
152    }
153    }
154  0 if (!seqs.isEmpty())
155    {
156  0 al = new Alignment(seqs.toArray(new SequenceI[seqs.size()]));
157    }
158    else
159    {
160  0 System.out.println(
161    "No record found for '" + emprefx + ":" + query + "'");
162    }
163    }
164   
165  0 stopQuery();
166  0 return al;
167    }
168   
169    /**
170    * Reads the XML reply from file and unmarshals it to Java objects. Answers a
171    * (possibly empty) list of <code>EntryType</code> objects.
172    *
173    * is
174    *
175    * @return
176    */
 
177  2 toggle List<EntryType> getEmblEntries(InputStream is)
178    {
179  2 List<EntryType> entries = new ArrayList<>();
180  2 try
181    {
182  2 JAXBContext jc = JAXBContext.newInstance("jalview.xml.binding.embl");
183  2 XMLStreamReader streamReader = XMLInputFactory.newInstance()
184    .createXMLStreamReader(is);
185  2 javax.xml.bind.Unmarshaller um = jc.createUnmarshaller();
186  2 JAXBElement<ROOT> rootElement = um.unmarshal(streamReader, ROOT.class);
187  2 ROOT root = rootElement.getValue();
188   
189    /*
190    * document root contains either "entry" or "entrySet"
191    */
192  2 if (root == null)
193    {
194  0 return entries;
195    }
196  2 if (root.getEntrySet() != null)
197    {
198  0 entries = root.getEntrySet().getEntry();
199    }
200  2 else if (root.getEntry() != null)
201    {
202  2 entries.add(root.getEntry());
203    }
204    } catch (JAXBException | XMLStreamException
205    | FactoryConfigurationError e)
206    {
207  0 e.printStackTrace();
208    }
209  2 return entries;
210    }
211   
212    /**
213    * A helper method to parse XML data and construct a sequence, with any
214    * available database references and features
215    *
216    * @param emprefx
217    * @param entry
218    * @param peptides
219    * @return
220    */
 
221  1 toggle SequenceI getSequence(String sourceDb, EntryType entry,
222    List<SequenceI> peptides)
223    {
224  1 String seqString = entry.getSequence();
225  1 if (seqString == null)
226    {
227  0 return null;
228    }
229  1 seqString = seqString.replace(" ", "").replace("\n", "").replace("\t",
230    "");
231  1 String accession = entry.getAccession();
232  1 SequenceI dna = new Sequence(sourceDb + "|" + accession, seqString);
233   
234  1 dna.setDescription(entry.getDescription());
235  1 String sequenceVersion = String.valueOf(entry.getVersion().intValue());
236  1 DBRefEntry selfRref = new DBRefEntry(sourceDb, sequenceVersion,
237    accession);
238  1 dna.addDBRef(selfRref);
239  1 selfRref.setMap(
240    new Mapping(null, new int[]
241    { 1, dna.getLength() }, new int[] { 1, dna.getLength() }, 1,
242    1));
243   
244    /*
245    * add db references
246    */
247  1 List<XrefType> xrefs = entry.getXref();
248  1 if (xrefs != null)
249    {
250  1 for (XrefType xref : xrefs)
251    {
252  2 String acc = xref.getId();
253  2 String source = DBRefUtils.getCanonicalName(xref.getDb());
254  2 String version = xref.getSecondaryId();
255  2 if (version == null || "".equals(version))
256    {
257  1 version = "0";
258    }
259  2 dna.addDBRef(new DBRefEntry(source, version, acc));
260    }
261    }
262   
263  1 SequenceIdMatcher matcher = new SequenceIdMatcher(peptides);
264  1 try
265    {
266  1 List<Feature> features = entry.getFeature();
267  1 if (features != null)
268    {
269  1 for (Feature feature : features)
270    {
271  3 if (FeatureProperties.isCodingFeature(sourceDb,
272    feature.getName()))
273    {
274  3 parseCodingFeature(entry, feature, sourceDb, dna, peptides,
275    matcher);
276    }
277    }
278    }
279    } catch (Exception e)
280    {
281  0 System.err.println("EMBL Record Features parsing error!");
282  0 System.err
283    .println("Please report the following to help@jalview.org :");
284  0 System.err.println("EMBL Record " + accession);
285  0 System.err.println("Resulted in exception: " + e.getMessage());
286  0 e.printStackTrace(System.err);
287    }
288   
289  1 return dna;
290    }
291   
292    /**
    * Extracts coding region and product from a CDS feature and decorates it with
    * annotations.
    * <p>
    * In outline, as the code below reads:
    * <ol>
    * <li>the feature's qualifiers are scanned for translation, protein_id,
    * codon_start and product; any other non-empty qualifier is kept as a
    * miscellaneous value</li>
    * <li>the CDS ranges are adjusted for codon_start</li>
    * <li>if a translation and protein id were found, the product sequence is
    * looked up (or created and added to <code>peptides</code>), a dna to
    * protein mapping and product database references are built where
    * possible, and one sequence feature per CDS range is added to
    * <code>dna</code></li>
    * <li>each xref on the feature is added to <code>dna</code> as a database
    * reference, with extra handling for Uniprot xrefs</li>
    * <li>if there is a product but no Uniprot xref, references to an assumed
    * EMBLCDSPROTEIN accession are added instead</li>
    * </ol>
    *
    * @param entry
    *          the EMBL entry owning the feature (supplies accession and version)
    * @param feature
    *          the coding feature to parse
    * @param sourceDb
    *          database name; compared with EMBLCDS, and used as the feature
    *          group and in descriptions and database references
    * @param dna
    *          the nucleotide sequence that features and references are added to
    * @param peptides
    *          list of protein sequences, added to when a new product or
    *          Uniprot sequence is constructed
    * @param matcher
    *          used to find already known protein sequences by id; newly
    *          constructed proteins are added to it
    */
 
303  3 toggle void parseCodingFeature(EntryType entry, Feature feature, String sourceDb,
304    SequenceI dna, List<SequenceI> peptides,
305    SequenceIdMatcher matcher)
306    {
307  3 final boolean isEmblCdna = sourceDb.equals(DBRefSource.EMBLCDS);
308  3 final String accession = entry.getAccession();
309  3 final String sequenceVersion = entry.getVersion().toString();
310   
311  3 int[] exons = getCdsRanges(entry.getAccession(), feature);
312   
313  3 String translation = null;
314  3 String proteinName = "";
315  3 String proteinId = null;
316  3 Map<String, String> vals = new Hashtable<>();
317   
318    /*
319    * codon_start 1/2/3 in EMBL corresponds to phase 0/1/2 in CDS
320    * (phase is required for CDS features in GFF3 format)
321    */
322  3 int codonStart = 1;
323   
324    /*
325    * parse qualifiers, saving protein translation, protein id,
326    * codon start position, product (name), and 'other values'
327    */
328  3 if (feature.getQualifier() != null)
329    {
330  3 for (Qualifier q : feature.getQualifier())
331    {
332  7 String qname = q.getName();
333  7 String value = q.getValue();
       // a null value becomes ""; otherwise spaces, newlines and tabs are removed
334  7 value = value == null ? ""
335    : value.trim().replace(" ", "").replace("\n", "")
336    .replace("\t", "");
337  7 if (qname.equals("translation"))
338    {
339  3 translation = value;
340    }
341  4 else if (qname.equals("protein_id"))
342    {
343  3 proteinId = value;
344    }
345  1 else if (qname.equals("codon_start"))
346    {
347  0 try
348    {
349  0 codonStart = Integer.parseInt(value.trim());
350    } catch (NumberFormatException e)
351    {
       // an unparseable codon_start is reported and the default of 1 is kept
352  0 System.err.println("Invalid codon_start in XML for "
353    + entry.getAccession() + ": " + e.getMessage());
354    }
355    }
356  1 else if (qname.equals("product"))
357    {
358    // sometimes name is returned e.g. for V00488
359  0 proteinName = value;
360    }
361    else
362    {
363    // throw anything else into the additional properties hash
364  1 if (!"".equals(value))
365    {
366  1 vals.put(qname, value);
367    }
368    }
369    }
370    }
371   
372  3 DBRefEntry proteinToEmblProteinRef = null;
       // adjust the CDS ranges for a codon_start greater than 1
373  3 exons = MappingUtils.removeStartPositions(codonStart - 1, exons);
374   
375  3 SequenceI product = null;
376  3 Mapping dnaToProteinMapping = null;
       // NOTE(review): proteinName starts as "" and is only ever assigned a
       // non-null qualifier value above, so the proteinName test looks always true
377  3 if (translation != null && proteinName != null && proteinId != null)
378    {
379  3 int translationLength = translation.length();
380   
381    /*
382    * look for product in peptides list, if not found, add it
383    */
384  3 product = matcher.findIdMatch(proteinId);
385  3 if (product == null)
386    {
387  3 product = new Sequence(proteinId, translation, 1,
388    translationLength);
389  3 product.setDescription(((proteinName.length() == 0)
390    ? "Protein Product from " + sourceDb
391    : proteinName));
392  3 peptides.add(product);
393  3 matcher.add(product);
394    }
395   
396    // we have everything - create the mapping and perhaps the protein
397    // sequence
398  3 if (exons == null || exons.length == 0)
399    {
400    /*
401    * workaround until we handle dna location for CDS sequence
402    * e.g. location="X53828.1:60..1058" correctly
403    */
404  0 System.err.println(
405    "Implementation Notice: EMBLCDS records not properly supported yet - Making up the CDNA region of this sequence... may be incorrect ("
406    + sourceDb + ":" + entry.getAccession() + ")");
407  0 int dnaLength = dna.getLength();
       // case 1: dna (from codon start) is exactly three bases per residue
408  0 if (translationLength * 3 == (1 - codonStart + dnaLength))
409    {
410  0 System.err.println(
411    "Not allowing for additional stop codon at end of cDNA fragment... !");
412    // this might occur for CDS sequences where no features are marked
413  0 exons = new int[] { dna.getStart() + (codonStart - 1),
414    dna.getEnd() };
415  0 dnaToProteinMapping = new Mapping(product, exons,
416    new int[]
417    { 1, translationLength }, 3, 1);
418    }
       // case 2: dna also holds one extra codon, excluded from the mapped range
419  0 if ((translationLength + 1) * 3 == (1 - codonStart + dnaLength))
420    {
421  0 System.err.println(
422    "Allowing for additional stop codon at end of cDNA fragment... will probably cause an error in VAMSAs!");
423  0 exons = new int[] { dna.getStart() + (codonStart - 1),
424    dna.getEnd() - 3 };
425  0 dnaToProteinMapping = new Mapping(product, exons,
426    new int[]
427    { 1, translationLength }, 3, 1);
428    }
429    }
430    else
431    {
432    // Trim the exon mapping if necessary - the given product may only be a
433    // fragment of a larger protein. (EMBL:AY043181 is an example)
434   
435  3 if (isEmblCdna)
436    {
       // nothing is done here yet, so dnaToProteinMapping stays null for EMBLCDS
437    // TODO: Add a DbRef back to the parent EMBL sequence with the exon
438    // map
439    // if given a dataset reference, search dataset for parent EMBL
440    // sequence if it exists and set its map
441    // make a new feature annotating the coding contig
442    }
443    else
444    {
445    // final product length truncation check
446  3 int[] cdsRanges = adjustForProteinLength(translationLength,
447    exons);
448  3 dnaToProteinMapping = new Mapping(product, cdsRanges,
449    new int[]
450    { 1, translationLength }, 3, 1);
       // NOTE(review): product was found or created above, so this null
       // check looks redundant
451  3 if (product != null)
452    {
453    /*
454    * make xref with mapping from protein to EMBL dna
455    */
456  3 DBRefEntry proteinToEmblRef = new DBRefEntry(DBRefSource.EMBL,
457    sequenceVersion, proteinId,
458    new Mapping(dnaToProteinMapping.getMap().getInverse()));
459  3 product.addDBRef(proteinToEmblRef);
460   
461    /*
462    * make xref from protein to EMBLCDS; we assume here that the
463    * CDS sequence version is same as dna sequence (?!)
464    */
465  3 MapList proteinToCdsMapList = new MapList(
466    new int[]
467    { 1, translationLength },
468    new int[]
469    { 1 + (codonStart - 1),
470    (codonStart - 1) + 3 * translationLength },
471    1, 3);
472  3 DBRefEntry proteinToEmblCdsRef = new DBRefEntry(
473    DBRefSource.EMBLCDS, sequenceVersion, proteinId,
474    new Mapping(proteinToCdsMapList));
475  3 product.addDBRef(proteinToEmblCdsRef);
476   
477    /*
478    * make 'direct' xref from protein to EMBLCDSPROTEIN
479    */
480  3 proteinToEmblProteinRef = new DBRefEntry(proteinToEmblCdsRef);
481  3 proteinToEmblProteinRef.setSource(DBRefSource.EMBLCDSProduct);
482  3 proteinToEmblProteinRef.setMap(null);
483  3 product.addDBRef(proteinToEmblProteinRef);
484    }
485    }
486    }
487   
488    /*
489    * add cds features to dna sequence
490    */
491  3 String cds = feature.getName(); // "CDS"
       // exons holds [start, end, start, end...]; step through it a pair at a time
492  7 for (int xint = 0; exons != null
493    && xint < exons.length - 1; xint += 2)
494    {
495  4 int exonStart = exons[xint];
496  4 int exonEnd = exons[xint + 1];
       // feature positions are always ascending; strand records the direction
497  4 int begin = Math.min(exonStart, exonEnd);
498  4 int end = Math.max(exonStart, exonEnd);
499  4 int exonNumber = xint / 2 + 1;
500  4 String desc = String.format("Exon %d for protein '%s' EMBLCDS:%s",
501    exonNumber, proteinName, proteinId);
502   
503  4 SequenceFeature sf = makeCdsFeature(cds, desc, begin, end, sourceDb,
504    vals);
505   
506  4 sf.setEnaLocation(feature.getLocation());
507  4 boolean forwardStrand = exonStart <= exonEnd;
508  4 sf.setStrand(forwardStrand ? "+" : "-");
509  4 sf.setPhase(String.valueOf(codonStart - 1));
510  4 sf.setValue(FeatureProperties.EXONPOS, exonNumber);
511  4 sf.setValue(FeatureProperties.EXONPRODUCT, proteinName);
512   
513  4 dna.addSequenceFeature(sf);
514    }
515    }
516   
517    /*
518    * add feature dbRefs to sequence, and mappings for Uniprot xrefs
519    */
520  3 boolean hasUniprotDbref = false;
521  3 List<XrefType> xrefs = feature.getXref();
522  3 if (xrefs != null)
523    {
524  3 boolean mappingUsed = false;
525  3 for (XrefType xref : xrefs)
526    {
527    /*
528    * ensure UniProtKB/Swiss-Prot converted to UNIPROT
529    */
530  3 String source = DBRefUtils.getCanonicalName(xref.getDb());
531  3 String version = xref.getSecondaryId();
532  3 if (version == null || "".equals(version))
533    {
534  2 version = "0";
535    }
       // dbref is added to the dna sequence; proteinDbRef is a separate
       // reference with the same source, version and accession for proteins
536  3 DBRefEntry dbref = new DBRefEntry(source, version, xref.getId());
537  3 DBRefEntry proteinDbRef = new DBRefEntry(source, version,
538    dbref.getAccessionId());
539  3 if (source.equals(DBRefSource.UNIPROT))
540    {
541  3 String proteinSeqName = DBRefSource.UNIPROT + "|"
542    + dbref.getAccessionId();
543  3 if (dnaToProteinMapping != null
544    && dnaToProteinMapping.getTo() != null)
545    {
546  3 if (mappingUsed)
547    {
548    /*
549    * two or more Uniprot xrefs for the same CDS -
550    * each needs a distinct Mapping (as to a different sequence)
551    */
552  1 dnaToProteinMapping = new Mapping(dnaToProteinMapping);
553    }
554  3 mappingUsed = true;
555   
556    /*
557    * try to locate the protein mapped to (possibly by a
558    * previous CDS feature); if not found, construct it from
559    * the EMBL translation
560    */
561  3 SequenceI proteinSeq = matcher.findIdMatch(proteinSeqName);
562  3 if (proteinSeq == null)
563    {
564  3 proteinSeq = new Sequence(proteinSeqName,
565    product.getSequenceAsString());
566  3 matcher.add(proteinSeq);
567  3 peptides.add(proteinSeq);
568    }
569  3 dnaToProteinMapping.setTo(proteinSeq);
570  3 dnaToProteinMapping.setMappedFromId(proteinId);
571  3 proteinSeq.addDBRef(proteinDbRef);
572  3 dbref.setMap(dnaToProteinMapping);
573    }
574  3 hasUniprotDbref = true;
575    }
576  3 if (product != null)
577    {
578    /*
579    * copy feature dbref to our protein product
580    */
581  3 DBRefEntry pref = proteinDbRef;
582  3 pref.setMap(null); // reference is direct
583  3 product.addDBRef(pref);
584    // Add converse mapping reference
585  3 if (dnaToProteinMapping != null)
586    {
587  3 Mapping pmap = new Mapping(dna,
588    dnaToProteinMapping.getMap().getInverse());
589  3 pref = new DBRefEntry(sourceDb, sequenceVersion, accession);
590  3 pref.setMap(pmap);
591  3 if (dnaToProteinMapping.getTo() != null)
592    {
593  3 dnaToProteinMapping.getTo().addDBRef(pref);
594    }
595    }
596    }
597  3 dna.addDBRef(dbref);
598    }
599    }
600   
601    /*
602    * if we have a product (translation) but no explicit Uniprot dbref
603    * (example: EMBL AAFI02000057 protein_id EAL65544.1)
604    * then construct mappings to an assumed EMBLCDSPROTEIN accession
605    */
606  3 if (!hasUniprotDbref && product != null)
607    {
       // still null if the non-EMBLCDS branch with CDS ranges above was not taken
608  1 if (proteinToEmblProteinRef == null)
609    {
610    // assuming CDSPROTEIN sequence version = dna version (?!)
611  0 proteinToEmblProteinRef = new DBRefEntry(DBRefSource.EMBLCDSProduct,
612    sequenceVersion, proteinId);
613    }
614  1 product.addDBRef(proteinToEmblProteinRef);
615   
616  1 if (dnaToProteinMapping != null
617    && dnaToProteinMapping.getTo() != null)
618    {
619  1 DBRefEntry dnaToEmblProteinRef = new DBRefEntry(
620    DBRefSource.EMBLCDSProduct, sequenceVersion,
621    proteinId);
622  1 dnaToEmblProteinRef.setMap(dnaToProteinMapping);
623  1 dnaToProteinMapping.setMappedFromId(proteinId);
624  1 dna.addDBRef(dnaToEmblProteinRef);
625    }
626    }
627    }
628   
 
629  0 toggle @Override
630    public boolean isDnaCoding()
631    {
632  0 return true;
633    }
634   
635    /**
636    * Returns the CDS positions as a single array of [start, end, start, end...]
637    * positions. If on the reverse strand, these will be in descending order.
638    *
639    * @param accession
640    * @param feature
641    * @return
642    */
 
643  4 toggle protected int[] getCdsRanges(String accession, Feature feature)
644    {
645  4 String location = feature.getLocation();
646  4 if (location == null)
647    {
648  0 return new int[] {};
649    }
650   
651  4 try
652    {
653  4 List<int[]> ranges = DnaUtils.parseLocation(location);
654  4 return listToArray(ranges);
655    } catch (ParseException e)
656    {
657  0 Cache.log.warn(
658    String.format("Not parsing inexact CDS location %s in ENA %s",
659    location, accession));
660  0 return new int[] {};
661    }
662    }
663   
664    /**
665    * Converts a list of [start, end] ranges to a single array of [start, end,
666    * start, end ...]
667    *
668    * @param ranges
669    * @return
670    */
 
671  4 toggle int[] listToArray(List<int[]> ranges)
672    {
673  4 int[] result = new int[ranges.size() * 2];
674  4 int i = 0;
675  4 for (int[] range : ranges)
676    {
677  9 result[i++] = range[0];
678  9 result[i++] = range[1];
679    }
680  4 return result;
681    }
682   
683    /**
684    * Helper method to construct a SequenceFeature for one cds range
685    *
686    * @param type
687    * feature type ("CDS")
688    * @param desc
689    * description
690    * @param begin
691    * start position
692    * @param end
693    * end position
694    * @param group
695    * feature group
696    * @param vals
697    * map of 'miscellaneous values' for feature
698    * @return
699    */
 
700  4 toggle protected SequenceFeature makeCdsFeature(String type, String desc,
701    int begin, int end, String group, Map<String, String> vals)
702    {
703  4 SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group);
704  4 if (!vals.isEmpty())
705    {
706  1 for (Entry<String, String> val : vals.entrySet())
707    {
708  1 sf.setValue(val.getKey(), val.getValue());
709    }
710    }
711  4 return sf;
712    }
713   
714    /**
715    * Truncates (if necessary) the exon intervals to match 3 times the length of
716    * the protein; also accepts 3 bases longer (for stop codon not included in
717    * protein)
718    *
719    * @param proteinLength
720    * @param exon
721    * an array of [start, end, start, end...] intervals
722    * @return the same array (if unchanged) or a truncated copy
723    */
 
724  9 toggle static int[] adjustForProteinLength(int proteinLength, int[] exon)
725    {
726  9 if (proteinLength <= 0 || exon == null)
727    {
728  0 return exon;
729    }
730  9 int expectedCdsLength = proteinLength * 3;
731  9 int exonLength = MappingUtils.getLength(Arrays.asList(exon));
732   
733    /*
734    * if exon length matches protein, or is shorter, or longer by the
735    * length of a stop codon (3 bases), then leave it unchanged
736    */
737  9 if (expectedCdsLength >= exonLength
738    || expectedCdsLength == exonLength - 3)
739    {
740  6 return exon;
741    }
742   
743  3 int origxon[];
744  3 int sxpos = -1;
745  3 int endxon = 0;
746  3 origxon = new int[exon.length];
747  3 System.arraycopy(exon, 0, origxon, 0, exon.length);
748  3 int cdspos = 0;
749  7 for (int x = 0; x < exon.length; x += 2)
750    {
751  7 cdspos += Math.abs(exon[x + 1] - exon[x]) + 1;
752  7 if (expectedCdsLength <= cdspos)
753    {
754    // advanced beyond last codon.
755  3 sxpos = x;
756  3 if (expectedCdsLength != cdspos)
757    {
758    // System.err
759    // .println("Truncating final exon interval on region by "
760    // + (cdspos - cdslength));
761    }
762   
763    /*
764    * shrink the final exon - reduce end position if forward
765    * strand, increase it if reverse
766    */
767  3 if (exon[x + 1] >= exon[x])
768    {
769  3 endxon = exon[x + 1] - cdspos + expectedCdsLength;
770    }
771    else
772    {
773  0 endxon = exon[x + 1] + cdspos - expectedCdsLength;
774    }
775  3 break;
776    }
777    }
778   
779  3 if (sxpos != -1)
780    {
781    // and trim the exon interval set if necessary
782  3 int[] nxon = new int[sxpos + 2];
783  3 System.arraycopy(exon, 0, nxon, 0, sxpos + 2);
784  3 nxon[sxpos + 1] = endxon; // update the end boundary for the new exon
785    // set
786  3 exon = nxon;
787    }
788  3 return exon;
789    }
790   
791    }