Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.io.vcf

File VCFLoader.java

 

Coverage histogram

../../../img/srcFileCovDistChart7.png
27% of files have more coverage

Code metrics

180
407
35
2
1,659
911
152
0.37
11.63
17.5
4.34

Classes

Class Line # Actions
VCFLoader 77 404 150
0.7001620570%
VCFLoader.VCFMap 106 3 2
0.660%
 

Contributing tests

This file is covered by 4 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.vcf;
22   
23    import jalview.analysis.Dna;
24    import jalview.api.AlignViewControllerGuiI;
25    import jalview.bin.Cache;
26    import jalview.datamodel.DBRefEntry;
27    import jalview.datamodel.GeneLociI;
28    import jalview.datamodel.Mapping;
29    import jalview.datamodel.SequenceFeature;
30    import jalview.datamodel.SequenceI;
31    import jalview.datamodel.features.FeatureAttributeType;
32    import jalview.datamodel.features.FeatureSource;
33    import jalview.datamodel.features.FeatureSources;
34    import jalview.ext.ensembl.EnsemblMap;
35    import jalview.ext.htsjdk.HtsContigDb;
36    import jalview.ext.htsjdk.VCFReader;
37    import jalview.io.gff.Gff3Helper;
38    import jalview.io.gff.SequenceOntologyI;
39    import jalview.util.MapList;
40    import jalview.util.MappingUtils;
41    import jalview.util.MessageManager;
42    import jalview.util.StringUtils;
43   
44    import java.io.File;
45    import java.io.IOException;
46    import java.util.ArrayList;
47    import java.util.HashMap;
48    import java.util.HashSet;
49    import java.util.Iterator;
50    import java.util.List;
51    import java.util.Map;
52    import java.util.Map.Entry;
53    import java.util.Set;
54    import java.util.regex.Pattern;
55    import java.util.regex.PatternSyntaxException;
56   
57    import htsjdk.samtools.SAMException;
58    import htsjdk.samtools.SAMSequenceDictionary;
59    import htsjdk.samtools.SAMSequenceRecord;
60    import htsjdk.samtools.util.CloseableIterator;
61    import htsjdk.tribble.TribbleException;
62    import htsjdk.variant.variantcontext.Allele;
63    import htsjdk.variant.variantcontext.VariantContext;
64    import htsjdk.variant.vcf.VCFConstants;
65    import htsjdk.variant.vcf.VCFHeader;
66    import htsjdk.variant.vcf.VCFHeaderLine;
67    import htsjdk.variant.vcf.VCFHeaderLineCount;
68    import htsjdk.variant.vcf.VCFHeaderLineType;
69    import htsjdk.variant.vcf.VCFInfoHeaderLine;
70   
71    /**
72    * A class to read VCF data (using the htsjdk) and add variants as sequence
73    * features on dna and any related protein product sequences
74    *
75    * @author gmcarstairs
76    */
 
77    public class VCFLoader
78    {
79    private static final String VCF_ENCODABLE = ":;=%,";
80   
81    /*
82    * Jalview feature attributes for VCF fixed column data
83    */
84    private static final String VCF_POS = "POS";
85   
86    private static final String VCF_ID = "ID";
87   
88    private static final String VCF_QUAL = "QUAL";
89   
90    private static final String VCF_FILTER = "FILTER";
91   
92    private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.'
93   
94    private static final String DEFAULT_SPECIES = "homo_sapiens";
95   
96    /**
97    * A class to model the mapping from sequence to VCF coordinates. Cases include
98    * <ul>
99    * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li>
100    * <li>a mapping of sequence to chromosomal coordinates, where sequence and VCF
101    * use the same reference assembly</li>
102    * <li>a modified mapping of sequence to chromosomal coordinates, where sequence
103    * and VCF use different reference assembles</li>
104    * </ul>
105    */
 
106    class VCFMap
107    {
108    final String chromosome;
109   
110    final MapList map;
111   
 
112  24 toggle VCFMap(String chr, MapList m)
113    {
114  24 chromosome = chr;
115  24 map = m;
116    }
117   
 
118  0 toggle @Override
119    public String toString()
120    {
121  0 return chromosome + ":" + map.toString();
122    }
123    }
124   
125    /*
126    * Lookup keys, and default values, for Preference entries that describe
127    * patterns for VCF and VEP fields to capture
128    */
129    private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
130   
131    private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
132   
133    private static final String DEFAULT_VCF_FIELDS = ".*";
134   
135    private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
136   
137    /*
138    * Lookup keys, and default values, for Preference entries that give
139    * mappings from tokens in the 'reference' header to species or assembly
140    */
141    private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY";
142   
143    private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38";
144   
145    private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human
146   
147    private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37
148   
149    /*
150    * keys to fields of VEP CSQ consequence data
151    * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
152    */
153    private static final String CSQ_CONSEQUENCE_KEY = "Consequence";
154    private static final String CSQ_ALLELE_KEY = "Allele";
155    private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
156    private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id
157   
158    /*
159    * default VCF INFO key for VEP consequence data
160    * NB this can be overridden running VEP with --vcf_info_field
161    * - we don't handle this case (require identifier to be CSQ)
162    */
163    private static final String CSQ_FIELD = "CSQ";
164   
165    /*
166    * separator for fields in consequence data is '|'
167    */
168    private static final String PIPE_REGEX = "\\|";
169   
170    /*
171    * delimiter that separates multiple consequence data blocks
172    */
173    private static final String COMMA = ",";
174   
175    /*
176    * the feature group assigned to a VCF variant in Jalview
177    */
178    private static final String FEATURE_GROUP_VCF = "VCF";
179   
180    /*
181    * internal delimiter used to build keys for assemblyMappings
182    *
183    */
184    private static final String EXCL = "!";
185   
186    /*
187    * the VCF file we are processing
188    */
189    protected String vcfFilePath;
190   
191    /*
192    * mappings between VCF and sequence reference assembly regions, as
193    * key = "species!chromosome!fromAssembly!toAssembly
194    * value = Map{fromRange, toRange}
195    */
196    private Map<String, Map<int[], int[]>> assemblyMappings;
197   
198    private VCFReader reader;
199   
200    /*
201    * holds details of the VCF header lines (metadata)
202    */
203    private VCFHeader header;
204   
205    /*
206    * species (as a valid Ensembl term) the VCF is for
207    */
208    private String vcfSpecies;
209   
210    /*
211    * genome assembly version (as a valid Ensembl identifier) the VCF is for
212    */
213    private String vcfAssembly;
214   
215    /*
216    * a Dictionary of contigs (if present) referenced in the VCF file
217    */
218    private SAMSequenceDictionary dictionary;
219   
220    /*
221    * the position (0...) of field in each block of
222    * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
223    * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
224    */
225    private int csqConsequenceFieldIndex = -1;
226    private int csqAlleleFieldIndex = -1;
227    private int csqAlleleNumberFieldIndex = -1;
228    private int csqFeatureFieldIndex = -1;
229   
230    // todo the same fields for SnpEff ANN data if wanted
231    // see http://snpeff.sourceforge.net/SnpEff_manual.html#input
232   
233    /*
234    * a unique identifier under which to save metadata about feature
235    * attributes (selected INFO field data)
236    */
237    private String sourceId;
238   
239    /*
240    * The INFO IDs of data that is both present in the VCF file, and
241    * also matched by any filters for data of interest
242    */
243    List<String> vcfFieldsOfInterest;
244   
245    /*
246    * The field offsets and identifiers for VEP (CSQ) data that is both present
247    * in the VCF file, and also matched by any filters for data of interest
248    * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
249    */
250    Map<Integer, String> vepFieldsOfInterest;
251   
252    /*
253    * key:value for which rejected data has been seen
254    * (the error is logged only once for each combination)
255    */
256    private Set<String> badData;
257   
258    /**
259    * Constructor given a VCF file
260    *
261    * @param alignment
262    */
 
263  4 toggle public VCFLoader(String vcfFile)
264    {
265  4 try
266    {
267  4 initialise(vcfFile);
268    } catch (IOException e)
269    {
270  0 System.err.println("Error opening VCF file: " + e.getMessage());
271    }
272   
273    // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
274  4 assemblyMappings = new HashMap<>();
275    }
276   
277    /**
278    * Starts a new thread to query and load VCF variant data on to the given
279    * sequences
280    * <p>
281    * This method is not thread safe - concurrent threads should use separate
282    * instances of this class.
283    *
284    * @param seqs
285    * @param gui
286    */
 
287  0 toggle public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui)
288    {
289  0 if (gui != null)
290    {
291  0 gui.setStatus(MessageManager.getString("label.searching_vcf"));
292    }
293   
294  0 new Thread()
295    {
 
296  0 toggle @Override
297    public void run()
298    {
299  0 VCFLoader.this.doLoad(seqs, gui);
300    }
301    }.start();
302    }
303   
304    /**
305    * Reads the specified contig sequence and adds its VCF variants to it
306    *
307    * @param contig
308    * the id of a single sequence (contig) to load
309    * @return
310    */
 
311  3 toggle public SequenceI loadVCFContig(String contig)
312    {
313  3 VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
314  3 if (headerLine == null)
315    {
316  0 Cache.log.error("VCF reference header not found");
317  0 return null;
318    }
319  3 String ref = headerLine.getValue();
320  3 if (ref.startsWith("file://"))
321    {
322  0 ref = ref.substring(7);
323    }
324  3 setSpeciesAndAssembly(ref);
325   
326  3 SequenceI seq = null;
327  3 File dbFile = new File(ref);
328   
329  3 if (dbFile.exists())
330    {
331  3 HtsContigDb db = new HtsContigDb("", dbFile);
332  3 seq = db.getSequenceProxy(contig);
333  3 loadSequenceVCF(seq);
334  3 db.close();
335    }
336    else
337    {
338  0 Cache.log.error("VCF reference not found: " + ref);
339    }
340   
341  3 return seq;
342    }
343   
344    /**
345    * Loads VCF on to one or more sequences
346    *
347    * @param seqs
348    * @param gui
349    * optional callback handler for messages
350    */
 
351  3 toggle protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui)
352    {
353  3 try
354    {
355  3 VCFHeaderLine ref = header
356    .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
357  3 String reference = ref == null ? null : ref.getValue();
358   
359  3 setSpeciesAndAssembly(reference);
360   
361  3 int varCount = 0;
362  3 int seqCount = 0;
363   
364    /*
365    * query for VCF overlapping each sequence in turn
366    */
367  3 for (SequenceI seq : seqs)
368    {
369  21 int added = loadSequenceVCF(seq);
370  21 if (added > 0)
371    {
372  11 seqCount++;
373  11 varCount += added;
374  11 transferAddedFeatures(seq);
375    }
376    }
377  3 if (gui != null)
378    {
379  0 String msg = MessageManager.formatMessage("label.added_vcf",
380    varCount, seqCount);
381  0 gui.setStatus(msg);
382  0 if (gui.getFeatureSettingsUI() != null)
383    {
384  0 gui.getFeatureSettingsUI().discoverAllFeatureData();
385    }
386    }
387    } catch (Throwable e)
388    {
389  0 System.err.println("Error processing VCF: " + e.getMessage());
390  0 e.printStackTrace();
391  0 if (gui != null)
392    {
393  0 gui.setStatus("Error occurred - see console for details");
394    }
395    } finally
396    {
397  3 if (reader != null)
398    {
399  3 try
400    {
401  3 reader.close();
402    } catch (IOException e)
403    {
404    // ignore
405    }
406    }
407  3 header = null;
408  3 dictionary = null;
409    }
410    }
411   
412    /**
413    * Attempts to determine and save the species and genome assembly version to
414    * which the VCF data applies. This may be done by parsing the {@code reference}
415    * header line, configured in a property file, or (potentially) confirmed
416    * interactively by the user.
417    * <p>
418    * The saved values should be identifiers valid for Ensembl's REST service
419    * {@code map} endpoint, so they can be used (if necessary) to retrieve the
420    * mapping between VCF coordinates and sequence coordinates.
421    *
422    * @param reference
423    * @see https://rest.ensembl.org/documentation/info/assembly_map
424    * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml
425    * @see https://rest.ensembl.org/info/species?content-type=text/xml
426    */
 
427  6 toggle protected void setSpeciesAndAssembly(String reference)
428    {
429  6 if (reference == null)
430    {
431  0 Cache.log.error("No VCF ##reference found, defaulting to "
432    + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES);
433  0 reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified
434    }
435  6 reference = reference.toLowerCase();
436   
437    /*
438    * for a non-human species, or other assembly identifier,
439    * specify as a Jalview property file entry e.g.
440    * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37
441    * VCF_SPECIES = c_elegans=celegans
442    * to map a token in the reference header to a value
443    */
444  6 String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY);
445  6 for (String token : prop.split(","))
446    {
447  6 String[] tokens = token.split("=");
448  6 if (tokens.length == 2)
449    {
450  6 if (reference.contains(tokens[0].trim().toLowerCase()))
451    {
452  3 vcfAssembly = tokens[1].trim();
453  3 break;
454    }
455    }
456    }
457   
458  6 vcfSpecies = DEFAULT_SPECIES;
459  6 prop = Cache.getProperty(VCF_SPECIES);
460  6 if (prop != null)
461    {
462  0 for (String token : prop.split(","))
463    {
464  0 String[] tokens = token.split("=");
465  0 if (tokens.length == 2)
466    {
467  0 if (reference.contains(tokens[0].trim().toLowerCase()))
468    {
469  0 vcfSpecies = tokens[1].trim();
470  0 break;
471    }
472    }
473    }
474    }
475    }
476   
477    /**
478    * Opens the VCF file and parses header data
479    *
480    * @param filePath
481    * @throws IOException
482    */
 
483  4 toggle private void initialise(String filePath) throws IOException
484    {
485  4 vcfFilePath = filePath;
486   
487  4 reader = new VCFReader(filePath);
488   
489  4 header = reader.getFileHeader();
490   
491  4 try
492    {
493  4 dictionary = header.getSequenceDictionary();
494    } catch (SAMException e)
495    {
496    // ignore - thrown if any contig line lacks length info
497    }
498   
499  4 sourceId = filePath;
500   
501  4 saveMetadata(sourceId);
502   
503    /*
504    * get offset of CSQ ALLELE_NUM and Feature if declared
505    */
506  4 parseCsqHeader();
507    }
508   
509    /**
510    * Reads metadata (such as INFO field descriptions and datatypes) and saves
511    * them for future reference
512    *
513    * @param theSourceId
514    */
 
515  4 toggle void saveMetadata(String theSourceId)
516    {
517  4 List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
518    DEFAULT_VCF_FIELDS);
519  4 vcfFieldsOfInterest = new ArrayList<>();
520   
521  4 FeatureSource metadata = new FeatureSource(theSourceId);
522   
523  4 for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
524    {
525  13 String attributeId = info.getID();
526  13 String desc = info.getDescription();
527  13 VCFHeaderLineType type = info.getType();
528  13 FeatureAttributeType attType = null;
529  13 switch (type)
530    {
531  0 case Character:
532  0 attType = FeatureAttributeType.Character;
533  0 break;
534  0 case Flag:
535  0 attType = FeatureAttributeType.Flag;
536  0 break;
537  7 case Float:
538  7 attType = FeatureAttributeType.Float;
539  7 break;
540  5 case Integer:
541  5 attType = FeatureAttributeType.Integer;
542  5 break;
543  1 case String:
544  1 attType = FeatureAttributeType.String;
545  1 break;
546    }
547  13 metadata.setAttributeName(attributeId, desc);
548  13 metadata.setAttributeType(attributeId, attType);
549   
550  13 if (isFieldWanted(attributeId, vcfFieldPatterns))
551    {
552  13 vcfFieldsOfInterest.add(attributeId);
553    }
554    }
555   
556  4 FeatureSources.getInstance().addSource(theSourceId, metadata);
557    }
558   
559    /**
560    * Answers true if the field id is matched by any of the filter patterns, else
561    * false. Matching is against regular expression patterns, and is not
562    * case-sensitive.
563    *
564    * @param id
565    * @param filters
566    * @return
567    */
 
568  22 toggle private boolean isFieldWanted(String id, List<Pattern> filters)
569    {
570  22 for (Pattern p : filters)
571    {
572  22 if (p.matcher(id.toUpperCase()).matches())
573    {
574  22 return true;
575    }
576    }
577  0 return false;
578    }
579   
580    /**
581    * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
582    * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
583    * required for processing.
584    * <p>
585    * CSQ fields are declared in the CSQ INFO Description e.g.
586    * <p>
587    * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
588    */
 
589  4 toggle protected void parseCsqHeader()
590    {
591  4 List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
592    DEFAULT_VEP_FIELDS);
593  4 vepFieldsOfInterest = new HashMap<>();
594   
595  4 VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
596  4 if (csqInfo == null)
597    {
598  3 return;
599    }
600   
601    /*
602    * parse out the pipe-separated list of CSQ fields; we assume here that
603    * these form the last part of the description, and contain no spaces
604    */
605  1 String desc = csqInfo.getDescription();
606  1 int spacePos = desc.lastIndexOf(" ");
607  1 desc = desc.substring(spacePos + 1);
608   
609  1 if (desc != null)
610    {
611  1 String[] format = desc.split(PIPE_REGEX);
612  1 int index = 0;
613  1 for (String field : format)
614    {
615  9 if (CSQ_CONSEQUENCE_KEY.equals(field))
616    {
617  1 csqConsequenceFieldIndex = index;
618    }
619  9 if (CSQ_ALLELE_NUM_KEY.equals(field))
620    {
621  0 csqAlleleNumberFieldIndex = index;
622    }
623  9 if (CSQ_ALLELE_KEY.equals(field))
624    {
625  1 csqAlleleFieldIndex = index;
626    }
627  9 if (CSQ_FEATURE_KEY.equals(field))
628    {
629  1 csqFeatureFieldIndex = index;
630    }
631   
632  9 if (isFieldWanted(field, vepFieldFilters))
633    {
634  9 vepFieldsOfInterest.put(index, field);
635    }
636   
637  9 index++;
638    }
639    }
640    }
641   
642    /**
643    * Reads the Preference value for the given key, with default specified if no
644    * preference set. The value is interpreted as a comma-separated list of
645    * regular expressions, and converted into a list of compiled patterns ready
646    * for matching. Patterns are forced to upper-case for non-case-sensitive
647    * matching.
648    * <p>
649    * This supports user-defined filters for fields of interest to capture while
650    * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
651    * fields with an ID of AF, or starting with AC, would be matched.
652    *
653    * @param key
654    * @param def
655    * @return
656    */
 
657  8 toggle private List<Pattern> getFieldMatchers(String key, String def)
658    {
659  8 String pref = Cache.getDefault(key, def);
660  8 List<Pattern> patterns = new ArrayList<>();
661  8 String[] tokens = pref.split(",");
662  8 for (String token : tokens)
663    {
664  8 try
665    {
666  8 patterns.add(Pattern.compile(token.toUpperCase()));
667    } catch (PatternSyntaxException e)
668    {
669  0 System.err.println("Invalid pattern ignored: " + token);
670    }
671    }
672  8 return patterns;
673    }
674   
675    /**
676    * Transfers VCF features to sequences to which this sequence has a mapping.
677    * If the mapping is 3:1, computes peptide variants from nucleotide variants.
678    *
679    * @param seq
680    */
 
681  11 toggle protected void transferAddedFeatures(SequenceI seq)
682    {
683  11 List<DBRefEntry> dbrefs = seq.getDBRefs();
684  11 if (dbrefs == null)
685    {
686  0 return;
687    }
688  11 for (DBRefEntry dbref : dbrefs)
689    {
690  16 Mapping mapping = dbref.getMap();
691  16 if (mapping == null || mapping.getTo() == null)
692    {
693  11 continue;
694    }
695   
696  5 SequenceI mapTo = mapping.getTo();
697  5 MapList map = mapping.getMap();
698  5 if (map.getFromRatio() == 3)
699    {
700    /*
701    * dna-to-peptide product mapping
702    */
703    // JAL-3187 render on the fly instead
704    // AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
705    }
706    else
707    {
708    /*
709    * nucleotide-to-nucleotide mapping e.g. transcript to CDS
710    */
711  0 List<SequenceFeature> features = seq.getFeatures()
712    .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
713  0 for (SequenceFeature sf : features)
714    {
715  0 if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
716    {
717  0 transferFeature(sf, mapTo, map);
718    }
719    }
720    }
721    }
722    }
723   
724    /**
725    * Tries to add overlapping variants read from a VCF file to the given sequence,
726    * and returns the number of variant features added
727    *
728    * @param seq
729    * @return
730    */
 
731  24 toggle protected int loadSequenceVCF(SequenceI seq)
732    {
733  24 VCFMap vcfMap = getVcfMap(seq);
734  24 if (vcfMap == null)
735    {
736  0 return 0;
737    }
738   
739    /*
740    * work with the dataset sequence here
741    */
742  24 SequenceI dss = seq.getDatasetSequence();
743  24 if (dss == null)
744    {
745  3 dss = seq;
746    }
747  24 return addVcfVariants(dss, vcfMap);
748    }
749   
750    /**
751    * Answers a map from sequence coordinates to VCF chromosome ranges
752    *
753    * @param seq
754    * @return
755    */
 
756  24 toggle private VCFMap getVcfMap(SequenceI seq)
757    {
758    /*
759    * simplest case: sequence has id and length matching a VCF contig
760    */
761  24 VCFMap vcfMap = null;
762  24 if (dictionary != null)
763    {
764  3 vcfMap = getContigMap(seq);
765    }
766  24 if (vcfMap != null)
767    {
768  3 return vcfMap;
769    }
770   
771    /*
772    * otherwise, map to VCF from chromosomal coordinates
773    * of the sequence (if known)
774    */
775  21 GeneLociI seqCoords = seq.getGeneLoci();
776  21 if (seqCoords == null)
777    {
778  0 Cache.log.warn(String.format(
779    "Can't query VCF for %s as chromosome coordinates not known",
780    seq.getName()));
781  0 return null;
782    }
783   
784  21 String species = seqCoords.getSpeciesId();
785  21 String chromosome = seqCoords.getChromosomeId();
786  21 String seqRef = seqCoords.getAssemblyId();
787  21 MapList map = seqCoords.getMapping();
788   
789    // note this requires the configured species to match that
790    // returned with the Ensembl sequence; todo: support aliases?
791  21 if (!vcfSpecies.equalsIgnoreCase(species))
792    {
793  0 Cache.log.warn("No VCF loaded to " + seq.getName()
794    + " as species not matched");
795  0 return null;
796    }
797   
798  21 if (seqRef.equalsIgnoreCase(vcfAssembly))
799    {
800  21 return new VCFMap(chromosome, map);
801    }
802   
803    /*
804    * VCF data has a different reference assembly to the sequence:
805    * query Ensembl to map chromosomal coordinates from sequence to VCF
806    */
807  0 List<int[]> toVcfRanges = new ArrayList<>();
808  0 List<int[]> fromSequenceRanges = new ArrayList<>();
809   
810  0 for (int[] range : map.getToRanges())
811    {
812  0 int[] fromRange = map.locateInFrom(range[0], range[1]);
813  0 if (fromRange == null)
814    {
815    // corrupted map?!?
816  0 continue;
817    }
818   
819  0 int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef,
820    vcfAssembly);
821  0 if (newRange == null)
822    {
823  0 Cache.log.error(
824    String.format("Failed to map %s:%s:%s:%d:%d to %s", species,
825    chromosome, seqRef, range[0], range[1],
826    vcfAssembly));
827  0 continue;
828    }
829    else
830    {
831  0 toVcfRanges.add(newRange);
832  0 fromSequenceRanges.add(fromRange);
833    }
834    }
835   
836  0 return new VCFMap(chromosome,
837    new MapList(fromSequenceRanges, toVcfRanges, 1, 1));
838    }
839   
840    /**
841    * If the sequence id matches a contig declared in the VCF file, and the
842    * sequence length matches the contig length, then returns a 1:1 map of the
843    * sequence to the contig, else returns null
844    *
845    * @param seq
846    * @return
847    */
 
848  3 toggle private VCFMap getContigMap(SequenceI seq)
849    {
850  3 String id = seq.getName();
851  3 SAMSequenceRecord contig = dictionary.getSequence(id);
852  3 if (contig != null)
853    {
854  3 int len = seq.getLength();
855  3 if (len == contig.getSequenceLength())
856    {
857  3 MapList map = new MapList(new int[] { 1, len },
858    new int[]
859    { 1, len }, 1, 1);
860  3 return new VCFMap(id, map);
861    }
862    }
863  0 return null;
864    }
865   
866    /**
867    * Queries the VCF reader for any variants that overlap the mapped chromosome
868    * ranges of the sequence, and adds as variant features. Returns the number of
869    * overlapping variants found.
870    *
871    * @param seq
872    * @param map
873    * mapping from sequence to VCF coordinates
874    * @return
875    */
 
876  24 toggle protected int addVcfVariants(SequenceI seq, VCFMap map)
877    {
878  24 boolean forwardStrand = map.map.isToForwardStrand();
879   
880    /*
881    * query the VCF for overlaps of each contiguous chromosomal region
882    */
883  24 int count = 0;
884   
885  24 for (int[] range : map.map.getToRanges())
886    {
887  39 int vcfStart = Math.min(range[0], range[1]);
888  39 int vcfEnd = Math.max(range[0], range[1]);
889  39 try
890    {
891  39 CloseableIterator<VariantContext> variants = reader
892    .query(map.chromosome, vcfStart, vcfEnd);
893  75 while (variants.hasNext())
894    {
895  36 VariantContext variant = variants.next();
896   
897  36 int[] featureRange = map.map.locateInFrom(variant.getStart(),
898    variant.getEnd());
899   
900  36 if (featureRange != null)
901    {
902  34 int featureStart = Math.min(featureRange[0], featureRange[1]);
903  34 int featureEnd = Math.max(featureRange[0], featureRange[1]);
904  34 count += addAlleleFeatures(seq, variant, featureStart,
905    featureEnd, forwardStrand);
906    }
907    }
908  39 variants.close();
909    } catch (TribbleException e)
910    {
911    /*
912    * RuntimeException throwable by htsjdk
913    */
914  0 String msg = String.format("Error reading VCF for %s:%d-%d: %s ",
915    map.chromosome, vcfStart, vcfEnd,e.getLocalizedMessage());
916  0 Cache.log.error(msg);
917    }
918    }
919   
920  24 return count;
921    }
922   
923    /**
924    * A convenience method to get an attribute value for an alternate allele
925    *
926    * @param variant
927    * @param attributeName
928    * @param alleleIndex
929    * @return
930    */
 
931  100 toggle protected String getAttributeValue(VariantContext variant,
932    String attributeName, int alleleIndex)
933    {
934  100 Object att = variant.getAttribute(attributeName);
935   
936  100 if (att instanceof String)
937    {
938  39 return (String) att;
939    }
940  61 else if (att instanceof ArrayList)
941    {
942  61 return ((List<String>) att).get(alleleIndex);
943    }
944   
945  0 return null;
946    }
947   
948    /**
949    * Adds one variant feature for each allele in the VCF variant record, and
950    * returns the number of features added.
951    *
952    * @param seq
953    * @param variant
954    * @param featureStart
955    * @param featureEnd
956    * @param forwardStrand
957    * @return
958    */
 
959  34 toggle protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
960    int featureStart, int featureEnd, boolean forwardStrand)
961    {
962  34 int added = 0;
963   
964    /*
965    * Javadoc says getAlternateAlleles() imposes no order on the list returned
966    * so we proceed defensively to get them in strict order
967    */
968  34 int altAlleleCount = variant.getAlternateAlleles().size();
969  85 for (int i = 0; i < altAlleleCount; i++)
970    {
971  51 added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
972    forwardStrand);
973    }
974  34 return added;
975    }
976   
977    /**
978    * Inspects one allele and attempts to add a variant feature for it to the
979    * sequence. The additional data associated with this allele is extracted to
980    * store in the feature's key-value map. Answers the number of features added (0
981    * or 1).
982    *
983    * @param seq
984    * @param variant
985    * @param altAlleleIndex
986    * (0, 1..)
987    * @param featureStart
988    * @param featureEnd
989    * @param forwardStrand
990    * @return
991    */
 
992  51 toggle protected int addAlleleFeature(SequenceI seq, VariantContext variant,
993    int altAlleleIndex, int featureStart, int featureEnd,
994    boolean forwardStrand)
995    {
996  51 String reference = variant.getReference().getBaseString();
997  51 Allele alt = variant.getAlternateAllele(altAlleleIndex);
998  51 String allele = alt.getBaseString();
999   
1000    /*
1001    * insertion after a genomic base, if on reverse strand, has to be
1002    * converted to insertion of complement after the preceding position
1003    */
1004  51 int referenceLength = reference.length();
1005  51 if (!forwardStrand && allele.length() > referenceLength
1006    && allele.startsWith(reference))
1007    {
1008  4 featureStart -= referenceLength;
1009  4 featureEnd = featureStart;
1010  4 char insertAfter = seq.getCharAt(featureStart - seq.getStart());
1011  4 reference = Dna.reverseComplement(String.valueOf(insertAfter));
1012  4 allele = allele.substring(referenceLength) + reference;
1013    }
1014   
1015    /*
1016    * build the ref,alt allele description e.g. "G,A", using the base
1017    * complement if the sequence is on the reverse strand
1018    */
1019  51 StringBuilder sb = new StringBuilder();
1020  51 sb.append(forwardStrand ? reference : Dna.reverseComplement(reference));
1021  51 sb.append(COMMA);
1022  51 sb.append(forwardStrand ? allele : Dna.reverseComplement(allele));
1023  51 String alleles = sb.toString(); // e.g. G,A
1024   
1025    /*
1026    * pick out the consequence data (if any) that is for the current allele
1027    * and feature (transcript) that matches the current sequence
1028    */
1029  51 String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD,
1030    altAlleleIndex, csqAlleleFieldIndex,
1031    csqAlleleNumberFieldIndex, seq.getName().toLowerCase(),
1032    csqFeatureFieldIndex);
1033   
1034    /*
1035    * pick out the ontology term for the consequence type
1036    */
1037  51 String type = SequenceOntologyI.SEQUENCE_VARIANT;
1038  51 if (consequence != null)
1039    {
1040  7 type = getOntologyTerm(consequence);
1041    }
1042   
1043  51 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
1044    featureEnd, FEATURE_GROUP_VCF);
1045  51 sf.setSource(sourceId);
1046   
1047    /*
1048    * save the derived alleles as a named attribute; this will be
1049    * needed when Jalview computes derived peptide variants
1050    */
1051  51 addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles);
1052   
1053    /*
1054    * add selected VCF fixed column data as feature attributes
1055    */
1056  51 addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart()));
1057  51 addFeatureAttribute(sf, VCF_ID, variant.getID());
1058  51 addFeatureAttribute(sf, VCF_QUAL,
1059    String.valueOf(variant.getPhredScaledQual()));
1060  51 addFeatureAttribute(sf, VCF_FILTER, getFilter(variant));
1061   
1062  51 addAlleleProperties(variant, sf, altAlleleIndex, consequence);
1063   
1064  51 seq.addSequenceFeature(sf);
1065   
1066  51 return 1;
1067    }
1068   
1069    /**
1070    * Answers the VCF FILTER value for the variant - or an approximation to it.
1071    * This field is either PASS, or a semi-colon separated list of filters not
1072    * passed. htsjdk saves filters as a HashSet, so the order when reassembled into
1073    * a list may be different.
1074    *
1075    * @param variant
1076    * @return
1077    */
 
1078  51 toggle String getFilter(VariantContext variant)
1079    {
1080  51 Set<String> filters = variant.getFilters();
1081  51 if (filters.isEmpty())
1082    {
1083  21 return NO_VALUE;
1084    }
1085  30 Iterator<String> iterator = filters.iterator();
1086  30 String first = iterator.next();
1087  30 if (filters.size() == 1)
1088    {
1089  11 return first;
1090    }
1091   
1092  19 StringBuilder sb = new StringBuilder(first);
1093  38 while (iterator.hasNext())
1094    {
1095  19 sb.append(";").append(iterator.next());
1096    }
1097   
1098  19 return sb.toString();
1099    }
1100   
1101    /**
1102    * Adds one feature attribute unless the value is null, empty or '.'
1103    *
1104    * @param sf
1105    * @param key
1106    * @param value
1107    */
 
1108  347 toggle void addFeatureAttribute(SequenceFeature sf, String key, String value)
1109    {
1110  347 if (value != null && !value.isEmpty() && !NO_VALUE.equals(value))
1111    {
1112  275 sf.setValue(key, value);
1113    }
1114    }
1115   
1116    /**
1117    * Determines the Sequence Ontology term to use for the variant feature type in
1118    * Jalview. The default is 'sequence_variant', but a more specific term is used
1119    * if:
1120    * <ul>
1121    * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li>
1122    * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li>
1123    * </ul>
1124    *
1125    * @param consequence
1126    * @return
1127    * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060
1128    */
 
1129  7 toggle String getOntologyTerm(String consequence)
1130    {
1131  7 String type = SequenceOntologyI.SEQUENCE_VARIANT;
1132   
1133    /*
1134    * could we associate Consequence data with this allele and feature (transcript)?
1135    * if so, prefer the consequence term from that data
1136    */
1137  7 if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1
1138    {
1139    /*
1140    * no Consequence data so we can't refine the ontology term
1141    */
1142  0 return type;
1143    }
1144   
1145  7 if (consequence != null)
1146    {
1147  7 String[] csqFields = consequence.split(PIPE_REGEX);
1148  7 if (csqFields.length > csqConsequenceFieldIndex)
1149    {
1150  7 type = csqFields[csqConsequenceFieldIndex];
1151    }
1152    }
1153    else
1154    {
1155    // todo the same for SnpEff consequence data matching if wanted
1156    }
1157   
1158    /*
1159    * if of the form (e.g.) missense_variant&splice_region_variant,
1160    * just take the first ('most severe') consequence
1161    */
1162  7 if (type != null)
1163    {
1164  7 int pos = type.indexOf('&');
1165  7 if (pos > 0)
1166    {
1167  0 type = type.substring(0, pos);
1168    }
1169    }
1170  7 return type;
1171    }
1172   
1173    /**
1174    * Returns matched consequence data if it can be found, else null.
1175    * <ul>
1176    * <li>inspects the VCF data for key 'vcfInfoId'</li>
1177    * <li>splits this on comma (to distinct consequences)</li>
1178    * <li>returns the first consequence (if any) where</li>
1179    * <ul>
1180    * <li>the allele matches the altAlleleIndex'th allele of variant</li>
1181    * <li>the feature matches the sequence name (e.g. transcript id)</li>
1182    * </ul>
1183    * </ul>
1184    * If matched, the consequence is returned (as pipe-delimited fields).
1185    *
1186    * @param variant
1187    * @param vcfInfoId
1188    * @param altAlleleIndex
1189    * @param alleleFieldIndex
1190    * @param alleleNumberFieldIndex
1191    * @param seqName
1192    * @param featureFieldIndex
1193    * @return
1194    */
 
1195  51 toggle private String getConsequenceForAlleleAndFeature(VariantContext variant,
1196    String vcfInfoId, int altAlleleIndex, int alleleFieldIndex,
1197    int alleleNumberFieldIndex,
1198    String seqName, int featureFieldIndex)
1199    {
1200  51 if (alleleFieldIndex == -1 || featureFieldIndex == -1)
1201    {
1202  37 return null;
1203    }
1204  14 Object value = variant.getAttribute(vcfInfoId);
1205   
1206  14 if (value == null || !(value instanceof List<?>))
1207    {
1208  0 return null;
1209    }
1210   
1211    /*
1212    * inspect each consequence in turn (comma-separated blocks
1213    * extracted by htsjdk)
1214    */
1215  14 List<String> consequences = (List<String>) value;
1216   
1217  14 for (String consequence : consequences)
1218    {
1219  41 String[] csqFields = consequence.split(PIPE_REGEX);
1220  41 if (csqFields.length > featureFieldIndex)
1221    {
1222  41 String featureIdentifier = csqFields[featureFieldIndex];
1223  41 if (featureIdentifier.length() > 4
1224    && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
1225    {
1226    /*
1227    * feature (transcript) matched - now check for allele match
1228    */
1229  10 if (matchAllele(variant, altAlleleIndex, csqFields,
1230    alleleFieldIndex, alleleNumberFieldIndex))
1231    {
1232  7 return consequence;
1233    }
1234    }
1235    }
1236    }
1237  7 return null;
1238    }
1239   
 
1240  10 toggle private boolean matchAllele(VariantContext variant, int altAlleleIndex,
1241    String[] csqFields, int alleleFieldIndex,
1242    int alleleNumberFieldIndex)
1243    {
1244    /*
1245    * if ALLELE_NUM is present, it must match altAlleleIndex
1246    * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex
1247    */
1248  10 if (alleleNumberFieldIndex > -1)
1249    {
1250  0 if (csqFields.length <= alleleNumberFieldIndex)
1251    {
1252  0 return false;
1253    }
1254  0 String alleleNum = csqFields[alleleNumberFieldIndex];
1255  0 return String.valueOf(altAlleleIndex + 1).equals(alleleNum);
1256    }
1257   
1258    /*
1259    * else consequence allele must match variant allele
1260    */
1261  10 if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex)
1262    {
1263  10 String csqAllele = csqFields[alleleFieldIndex];
1264  10 String vcfAllele = variant.getAlternateAllele(altAlleleIndex)
1265    .getBaseString();
1266  10 return csqAllele.equals(vcfAllele);
1267    }
1268  0 return false;
1269    }
1270   
1271    /**
1272    * Add any allele-specific VCF key-value data to the sequence feature
1273    *
1274    * @param variant
1275    * @param sf
1276    * @param altAlelleIndex
1277    * (0, 1..)
1278    * @param consequence
1279    * if not null, the consequence specific to this sequence (transcript
1280    * feature) and allele
1281    */
 
1282  51 toggle protected void addAlleleProperties(VariantContext variant,
1283    SequenceFeature sf, final int altAlelleIndex, String consequence)
1284    {
1285  51 Map<String, Object> atts = variant.getAttributes();
1286   
1287  51 for (Entry<String, Object> att : atts.entrySet())
1288    {
1289  174 String key = att.getKey();
1290   
1291    /*
1292    * extract Consequence data (if present) that we are able to
1293    * associated with the allele for this variant feature
1294    */
1295  174 if (CSQ_FIELD.equals(key))
1296    {
1297  14 addConsequences(variant, sf, consequence);
1298  14 continue;
1299    }
1300   
1301    /*
1302    * filter out fields we don't want to capture
1303    */
1304  160 if (!vcfFieldsOfInterest.contains(key))
1305    {
1306  46 continue;
1307    }
1308   
1309    /*
1310    * we extract values for other data which are allele-specific;
1311    * these may be per alternate allele (INFO[key].Number = 'A')
1312    * or per allele including reference (INFO[key].Number = 'R')
1313    */
1314  114 VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key);
1315  114 if (infoHeader == null)
1316    {
1317    /*
1318    * can't be sure what data belongs to this allele, so
1319    * play safe and don't take any
1320    */
1321  0 continue;
1322    }
1323   
1324  114 VCFHeaderLineCount number = infoHeader.getCountType();
1325  114 int index = altAlelleIndex;
1326  114 if (number == VCFHeaderLineCount.R)
1327    {
1328    /*
1329    * one value per allele including reference, so bump index
1330    * e.g. the 3rd value is for the 2nd alternate allele
1331    */
1332  14 index++;
1333    }
1334  100 else if (number != VCFHeaderLineCount.A)
1335    {
1336    /*
1337    * don't save other values as not allele-related
1338    */
1339  14 continue;
1340    }
1341   
1342    /*
1343    * take the index'th value
1344    */
1345  100 String value = getAttributeValue(variant, key, index);
1346  100 if (value != null && isValid(variant, key, value))
1347    {
1348    /*
1349    * decode colon, semicolon, equals sign, percent sign, comma (only)
1350    * as required by the VCF specification (para 1.2)
1351    */
1352  92 value = StringUtils.urlDecode(value, VCF_ENCODABLE);
1353  92 addFeatureAttribute(sf, key, value);
1354    }
1355    }
1356    }
1357   
1358    /**
1359    * Answers true for '.', null, or an empty value, or if the INFO type is String.
1360    * If the INFO type is Integer or Float, answers false if the value is not in
1361    * valid format.
1362    *
1363    * @param variant
1364    * @param infoId
1365    * @param value
1366    * @return
1367    */
 
1368  100 toggle protected boolean isValid(VariantContext variant, String infoId,
1369    String value)
1370    {
1371  100 if (value == null || value.isEmpty() || NO_VALUE.equals(value))
1372    {
1373  8 return true;
1374    }
1375  92 VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId);
1376  92 if (infoHeader == null)
1377    {
1378  0 Cache.log.error("Field " + infoId + " has no INFO header");
1379  0 return false;
1380    }
1381  92 VCFHeaderLineType infoType = infoHeader.getType();
1382  92 try
1383    {
1384  92 if (infoType == VCFHeaderLineType.Integer)
1385    {
1386  27 Integer.parseInt(value);
1387    }
1388  65 else if (infoType == VCFHeaderLineType.Float)
1389    {
1390  65 Float.parseFloat(value);
1391    }
1392    } catch (NumberFormatException e)
1393    {
1394  8 logInvalidValue(variant, infoId, value);
1395  8 return false;
1396    }
1397  84 return true;
1398    }
1399   
1400    /**
1401    * Logs an error message for malformed data; duplicate messages (same id and
1402    * value) are not logged
1403    *
1404    * @param variant
1405    * @param infoId
1406    * @param value
1407    */
 
1408  8 toggle private void logInvalidValue(VariantContext variant, String infoId,
1409    String value)
1410    {
1411  8 if (badData == null)
1412    {
1413  2 badData = new HashSet<>();
1414    }
1415  8 String token = infoId + ":" + value;
1416  8 if (!badData.contains(token))
1417    {
1418  4 badData.add(token);
1419  4 Cache.log.error(String.format("Invalid VCF data at %s:%d %s=%s",
1420    variant.getContig(), variant.getStart(), infoId, value));
1421    }
1422    }
1423   
1424    /**
1425    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
1426    * feature.
1427    * <p>
1428    * If <code>myConsequence</code> is not null, then this is the specific
1429    * consequence data (pipe-delimited fields) that is for the current allele and
1430    * transcript (sequence) being processed)
1431    *
1432    * @param variant
1433    * @param sf
1434    * @param myConsequence
1435    */
 
1436  14 toggle protected void addConsequences(VariantContext variant, SequenceFeature sf,
1437    String myConsequence)
1438    {
1439  14 Object value = variant.getAttribute(CSQ_FIELD);
1440   
1441  14 if (value == null || !(value instanceof List<?>))
1442    {
1443  0 return;
1444    }
1445   
1446  14 List<String> consequences = (List<String>) value;
1447   
1448    /*
1449    * inspect CSQ consequences; restrict to the consequence
1450    * associated with the current transcript (Feature)
1451    */
1452  14 Map<String, String> csqValues = new HashMap<>();
1453   
1454  14 for (String consequence : consequences)
1455    {
1456  50 if (myConsequence == null || myConsequence.equals(consequence))
1457    {
1458  31 String[] csqFields = consequence.split(PIPE_REGEX);
1459   
1460    /*
1461    * inspect individual fields of this consequence, copying non-null
1462    * values which are 'fields of interest'
1463    */
1464  31 int i = 0;
1465  31 for (String field : csqFields)
1466    {
1467  279 if (field != null && field.length() > 0)
1468    {
1469  279 String id = vepFieldsOfInterest.get(i);
1470  279 if (id != null)
1471    {
1472    /*
1473    * VCF spec requires encoding of special characters e.g. '='
1474    * so decode them here before storing
1475    */
1476  279 field = StringUtils.urlDecode(field, VCF_ENCODABLE);
1477  279 csqValues.put(id, field);
1478    }
1479    }
1480  279 i++;
1481    }
1482    }
1483    }
1484   
1485  14 if (!csqValues.isEmpty())
1486    {
1487  14 sf.setValue(CSQ_FIELD, csqValues);
1488    }
1489    }
1490   
1491    /**
1492    * A convenience method to complement a dna base and return the string value
1493    * of its complement
1494    *
1495    * @param reference
1496    * @return
1497    */
 
1498  0 toggle protected String complement(byte[] reference)
1499    {
1500  0 return String.valueOf(Dna.getComplement((char) reference[0]));
1501    }
1502   
1503    /**
1504    * Determines the location of the query range (chromosome positions) in a
1505    * different reference assembly.
1506    * <p>
1507    * If the range is just a subregion of one for which we already have a mapping
1508    * (for example, an exon sub-region of a gene), then the mapping is just
1509    * computed arithmetically.
1510    * <p>
1511    * Otherwise, calls the Ensembl REST service that maps from one assembly
1512    * reference's coordinates to another's
1513    *
1514    * @param queryRange
1515    * start-end chromosomal range in 'fromRef' coordinates
1516    * @param chromosome
1517    * @param species
1518    * @param fromRef
1519    * assembly reference for the query coordinates
1520    * @param toRef
1521    * assembly reference we wish to translate to
1522    * @return the start-end range in 'toRef' coordinates
1523    */
 
1524  0 toggle protected int[] mapReferenceRange(int[] queryRange, String chromosome,
1525    String species, String fromRef, String toRef)
1526    {
1527    /*
1528    * first try shorcut of computing the mapping as a subregion of one
1529    * we already have (e.g. for an exon, if we have the gene mapping)
1530    */
1531  0 int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
1532    species, fromRef, toRef);
1533  0 if (mappedRange != null)
1534    {
1535  0 return mappedRange;
1536    }
1537   
1538    /*
1539    * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
1540    */
1541  0 EnsemblMap mapper = new EnsemblMap();
1542  0 int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef,
1543    toRef, queryRange);
1544   
1545  0 if (mapping == null)
1546    {
1547    // mapping service failure
1548  0 return null;
1549    }
1550   
1551    /*
1552    * save mapping for possible future re-use
1553    */
1554  0 String key = makeRangesKey(chromosome, species, fromRef, toRef);
1555  0 if (!assemblyMappings.containsKey(key))
1556    {
1557  0 assemblyMappings.put(key, new HashMap<int[], int[]>());
1558    }
1559   
1560  0 assemblyMappings.get(key).put(queryRange, mapping);
1561   
1562  0 return mapping;
1563    }
1564   
1565    /**
1566    * If we already have a 1:1 contiguous mapping which subsumes the given query
1567    * range, this method just calculates and returns the subset of that mapping,
1568    * else it returns null. In practical terms, if a gene has a contiguous
1569    * mapping between (for example) GRCh37 and GRCh38, then we assume that its
1570    * subsidiary exons occupy unchanged relative positions, and just compute
1571    * these as offsets, rather than do another lookup of the mapping.
1572    * <p>
1573    * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
1574    * simply remove this method or let it always return null.
1575    * <p>
1576    * Warning: many rapid calls to the /map service map result in a 429 overload
1577    * error response
1578    *
1579    * @param queryRange
1580    * @param chromosome
1581    * @param species
1582    * @param fromRef
1583    * @param toRef
1584    * @return
1585    */
 
1586  0 toggle protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
1587    String species, String fromRef, String toRef)
1588    {
1589  0 String key = makeRangesKey(chromosome, species, fromRef, toRef);
1590  0 if (assemblyMappings.containsKey(key))
1591    {
1592  0 Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
1593  0 for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
1594    {
1595  0 int[] fromRange = mappedRange.getKey();
1596  0 int[] toRange = mappedRange.getValue();
1597  0 if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
1598    {
1599    /*
1600    * mapping is 1:1 in length, so we trust it to have no discontinuities
1601    */
1602  0 if (MappingUtils.rangeContains(fromRange, queryRange))
1603    {
1604    /*
1605    * fromRange subsumes our query range
1606    */
1607  0 int offset = queryRange[0] - fromRange[0];
1608  0 int mappedRangeFrom = toRange[0] + offset;
1609  0 int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
1610  0 return new int[] { mappedRangeFrom, mappedRangeTo };
1611    }
1612    }
1613    }
1614    }
1615  0 return null;
1616    }
1617   
1618    /**
1619    * Transfers the sequence feature to the target sequence, locating its start
1620    * and end range based on the mapping. Features which do not overlap the
1621    * target sequence are ignored.
1622    *
1623    * @param sf
1624    * @param targetSequence
1625    * @param mapping
1626    * mapping from the feature's coordinates to the target sequence
1627    */
 
1628  0 toggle protected void transferFeature(SequenceFeature sf,
1629    SequenceI targetSequence, MapList mapping)
1630    {
1631  0 int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
1632   
1633  0 if (mappedRange != null)
1634    {
1635  0 String group = sf.getFeatureGroup();
1636  0 int newBegin = Math.min(mappedRange[0], mappedRange[1]);
1637  0 int newEnd = Math.max(mappedRange[0], mappedRange[1]);
1638  0 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
1639    group, sf.getScore());
1640  0 targetSequence.addSequenceFeature(copy);
1641    }
1642    }
1643   
1644    /**
1645    * Formats a ranges map lookup key
1646    *
1647    * @param chromosome
1648    * @param species
1649    * @param fromRef
1650    * @param toRef
1651    * @return
1652    */
 
1653  0 toggle protected static String makeRangesKey(String chromosome, String species,
1654    String fromRef, String toRef)
1655    {
1656  0 return species + EXCL + chromosome + EXCL + fromRef + EXCL
1657    + toRef;
1658    }
1659    }