Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
VCFLoader | 79 | 405 | 151 | ||
VCFLoader.VCFMap | 109 | 3 | 2 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io.vcf; | |
22 | ||
23 | import java.util.Locale; | |
24 | ||
25 | import java.io.File; | |
26 | import java.io.IOException; | |
27 | import java.util.ArrayList; | |
28 | import java.util.HashMap; | |
29 | import java.util.HashSet; | |
30 | import java.util.Iterator; | |
31 | import java.util.List; | |
32 | import java.util.Map; | |
33 | import java.util.Map.Entry; | |
34 | import java.util.Set; | |
35 | import java.util.regex.Pattern; | |
36 | import java.util.regex.PatternSyntaxException; | |
37 | ||
38 | import htsjdk.samtools.SAMException; | |
39 | import htsjdk.samtools.SAMSequenceDictionary; | |
40 | import htsjdk.samtools.SAMSequenceRecord; | |
41 | import htsjdk.samtools.util.CloseableIterator; | |
42 | import htsjdk.tribble.TribbleException; | |
43 | import htsjdk.variant.variantcontext.Allele; | |
44 | import htsjdk.variant.variantcontext.VariantContext; | |
45 | import htsjdk.variant.vcf.VCFConstants; | |
46 | import htsjdk.variant.vcf.VCFHeader; | |
47 | import htsjdk.variant.vcf.VCFHeaderLine; | |
48 | import htsjdk.variant.vcf.VCFHeaderLineCount; | |
49 | import htsjdk.variant.vcf.VCFHeaderLineType; | |
50 | import htsjdk.variant.vcf.VCFInfoHeaderLine; | |
51 | import jalview.analysis.Dna; | |
52 | import jalview.api.AlignViewControllerGuiI; | |
53 | import jalview.bin.Cache; | |
54 | import jalview.bin.Console; | |
55 | import jalview.datamodel.DBRefEntry; | |
56 | import jalview.datamodel.GeneLociI; | |
57 | import jalview.datamodel.Mapping; | |
58 | import jalview.datamodel.SequenceFeature; | |
59 | import jalview.datamodel.SequenceI; | |
60 | import jalview.datamodel.features.FeatureAttributeType; | |
61 | import jalview.datamodel.features.FeatureSource; | |
62 | import jalview.datamodel.features.FeatureSources; | |
63 | import jalview.ext.ensembl.EnsemblMap; | |
64 | import jalview.ext.htsjdk.HtsContigDb; | |
65 | import jalview.ext.htsjdk.VCFReader; | |
66 | import jalview.io.gff.Gff3Helper; | |
67 | import jalview.io.gff.SequenceOntologyI; | |
68 | import jalview.util.MapList; | |
69 | import jalview.util.MappingUtils; | |
70 | import jalview.util.MessageManager; | |
71 | import jalview.util.StringUtils; | |
72 | ||
73 | /** | |
74 | * A class to read VCF data (using the htsjdk) and add variants as sequence | |
75 | * features on dna and any related protein product sequences | |
76 | * | |
77 | * @author gmcarstairs | |
78 | */ | |
79 | public class VCFLoader | |
80 | { | |
81 | private static final String VCF_ENCODABLE = ":;=%,"; | |
82 | ||
83 | /* | |
84 | * Jalview feature attributes for VCF fixed column data | |
85 | */ | |
86 | private static final String VCF_POS = "POS"; | |
87 | ||
88 | private static final String VCF_ID = "ID"; | |
89 | ||
90 | private static final String VCF_QUAL = "QUAL"; | |
91 | ||
92 | private static final String VCF_FILTER = "FILTER"; | |
93 | ||
94 | private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' | |
95 | ||
96 | private static final String DEFAULT_SPECIES = "homo_sapiens"; | |
97 | ||
98 | /** | |
99 | * A class to model the mapping from sequence to VCF coordinates. Cases | |
100 | * include | |
101 | * <ul> | |
102 | * <li>a direct 1:1 mapping where the sequence is one of the VCF contigs</li> | |
103 | * <li>a mapping of sequence to chromosomal coordinates, where sequence and | |
104 | * VCF use the same reference assembly</li> | |
105 | * <li>a modified mapping of sequence to chromosomal coordinates, where | |
106 | * sequence and VCF use different reference assembles</li> | |
107 | * </ul> | |
108 | */ | |
109 | class VCFMap | |
110 | { | |
111 | final String chromosome; | |
112 | ||
113 | final MapList map; | |
114 | ||
115 | 24 | VCFMap(String chr, MapList m) |
116 | { | |
117 | 24 | chromosome = chr; |
118 | 24 | map = m; |
119 | } | |
120 | ||
121 | 0 | @Override |
122 | public String toString() | |
123 | { | |
124 | 0 | return chromosome + ":" + map.toString(); |
125 | } | |
126 | } | |
127 | ||
128 | /* | |
129 | * Lookup keys, and default values, for Preference entries that describe | |
130 | * patterns for VCF and VEP fields to capture | |
131 | */ | |
132 | private static final String VEP_FIELDS_PREF = "VEP_FIELDS"; | |
133 | ||
134 | private static final String VCF_FIELDS_PREF = "VCF_FIELDS"; | |
135 | ||
136 | private static final String DEFAULT_VCF_FIELDS = ".*"; | |
137 | ||
138 | private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG"; | |
139 | ||
140 | /* | |
141 | * Lookup keys, and default values, for Preference entries that give | |
142 | * mappings from tokens in the 'reference' header to species or assembly | |
143 | */ | |
144 | private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY"; | |
145 | ||
146 | private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; | |
147 | ||
148 | private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human | |
149 | ||
150 | private static final String DEFAULT_REFERENCE = "grch37"; // fallback default | |
151 | // is human GRCh37 | |
152 | ||
153 | /* | |
154 | * keys to fields of VEP CSQ consequence data | |
155 | * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html | |
156 | */ | |
157 | private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; | |
158 | ||
159 | private static final String CSQ_ALLELE_KEY = "Allele"; | |
160 | ||
161 | private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), | |
162 | // 1... | |
163 | ||
164 | private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id | |
165 | ||
166 | /* | |
167 | * default VCF INFO key for VEP consequence data | |
168 | * NB this can be overridden running VEP with --vcf_info_field | |
169 | * - we don't handle this case (require identifier to be CSQ) | |
170 | */ | |
171 | private static final String CSQ_FIELD = "CSQ"; | |
172 | ||
173 | /* | |
174 | * separator for fields in consequence data is '|' | |
175 | */ | |
176 | private static final String PIPE_REGEX = "\\|"; | |
177 | ||
178 | /* | |
179 | * delimiter that separates multiple consequence data blocks | |
180 | */ | |
181 | private static final String COMMA = ","; | |
182 | ||
183 | /* | |
184 | * the feature group assigned to a VCF variant in Jalview | |
185 | */ | |
186 | private static final String FEATURE_GROUP_VCF = "VCF"; | |
187 | ||
188 | /* | |
189 | * internal delimiter used to build keys for assemblyMappings | |
190 | * | |
191 | */ | |
192 | private static final String EXCL = "!"; | |
193 | ||
194 | /* | |
195 | * the VCF file we are processing | |
196 | */ | |
197 | protected String vcfFilePath; | |
198 | ||
199 | /* | |
200 | * mappings between VCF and sequence reference assembly regions, as | |
201 | * key = "species!chromosome!fromAssembly!toAssembly | |
202 | * value = Map{fromRange, toRange} | |
203 | */ | |
204 | private Map<String, Map<int[], int[]>> assemblyMappings; | |
205 | ||
206 | private VCFReader reader; | |
207 | ||
208 | /* | |
209 | * holds details of the VCF header lines (metadata) | |
210 | */ | |
211 | private VCFHeader header; | |
212 | ||
213 | /* | |
214 | * species (as a valid Ensembl term) the VCF is for | |
215 | */ | |
216 | private String vcfSpecies; | |
217 | ||
218 | /* | |
219 | * genome assembly version (as a valid Ensembl identifier) the VCF is for | |
220 | */ | |
221 | private String vcfAssembly; | |
222 | ||
223 | /* | |
224 | * a Dictionary of contigs (if present) referenced in the VCF file | |
225 | */ | |
226 | private SAMSequenceDictionary dictionary; | |
227 | ||
228 | /* | |
229 | * the position (0...) of field in each block of | |
230 | * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) | |
231 | * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html | |
232 | */ | |
233 | private int csqConsequenceFieldIndex = -1; | |
234 | ||
235 | private int csqAlleleFieldIndex = -1; | |
236 | ||
237 | private int csqAlleleNumberFieldIndex = -1; | |
238 | ||
239 | private int csqFeatureFieldIndex = -1; | |
240 | ||
241 | // todo the same fields for SnpEff ANN data if wanted | |
242 | // see http://snpeff.sourceforge.net/SnpEff_manual.html#input | |
243 | ||
244 | /* | |
245 | * a unique identifier under which to save metadata about feature | |
246 | * attributes (selected INFO field data) | |
247 | */ | |
248 | private String sourceId; | |
249 | ||
250 | /* | |
251 | * The INFO IDs of data that is both present in the VCF file, and | |
252 | * also matched by any filters for data of interest | |
253 | */ | |
254 | List<String> vcfFieldsOfInterest; | |
255 | ||
256 | /* | |
257 | * The field offsets and identifiers for VEP (CSQ) data that is both present | |
258 | * in the VCF file, and also matched by any filters for data of interest | |
259 | * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ... | |
260 | */ | |
261 | Map<Integer, String> vepFieldsOfInterest; | |
262 | ||
263 | /* | |
264 | * key:value for which rejected data has been seen | |
265 | * (the error is logged only once for each combination) | |
266 | */ | |
267 | private Set<String> badData; | |
268 | ||
269 | /** | |
270 | * Constructor given a VCF file | |
271 | * | |
272 | * @param alignment | |
273 | */ | |
274 | 4 | public VCFLoader(String vcfFile) |
275 | { | |
276 | 4 | try |
277 | { | |
278 | 4 | initialise(vcfFile); |
279 | } catch (IOException e) | |
280 | { | |
281 | 0 | jalview.bin.Console |
282 | .errPrintln("Error opening VCF file: " + e.getMessage()); | |
283 | } | |
284 | ||
285 | // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} | |
286 | 4 | assemblyMappings = new HashMap<>(); |
287 | } | |
288 | ||
289 | /** | |
290 | * Starts a new thread to query and load VCF variant data on to the given | |
291 | * sequences | |
292 | * <p> | |
293 | * This method is not thread safe - concurrent threads should use separate | |
294 | * instances of this class. | |
295 | * | |
296 | * @param seqs | |
297 | * @param gui | |
298 | */ | |
299 | 0 | public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) |
300 | { | |
301 | 0 | if (gui != null) |
302 | { | |
303 | 0 | gui.setStatus(MessageManager.getString("label.searching_vcf")); |
304 | } | |
305 | ||
306 | 0 | new Thread() |
307 | { | |
308 | 0 | @Override |
309 | public void run() | |
310 | { | |
311 | 0 | VCFLoader.this.doLoad(seqs, gui); |
312 | } | |
313 | }.start(); | |
314 | } | |
315 | ||
316 | /** | |
317 | * Reads the specified contig sequence and adds its VCF variants to it | |
318 | * | |
319 | * @param contig | |
320 | * the id of a single sequence (contig) to load | |
321 | * @return | |
322 | */ | |
323 | 3 | public SequenceI loadVCFContig(String contig) |
324 | { | |
325 | 3 | VCFHeaderLine headerLine = header |
326 | .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); | |
327 | 3 | if (headerLine == null) |
328 | { | |
329 | 0 | Console.error("VCF reference header not found"); |
330 | 0 | return null; |
331 | } | |
332 | 3 | String ref = headerLine.getValue(); |
333 | 3 | if (ref.startsWith("file://")) |
334 | { | |
335 | 0 | ref = ref.substring(7); |
336 | } | |
337 | 3 | setSpeciesAndAssembly(ref); |
338 | ||
339 | 3 | SequenceI seq = null; |
340 | 3 | File dbFile = new File(ref); |
341 | ||
342 | 3 | if (dbFile.exists()) |
343 | { | |
344 | 3 | HtsContigDb db = new HtsContigDb("", dbFile); |
345 | 3 | seq = db.getSequenceProxy(contig); |
346 | 3 | loadSequenceVCF(seq); |
347 | 3 | db.close(); |
348 | } | |
349 | else | |
350 | { | |
351 | 0 | Console.error("VCF reference not found: " + ref); |
352 | } | |
353 | ||
354 | 3 | return seq; |
355 | } | |
356 | ||
357 | /** | |
358 | * Loads VCF on to one or more sequences | |
359 | * | |
360 | * @param seqs | |
361 | * @param gui | |
362 | * optional callback handler for messages | |
363 | */ | |
364 | 3 | protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) |
365 | { | |
366 | 3 | try |
367 | { | |
368 | 3 | VCFHeaderLine ref = header |
369 | .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); | |
370 | 3 | String reference = ref == null ? null : ref.getValue(); |
371 | ||
372 | 3 | setSpeciesAndAssembly(reference); |
373 | ||
374 | 3 | int varCount = 0; |
375 | 3 | int seqCount = 0; |
376 | ||
377 | /* | |
378 | * query for VCF overlapping each sequence in turn | |
379 | */ | |
380 | 3 | for (SequenceI seq : seqs) |
381 | { | |
382 | 21 | int added = loadSequenceVCF(seq); |
383 | 21 | if (added > 0) |
384 | { | |
385 | 11 | seqCount++; |
386 | 11 | varCount += added; |
387 | 11 | transferAddedFeatures(seq); |
388 | } | |
389 | } | |
390 | 3 | if (gui != null) |
391 | { | |
392 | 0 | String msg = MessageManager.formatMessage("label.added_vcf", |
393 | varCount, seqCount); | |
394 | 0 | gui.setStatus(msg); |
395 | 0 | if (gui.getFeatureSettingsUI() != null) |
396 | { | |
397 | 0 | gui.getFeatureSettingsUI().discoverAllFeatureData(); |
398 | } | |
399 | } | |
400 | } catch (Throwable e) | |
401 | { | |
402 | 0 | jalview.bin.Console |
403 | .errPrintln("Error processing VCF: " + e.getMessage()); | |
404 | 0 | e.printStackTrace(); |
405 | 0 | if (gui != null) |
406 | { | |
407 | 0 | gui.setStatus("Error occurred - see console for details"); |
408 | } | |
409 | } finally | |
410 | { | |
411 | 3 | if (reader != null) |
412 | { | |
413 | 3 | try |
414 | { | |
415 | 3 | reader.close(); |
416 | } catch (IOException e) | |
417 | { | |
418 | // ignore | |
419 | } | |
420 | } | |
421 | 3 | header = null; |
422 | 3 | dictionary = null; |
423 | } | |
424 | } | |
425 | ||
426 | /** | |
427 | * Attempts to determine and save the species and genome assembly version to | |
428 | * which the VCF data applies. This may be done by parsing the | |
429 | * {@code reference} header line, configured in a property file, or | |
430 | * (potentially) confirmed interactively by the user. | |
431 | * <p> | |
432 | * The saved values should be identifiers valid for Ensembl's REST service | |
433 | * {@code map} endpoint, so they can be used (if necessary) to retrieve the | |
434 | * mapping between VCF coordinates and sequence coordinates. | |
435 | * | |
436 | * @param reference | |
437 | * @see https://rest.ensembl.org/documentation/info/assembly_map | |
438 | * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml | |
439 | * @see https://rest.ensembl.org/info/species?content-type=text/xml | |
440 | */ | |
441 | 6 | protected void setSpeciesAndAssembly(String reference) |
442 | { | |
443 | 6 | if (reference == null) |
444 | { | |
445 | 0 | Console.error("No VCF ##reference found, defaulting to " |
446 | + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); | |
447 | 0 | reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified |
448 | } | |
449 | 6 | reference = reference.toLowerCase(Locale.ROOT); |
450 | ||
451 | /* | |
452 | * for a non-human species, or other assembly identifier, | |
453 | * specify as a Jalview property file entry e.g. | |
454 | * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37 | |
455 | * VCF_SPECIES = c_elegans=celegans | |
456 | * to map a token in the reference header to a value | |
457 | */ | |
458 | 6 | String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY); |
459 | 6 | for (String token : prop.split(",")) |
460 | { | |
461 | 6 | String[] tokens = token.split("="); |
462 | 6 | if (tokens.length == 2) |
463 | { | |
464 | 6 | if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) |
465 | { | |
466 | 3 | vcfAssembly = tokens[1].trim(); |
467 | 3 | break; |
468 | } | |
469 | } | |
470 | } | |
471 | ||
472 | 6 | vcfSpecies = DEFAULT_SPECIES; |
473 | 6 | prop = Cache.getProperty(VCF_SPECIES); |
474 | 6 | if (prop != null) |
475 | { | |
476 | 0 | for (String token : prop.split(",")) |
477 | { | |
478 | 0 | String[] tokens = token.split("="); |
479 | 0 | if (tokens.length == 2) |
480 | { | |
481 | 0 | if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) |
482 | { | |
483 | 0 | vcfSpecies = tokens[1].trim(); |
484 | 0 | break; |
485 | } | |
486 | } | |
487 | } | |
488 | } | |
489 | } | |
490 | ||
491 | /** | |
492 | * Opens the VCF file and parses header data | |
493 | * | |
494 | * @param filePath | |
495 | * @throws IOException | |
496 | */ | |
497 | 4 | private void initialise(String filePath) throws IOException |
498 | { | |
499 | 4 | vcfFilePath = filePath; |
500 | ||
501 | 4 | reader = new VCFReader(filePath); |
502 | ||
503 | 4 | header = reader.getFileHeader(); |
504 | ||
505 | 4 | try |
506 | { | |
507 | 4 | dictionary = header.getSequenceDictionary(); |
508 | } catch (SAMException e) | |
509 | { | |
510 | // ignore - thrown if any contig line lacks length info | |
511 | } | |
512 | ||
513 | 4 | sourceId = filePath; |
514 | ||
515 | 4 | saveMetadata(sourceId); |
516 | ||
517 | /* | |
518 | * get offset of CSQ ALLELE_NUM and Feature if declared | |
519 | */ | |
520 | 4 | parseCsqHeader(); |
521 | } | |
522 | ||
523 | /** | |
524 | * Reads metadata (such as INFO field descriptions and datatypes) and saves | |
525 | * them for future reference | |
526 | * | |
527 | * @param theSourceId | |
528 | */ | |
529 | 4 | void saveMetadata(String theSourceId) |
530 | { | |
531 | 4 | List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF, |
532 | DEFAULT_VCF_FIELDS); | |
533 | 4 | vcfFieldsOfInterest = new ArrayList<>(); |
534 | ||
535 | 4 | FeatureSource metadata = new FeatureSource(theSourceId); |
536 | ||
537 | 4 | for (VCFInfoHeaderLine info : header.getInfoHeaderLines()) |
538 | { | |
539 | 13 | String attributeId = info.getID(); |
540 | 13 | String desc = info.getDescription(); |
541 | 13 | VCFHeaderLineType type = info.getType(); |
542 | 13 | FeatureAttributeType attType = null; |
543 | 13 | switch (type) |
544 | { | |
545 | 0 | case Character: |
546 | 0 | attType = FeatureAttributeType.Character; |
547 | 0 | break; |
548 | 0 | case Flag: |
549 | 0 | attType = FeatureAttributeType.Flag; |
550 | 0 | break; |
551 | 7 | case Float: |
552 | 7 | attType = FeatureAttributeType.Float; |
553 | 7 | break; |
554 | 5 | case Integer: |
555 | 5 | attType = FeatureAttributeType.Integer; |
556 | 5 | break; |
557 | 1 | case String: |
558 | 1 | attType = FeatureAttributeType.String; |
559 | 1 | break; |
560 | } | |
561 | 13 | metadata.setAttributeName(attributeId, desc); |
562 | 13 | metadata.setAttributeType(attributeId, attType); |
563 | ||
564 | 13 | if (isFieldWanted(attributeId, vcfFieldPatterns)) |
565 | { | |
566 | 13 | vcfFieldsOfInterest.add(attributeId); |
567 | } | |
568 | } | |
569 | ||
570 | 4 | FeatureSources.getInstance().addSource(theSourceId, metadata); |
571 | } | |
572 | ||
573 | /** | |
574 | * Answers true if the field id is matched by any of the filter patterns, else | |
575 | * false. Matching is against regular expression patterns, and is not | |
576 | * case-sensitive. | |
577 | * | |
578 | * @param id | |
579 | * @param filters | |
580 | * @return | |
581 | */ | |
582 | 22 | private boolean isFieldWanted(String id, List<Pattern> filters) |
583 | { | |
584 | 22 | for (Pattern p : filters) |
585 | { | |
586 | 22 | if (p.matcher(id.toUpperCase(Locale.ROOT)).matches()) |
587 | { | |
588 | 22 | return true; |
589 | } | |
590 | } | |
591 | 0 | return false; |
592 | } | |
593 | ||
594 | /** | |
595 | * Records 'wanted' fields defined in the CSQ INFO header (if there is one). | |
596 | * Also records the position of selected fields (Allele, ALLELE_NUM, Feature) | |
597 | * required for processing. | |
598 | * <p> | |
599 | * CSQ fields are declared in the CSQ INFO Description e.g. | |
600 | * <p> | |
601 | * Description="Consequence ...from ... VEP. Format: Allele|Consequence|... | |
602 | */ | |
603 | 4 | protected void parseCsqHeader() |
604 | { | |
605 | 4 | List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF, |
606 | DEFAULT_VEP_FIELDS); | |
607 | 4 | vepFieldsOfInterest = new HashMap<>(); |
608 | ||
609 | 4 | VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD); |
610 | 4 | if (csqInfo == null) |
611 | { | |
612 | 3 | return; |
613 | } | |
614 | ||
615 | /* | |
616 | * parse out the pipe-separated list of CSQ fields; we assume here that | |
617 | * these form the last part of the description, and contain no spaces | |
618 | */ | |
619 | 1 | String desc = csqInfo.getDescription(); |
620 | 1 | int spacePos = desc.lastIndexOf(" "); |
621 | 1 | desc = desc.substring(spacePos + 1); |
622 | ||
623 | 1 | if (desc != null) |
624 | { | |
625 | 1 | String[] format = desc.split(PIPE_REGEX); |
626 | 1 | int index = 0; |
627 | 1 | for (String field : format) |
628 | { | |
629 | 9 | if (CSQ_CONSEQUENCE_KEY.equals(field)) |
630 | { | |
631 | 1 | csqConsequenceFieldIndex = index; |
632 | } | |
633 | 9 | if (CSQ_ALLELE_NUM_KEY.equals(field)) |
634 | { | |
635 | 0 | csqAlleleNumberFieldIndex = index; |
636 | } | |
637 | 9 | if (CSQ_ALLELE_KEY.equals(field)) |
638 | { | |
639 | 1 | csqAlleleFieldIndex = index; |
640 | } | |
641 | 9 | if (CSQ_FEATURE_KEY.equals(field)) |
642 | { | |
643 | 1 | csqFeatureFieldIndex = index; |
644 | } | |
645 | ||
646 | 9 | if (isFieldWanted(field, vepFieldFilters)) |
647 | { | |
648 | 9 | vepFieldsOfInterest.put(index, field); |
649 | } | |
650 | ||
651 | 9 | index++; |
652 | } | |
653 | } | |
654 | } | |
655 | ||
656 | /** | |
657 | * Reads the Preference value for the given key, with default specified if no | |
658 | * preference set. The value is interpreted as a comma-separated list of | |
659 | * regular expressions, and converted into a list of compiled patterns ready | |
660 | * for matching. Patterns are forced to upper-case for non-case-sensitive | |
661 | * matching. | |
662 | * <p> | |
663 | * This supports user-defined filters for fields of interest to capture while | |
664 | * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO | |
665 | * fields with an ID of AF, or starting with AC, would be matched. | |
666 | * | |
667 | * @param key | |
668 | * @param def | |
669 | * @return | |
670 | */ | |
671 | 8 | private List<Pattern> getFieldMatchers(String key, String def) |
672 | { | |
673 | 8 | String pref = Cache.getDefault(key, def); |
674 | 8 | List<Pattern> patterns = new ArrayList<>(); |
675 | 8 | String[] tokens = pref.split(","); |
676 | 8 | for (String token : tokens) |
677 | { | |
678 | 8 | try |
679 | { | |
680 | 8 | patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT))); |
681 | } catch (PatternSyntaxException e) | |
682 | { | |
683 | 0 | jalview.bin.Console.errPrintln("Invalid pattern ignored: " + token); |
684 | } | |
685 | } | |
686 | 8 | return patterns; |
687 | } | |
688 | ||
689 | /** | |
690 | * Transfers VCF features to sequences to which this sequence has a mapping. | |
691 | * | |
692 | * @param seq | |
693 | */ | |
694 | 11 | protected void transferAddedFeatures(SequenceI seq) |
695 | { | |
696 | 11 | List<DBRefEntry> dbrefs = seq.getDBRefs(); |
697 | 11 | if (dbrefs == null) |
698 | { | |
699 | 0 | return; |
700 | } | |
701 | 11 | for (DBRefEntry dbref : dbrefs) |
702 | { | |
703 | 16 | Mapping mapping = dbref.getMap(); |
704 | 16 | if (mapping == null || mapping.getTo() == null) |
705 | { | |
706 | 11 | continue; |
707 | } | |
708 | ||
709 | 5 | SequenceI mapTo = mapping.getTo(); |
710 | 5 | MapList map = mapping.getMap(); |
711 | 5 | if (map.getFromRatio() == 3) |
712 | { | |
713 | /* | |
714 | * dna-to-peptide product mapping | |
715 | */ | |
716 | // JAL-3187 render on the fly instead | |
717 | // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); | |
718 | } | |
719 | else | |
720 | { | |
721 | /* | |
722 | * nucleotide-to-nucleotide mapping e.g. transcript to CDS | |
723 | */ | |
724 | 0 | List<SequenceFeature> features = seq.getFeatures() |
725 | .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT); | |
726 | 0 | for (SequenceFeature sf : features) |
727 | { | |
728 | 0 | if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup())) |
729 | { | |
730 | 0 | transferFeature(sf, mapTo, map); |
731 | } | |
732 | } | |
733 | } | |
734 | } | |
735 | } | |
736 | ||
737 | /** | |
738 | * Tries to add overlapping variants read from a VCF file to the given | |
739 | * sequence, and returns the number of variant features added | |
740 | * | |
741 | * @param seq | |
742 | * @return | |
743 | */ | |
744 | 24 | protected int loadSequenceVCF(SequenceI seq) |
745 | { | |
746 | 24 | VCFMap vcfMap = getVcfMap(seq); |
747 | 24 | if (vcfMap == null) |
748 | { | |
749 | 0 | return 0; |
750 | } | |
751 | ||
752 | /* | |
753 | * work with the dataset sequence here | |
754 | */ | |
755 | 24 | SequenceI dss = seq.getDatasetSequence(); |
756 | 24 | if (dss == null) |
757 | { | |
758 | 3 | dss = seq; |
759 | } | |
760 | 24 | return addVcfVariants(dss, vcfMap); |
761 | } | |
762 | ||
763 | /** | |
764 | * Answers a map from sequence coordinates to VCF chromosome ranges | |
765 | * | |
766 | * @param seq | |
767 | * @return | |
768 | */ | |
769 | 24 | private VCFMap getVcfMap(SequenceI seq) |
770 | { | |
771 | /* | |
772 | * simplest case: sequence has id and length matching a VCF contig | |
773 | */ | |
774 | 24 | VCFMap vcfMap = null; |
775 | 24 | if (dictionary != null) |
776 | { | |
777 | 3 | vcfMap = getContigMap(seq); |
778 | } | |
779 | 24 | if (vcfMap != null) |
780 | { | |
781 | 3 | return vcfMap; |
782 | } | |
783 | ||
784 | /* | |
785 | * otherwise, map to VCF from chromosomal coordinates | |
786 | * of the sequence (if known) | |
787 | */ | |
788 | 21 | GeneLociI seqCoords = seq.getGeneLoci(); |
789 | 21 | if (seqCoords == null) |
790 | { | |
791 | 0 | Console.warn(String.format( |
792 | "Can't query VCF for %s as chromosome coordinates not known", | |
793 | seq.getName())); | |
794 | 0 | return null; |
795 | } | |
796 | ||
797 | 21 | String species = seqCoords.getSpeciesId(); |
798 | 21 | String chromosome = seqCoords.getChromosomeId(); |
799 | 21 | String seqRef = seqCoords.getAssemblyId(); |
800 | 21 | MapList map = seqCoords.getMapping(); |
801 | ||
802 | // note this requires the configured species to match that | |
803 | // returned with the Ensembl sequence; todo: support aliases? | |
804 | 21 | if (!vcfSpecies.equalsIgnoreCase(species)) |
805 | { | |
806 | 0 | Console.warn("No VCF loaded to " + seq.getName() |
807 | + " as species not matched"); | |
808 | 0 | return null; |
809 | } | |
810 | ||
811 | 21 | if (seqRef.equalsIgnoreCase(vcfAssembly)) |
812 | { | |
813 | 21 | return new VCFMap(chromosome, map); |
814 | } | |
815 | ||
816 | /* | |
817 | * VCF data has a different reference assembly to the sequence: | |
818 | * query Ensembl to map chromosomal coordinates from sequence to VCF | |
819 | */ | |
820 | 0 | List<int[]> toVcfRanges = new ArrayList<>(); |
821 | 0 | List<int[]> fromSequenceRanges = new ArrayList<>(); |
822 | ||
823 | 0 | for (int[] range : map.getToRanges()) |
824 | { | |
825 | 0 | int[] fromRange = map.locateInFrom(range[0], range[1]); |
826 | 0 | if (fromRange == null) |
827 | { | |
828 | // corrupted map?!? | |
829 | 0 | continue; |
830 | } | |
831 | ||
832 | 0 | int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, |
833 | vcfAssembly); | |
834 | 0 | if (newRange == null) |
835 | { | |
836 | 0 | Console.error(String.format("Failed to map %s:%s:%s:%d:%d to %s", |
837 | species, chromosome, seqRef, range[0], range[1], | |
838 | vcfAssembly)); | |
839 | 0 | continue; |
840 | } | |
841 | else | |
842 | { | |
843 | 0 | toVcfRanges.add(newRange); |
844 | 0 | fromSequenceRanges.add(fromRange); |
845 | } | |
846 | } | |
847 | ||
848 | 0 | return new VCFMap(chromosome, |
849 | new MapList(fromSequenceRanges, toVcfRanges, 1, 1)); | |
850 | } | |
851 | ||
852 | /** | |
853 | * If the sequence id matches a contig declared in the VCF file, and the | |
854 | * sequence length matches the contig length, then returns a 1:1 map of the | |
855 | * sequence to the contig, else returns null | |
856 | * | |
857 | * @param seq | |
858 | * @return | |
859 | */ | |
860 | 3 | private VCFMap getContigMap(SequenceI seq) |
861 | { | |
862 | 3 | String id = seq.getName(); |
863 | 3 | SAMSequenceRecord contig = dictionary.getSequence(id); |
864 | 3 | if (contig != null) |
865 | { | |
866 | 3 | int len = seq.getLength(); |
867 | 3 | if (len == contig.getSequenceLength()) |
868 | { | |
869 | 3 | MapList map = new MapList(new int[] { 1, len }, |
870 | new int[] | |
871 | { 1, len }, 1, 1); | |
872 | 3 | return new VCFMap(id, map); |
873 | } | |
874 | } | |
875 | 0 | return null; |
876 | } | |
877 | ||
878 | /** | |
879 | * Queries the VCF reader for any variants that overlap the mapped chromosome | |
880 | * ranges of the sequence, and adds as variant features. Returns the number of | |
881 | * overlapping variants found. | |
882 | * | |
883 | * @param seq | |
884 | * @param map | |
885 | * mapping from sequence to VCF coordinates | |
886 | * @return | |
887 | */ | |
888 | 24 | protected int addVcfVariants(SequenceI seq, VCFMap map) |
889 | { | |
890 | 24 | boolean forwardStrand = map.map.isToForwardStrand(); |
891 | ||
892 | /* | |
893 | * query the VCF for overlaps of each contiguous chromosomal region | |
894 | */ | |
895 | 24 | int count = 0; |
896 | ||
897 | 24 | for (int[] range : map.map.getToRanges()) |
898 | { | |
899 | 39 | int vcfStart = Math.min(range[0], range[1]); |
900 | 39 | int vcfEnd = Math.max(range[0], range[1]); |
901 | 39 | try |
902 | { | |
903 | 39 | CloseableIterator<VariantContext> variants = reader |
904 | .query(map.chromosome, vcfStart, vcfEnd); | |
905 | 75 | while (variants.hasNext()) |
906 | { | |
907 | 36 | VariantContext variant = variants.next(); |
908 | ||
909 | 36 | int[] featureRange = map.map.locateInFrom(variant.getStart(), |
910 | variant.getEnd()); | |
911 | ||
912 | /* | |
913 | * only take features whose range is fully mappable to sequence positions | |
914 | */ | |
915 | 36 | if (featureRange != null) |
916 | { | |
917 | 36 | int featureStart = Math.min(featureRange[0], featureRange[1]); |
918 | 36 | int featureEnd = Math.max(featureRange[0], featureRange[1]); |
919 | 36 | if (featureEnd - featureStart == variant.getEnd() |
920 | - variant.getStart()) | |
921 | { | |
922 | 34 | count += addAlleleFeatures(seq, variant, featureStart, |
923 | featureEnd, forwardStrand); | |
924 | } | |
925 | } | |
926 | } | |
927 | 39 | variants.close(); |
928 | } catch (TribbleException e) | |
929 | { | |
930 | /* | |
931 | * RuntimeException throwable by htsjdk | |
932 | */ | |
933 | 0 | String msg = String.format("Error reading VCF for %s:%d-%d: %s ", |
934 | map.chromosome, vcfStart, vcfEnd, e.getLocalizedMessage()); | |
935 | 0 | Console.error(msg); |
936 | } | |
937 | } | |
938 | ||
939 | 24 | return count; |
940 | } | |
941 | ||
942 | /** | |
943 | * A convenience method to get an attribute value for an alternate allele | |
944 | * | |
945 | * @param variant | |
946 | * @param attributeName | |
947 | * @param alleleIndex | |
948 | * @return | |
949 | */ | |
950 | 100 | protected String getAttributeValue(VariantContext variant, |
951 | String attributeName, int alleleIndex) | |
952 | { | |
953 | 100 | Object att = variant.getAttribute(attributeName); |
954 | ||
955 | 100 | if (att instanceof String) |
956 | { | |
957 | 39 | return (String) att; |
958 | } | |
959 | 61 | else if (att instanceof ArrayList) |
960 | { | |
961 | 61 | return ((List<String>) att).get(alleleIndex); |
962 | } | |
963 | ||
964 | 0 | return null; |
965 | } | |
966 | ||
967 | /** | |
968 | * Adds one variant feature for each allele in the VCF variant record, and | |
969 | * returns the number of features added. | |
970 | * | |
971 | * @param seq | |
972 | * @param variant | |
973 | * @param featureStart | |
974 | * @param featureEnd | |
975 | * @param forwardStrand | |
976 | * @return | |
977 | */ | |
978 | 34 | protected int addAlleleFeatures(SequenceI seq, VariantContext variant, |
979 | int featureStart, int featureEnd, boolean forwardStrand) | |
980 | { | |
981 | 34 | int added = 0; |
982 | ||
983 | /* | |
984 | * Javadoc says getAlternateAlleles() imposes no order on the list returned | |
985 | * so we proceed defensively to get them in strict order | |
986 | */ | |
987 | 34 | int altAlleleCount = variant.getAlternateAlleles().size(); |
988 | 85 | for (int i = 0; i < altAlleleCount; i++) |
989 | { | |
990 | 51 | added += addAlleleFeature(seq, variant, i, featureStart, featureEnd, |
991 | forwardStrand); | |
992 | } | |
993 | 34 | return added; |
994 | } | |
995 | ||
996 | /** | |
997 | * Inspects one allele and attempts to add a variant feature for it to the | |
998 | * sequence. The additional data associated with this allele is extracted to | |
999 | * store in the feature's key-value map. Answers the number of features added | |
1000 | * (0 or 1). | |
1001 | * | |
1002 | * @param seq | |
1003 | * @param variant | |
1004 | * @param altAlleleIndex | |
1005 | * (0, 1..) | |
1006 | * @param featureStart | |
1007 | * @param featureEnd | |
1008 | * @param forwardStrand | |
1009 | * @return | |
1010 | */ | |
1011 | 51 | protected int addAlleleFeature(SequenceI seq, VariantContext variant, |
1012 | int altAlleleIndex, int featureStart, int featureEnd, | |
1013 | boolean forwardStrand) | |
1014 | { | |
1015 | 51 | String reference = variant.getReference().getBaseString(); |
1016 | 51 | Allele alt = variant.getAlternateAllele(altAlleleIndex); |
1017 | 51 | String allele = alt.getBaseString(); |
1018 | ||
1019 | /* | |
1020 | * insertion after a genomic base, if on reverse strand, has to be | |
1021 | * converted to insertion of complement after the preceding position | |
1022 | */ | |
1023 | 51 | int referenceLength = reference.length(); |
1024 | 51 | if (!forwardStrand && allele.length() > referenceLength |
1025 | && allele.startsWith(reference)) | |
1026 | { | |
1027 | 4 | featureStart -= referenceLength; |
1028 | 4 | featureEnd = featureStart; |
1029 | 4 | char insertAfter = seq.getCharAt(featureStart - seq.getStart()); |
1030 | 4 | reference = Dna.reverseComplement(String.valueOf(insertAfter)); |
1031 | 4 | allele = allele.substring(referenceLength) + reference; |
1032 | } | |
1033 | ||
1034 | /* | |
1035 | * build the ref,alt allele description e.g. "G,A", using the base | |
1036 | * complement if the sequence is on the reverse strand | |
1037 | */ | |
1038 | 51 | StringBuilder sb = new StringBuilder(); |
1039 | 51 | sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); |
1040 | 51 | sb.append(COMMA); |
1041 | 51 | sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); |
1042 | 51 | String alleles = sb.toString(); // e.g. G,A |
1043 | ||
1044 | /* | |
1045 | * pick out the consequence data (if any) that is for the current allele | |
1046 | * and feature (transcript) that matches the current sequence | |
1047 | */ | |
1048 | 51 | String consequence = getConsequenceForAlleleAndFeature(variant, |
1049 | CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, | |
1050 | csqAlleleNumberFieldIndex, | |
1051 | seq.getName().toLowerCase(Locale.ROOT), csqFeatureFieldIndex); | |
1052 | ||
1053 | /* | |
1054 | * pick out the ontology term for the consequence type | |
1055 | */ | |
1056 | 51 | String type = SequenceOntologyI.SEQUENCE_VARIANT; |
1057 | 51 | if (consequence != null) |
1058 | { | |
1059 | 7 | type = getOntologyTerm(consequence); |
1060 | } | |
1061 | ||
1062 | 51 | SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, |
1063 | featureEnd, FEATURE_GROUP_VCF); | |
1064 | 51 | sf.setSource(sourceId); |
1065 | ||
1066 | /* | |
1067 | * save the derived alleles as a named attribute; this will be | |
1068 | * needed when Jalview computes derived peptide variants | |
1069 | */ | |
1070 | 51 | addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); |
1071 | ||
1072 | /* | |
1073 | * add selected VCF fixed column data as feature attributes | |
1074 | */ | |
1075 | 51 | addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); |
1076 | 51 | addFeatureAttribute(sf, VCF_ID, variant.getID()); |
1077 | 51 | addFeatureAttribute(sf, VCF_QUAL, |
1078 | String.valueOf(variant.getPhredScaledQual())); | |
1079 | 51 | addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); |
1080 | ||
1081 | 51 | addAlleleProperties(variant, sf, altAlleleIndex, consequence); |
1082 | ||
1083 | 51 | seq.addSequenceFeature(sf); |
1084 | ||
1085 | 51 | return 1; |
1086 | } | |
1087 | ||
1088 | /** | |
1089 | * Answers the VCF FILTER value for the variant - or an approximation to it. | |
1090 | * This field is either PASS, or a semi-colon separated list of filters not | |
1091 | * passed. htsjdk saves filters as a HashSet, so the order when reassembled | |
1092 | * into a list may be different. | |
1093 | * | |
1094 | * @param variant | |
1095 | * @return | |
1096 | */ | |
1097 | 51 | String getFilter(VariantContext variant) |
1098 | { | |
1099 | 51 | Set<String> filters = variant.getFilters(); |
1100 | 51 | if (filters.isEmpty()) |
1101 | { | |
1102 | 21 | return NO_VALUE; |
1103 | } | |
1104 | 30 | Iterator<String> iterator = filters.iterator(); |
1105 | 30 | String first = iterator.next(); |
1106 | 30 | if (filters.size() == 1) |
1107 | { | |
1108 | 11 | return first; |
1109 | } | |
1110 | ||
1111 | 19 | StringBuilder sb = new StringBuilder(first); |
1112 | 38 | while (iterator.hasNext()) |
1113 | { | |
1114 | 19 | sb.append(";").append(iterator.next()); |
1115 | } | |
1116 | ||
1117 | 19 | return sb.toString(); |
1118 | } | |
1119 | ||
1120 | /** | |
1121 | * Adds one feature attribute unless the value is null, empty or '.' | |
1122 | * | |
1123 | * @param sf | |
1124 | * @param key | |
1125 | * @param value | |
1126 | */ | |
1127 | 347 | void addFeatureAttribute(SequenceFeature sf, String key, String value) |
1128 | { | |
1129 | 347 | if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) |
1130 | { | |
1131 | 275 | sf.setValue(key, value); |
1132 | } | |
1133 | } | |
1134 | ||
1135 | /** | |
1136 | * Determines the Sequence Ontology term to use for the variant feature type | |
1137 | * in Jalview. The default is 'sequence_variant', but a more specific term is | |
1138 | * used if: | |
1139 | * <ul> | |
1140 | * <li>VEP (or SnpEff) Consequence annotation is included in the VCF</li> | |
1141 | * <li>sequence id can be matched to VEP Feature (or SnpEff Feature_ID)</li> | |
1142 | * </ul> | |
1143 | * | |
1144 | * @param consequence | |
1145 | * @return | |
1146 | * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 | |
1147 | */ | |
1148 | 7 | String getOntologyTerm(String consequence) |
1149 | { | |
1150 | 7 | String type = SequenceOntologyI.SEQUENCE_VARIANT; |
1151 | ||
1152 | /* | |
1153 | * could we associate Consequence data with this allele and feature (transcript)? | |
1154 | * if so, prefer the consequence term from that data | |
1155 | */ | |
1156 | 7 | if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 |
1157 | { | |
1158 | /* | |
1159 | * no Consequence data so we can't refine the ontology term | |
1160 | */ | |
1161 | 0 | return type; |
1162 | } | |
1163 | ||
1164 | 7 | if (consequence != null) |
1165 | { | |
1166 | 7 | String[] csqFields = consequence.split(PIPE_REGEX); |
1167 | 7 | if (csqFields.length > csqConsequenceFieldIndex) |
1168 | { | |
1169 | 7 | type = csqFields[csqConsequenceFieldIndex]; |
1170 | } | |
1171 | } | |
1172 | else | |
1173 | { | |
1174 | // todo the same for SnpEff consequence data matching if wanted | |
1175 | } | |
1176 | ||
1177 | /* | |
1178 | * if of the form (e.g.) missense_variant&splice_region_variant, | |
1179 | * just take the first ('most severe') consequence | |
1180 | */ | |
1181 | 7 | if (type != null) |
1182 | { | |
1183 | 7 | int pos = type.indexOf('&'); |
1184 | 7 | if (pos > 0) |
1185 | { | |
1186 | 0 | type = type.substring(0, pos); |
1187 | } | |
1188 | } | |
1189 | 7 | return type; |
1190 | } | |
1191 | ||
1192 | /** | |
1193 | * Returns matched consequence data if it can be found, else null. | |
1194 | * <ul> | |
1195 | * <li>inspects the VCF data for key 'vcfInfoId'</li> | |
1196 | * <li>splits this on comma (to distinct consequences)</li> | |
1197 | * <li>returns the first consequence (if any) where</li> | |
1198 | * <ul> | |
1199 | * <li>the allele matches the altAlleleIndex'th allele of variant</li> | |
1200 | * <li>the feature matches the sequence name (e.g. transcript id)</li> | |
1201 | * </ul> | |
1202 | * </ul> | |
1203 | * If matched, the consequence is returned (as pipe-delimited fields). | |
1204 | * | |
1205 | * @param variant | |
1206 | * @param vcfInfoId | |
1207 | * @param altAlleleIndex | |
1208 | * @param alleleFieldIndex | |
1209 | * @param alleleNumberFieldIndex | |
1210 | * @param seqName | |
1211 | * @param featureFieldIndex | |
1212 | * @return | |
1213 | */ | |
1214 | 51 | private String getConsequenceForAlleleAndFeature(VariantContext variant, |
1215 | String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, | |
1216 | int alleleNumberFieldIndex, String seqName, int featureFieldIndex) | |
1217 | { | |
1218 | 51 | if (alleleFieldIndex == -1 || featureFieldIndex == -1) |
1219 | { | |
1220 | 37 | return null; |
1221 | } | |
1222 | 14 | Object value = variant.getAttribute(vcfInfoId); |
1223 | ||
1224 | 14 | if (value == null || !(value instanceof List<?>)) |
1225 | { | |
1226 | 0 | return null; |
1227 | } | |
1228 | ||
1229 | /* | |
1230 | * inspect each consequence in turn (comma-separated blocks | |
1231 | * extracted by htsjdk) | |
1232 | */ | |
1233 | 14 | List<String> consequences = (List<String>) value; |
1234 | ||
1235 | 14 | for (String consequence : consequences) |
1236 | { | |
1237 | 41 | String[] csqFields = consequence.split(PIPE_REGEX); |
1238 | 41 | if (csqFields.length > featureFieldIndex) |
1239 | { | |
1240 | 41 | String featureIdentifier = csqFields[featureFieldIndex]; |
1241 | 41 | if (featureIdentifier.length() > 4 && seqName |
1242 | .indexOf(featureIdentifier.toLowerCase(Locale.ROOT)) > -1) | |
1243 | { | |
1244 | /* | |
1245 | * feature (transcript) matched - now check for allele match | |
1246 | */ | |
1247 | 10 | if (matchAllele(variant, altAlleleIndex, csqFields, |
1248 | alleleFieldIndex, alleleNumberFieldIndex)) | |
1249 | { | |
1250 | 7 | return consequence; |
1251 | } | |
1252 | } | |
1253 | } | |
1254 | } | |
1255 | 7 | return null; |
1256 | } | |
1257 | ||
1258 | 10 | private boolean matchAllele(VariantContext variant, int altAlleleIndex, |
1259 | String[] csqFields, int alleleFieldIndex, | |
1260 | int alleleNumberFieldIndex) | |
1261 | { | |
1262 | /* | |
1263 | * if ALLELE_NUM is present, it must match altAlleleIndex | |
1264 | * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex | |
1265 | */ | |
1266 | 10 | if (alleleNumberFieldIndex > -1) |
1267 | { | |
1268 | 0 | if (csqFields.length <= alleleNumberFieldIndex) |
1269 | { | |
1270 | 0 | return false; |
1271 | } | |
1272 | 0 | String alleleNum = csqFields[alleleNumberFieldIndex]; |
1273 | 0 | return String.valueOf(altAlleleIndex + 1).equals(alleleNum); |
1274 | } | |
1275 | ||
1276 | /* | |
1277 | * else consequence allele must match variant allele | |
1278 | */ | |
1279 | 10 | if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex) |
1280 | { | |
1281 | 10 | String csqAllele = csqFields[alleleFieldIndex]; |
1282 | 10 | String vcfAllele = variant.getAlternateAllele(altAlleleIndex) |
1283 | .getBaseString(); | |
1284 | 10 | return csqAllele.equals(vcfAllele); |
1285 | } | |
1286 | 0 | return false; |
1287 | } | |
1288 | ||
1289 | /** | |
1290 | * Add any allele-specific VCF key-value data to the sequence feature | |
1291 | * | |
1292 | * @param variant | |
1293 | * @param sf | |
1294 | * @param altAlelleIndex | |
1295 | * (0, 1..) | |
1296 | * @param consequence | |
1297 | * if not null, the consequence specific to this sequence (transcript | |
1298 | * feature) and allele | |
1299 | */ | |
1300 | 51 | protected void addAlleleProperties(VariantContext variant, |
1301 | SequenceFeature sf, final int altAlelleIndex, String consequence) | |
1302 | { | |
1303 | 51 | Map<String, Object> atts = variant.getAttributes(); |
1304 | ||
1305 | 51 | for (Entry<String, Object> att : atts.entrySet()) |
1306 | { | |
1307 | 174 | String key = att.getKey(); |
1308 | ||
1309 | /* | |
1310 | * extract Consequence data (if present) that we are able to | |
1311 | * associated with the allele for this variant feature | |
1312 | */ | |
1313 | 174 | if (CSQ_FIELD.equals(key)) |
1314 | { | |
1315 | 14 | addConsequences(variant, sf, consequence); |
1316 | 14 | continue; |
1317 | } | |
1318 | ||
1319 | /* | |
1320 | * filter out fields we don't want to capture | |
1321 | */ | |
1322 | 160 | if (!vcfFieldsOfInterest.contains(key)) |
1323 | { | |
1324 | 46 | continue; |
1325 | } | |
1326 | ||
1327 | /* | |
1328 | * we extract values for other data which are allele-specific; | |
1329 | * these may be per alternate allele (INFO[key].Number = 'A') | |
1330 | * or per allele including reference (INFO[key].Number = 'R') | |
1331 | */ | |
1332 | 114 | VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key); |
1333 | 114 | if (infoHeader == null) |
1334 | { | |
1335 | /* | |
1336 | * can't be sure what data belongs to this allele, so | |
1337 | * play safe and don't take any | |
1338 | */ | |
1339 | 0 | continue; |
1340 | } | |
1341 | ||
1342 | 114 | VCFHeaderLineCount number = infoHeader.getCountType(); |
1343 | 114 | int index = altAlelleIndex; |
1344 | 114 | if (number == VCFHeaderLineCount.R) |
1345 | { | |
1346 | /* | |
1347 | * one value per allele including reference, so bump index | |
1348 | * e.g. the 3rd value is for the 2nd alternate allele | |
1349 | */ | |
1350 | 14 | index++; |
1351 | } | |
1352 | 100 | else if (number != VCFHeaderLineCount.A) |
1353 | { | |
1354 | /* | |
1355 | * don't save other values as not allele-related | |
1356 | */ | |
1357 | 14 | continue; |
1358 | } | |
1359 | ||
1360 | /* | |
1361 | * take the index'th value | |
1362 | */ | |
1363 | 100 | String value = getAttributeValue(variant, key, index); |
1364 | 100 | if (value != null && isValid(variant, key, value)) |
1365 | { | |
1366 | /* | |
1367 | * decode colon, semicolon, equals sign, percent sign, comma (only) | |
1368 | * as required by the VCF specification (para 1.2) | |
1369 | */ | |
1370 | 92 | value = StringUtils.urlDecode(value, VCF_ENCODABLE); |
1371 | 92 | addFeatureAttribute(sf, key, value); |
1372 | } | |
1373 | } | |
1374 | } | |
1375 | ||
1376 | /** | |
1377 | * Answers true for '.', null, or an empty value, or if the INFO type is | |
1378 | * String. If the INFO type is Integer or Float, answers false if the value is | |
1379 | * not in valid format. | |
1380 | * | |
1381 | * @param variant | |
1382 | * @param infoId | |
1383 | * @param value | |
1384 | * @return | |
1385 | */ | |
1386 | 100 | protected boolean isValid(VariantContext variant, String infoId, |
1387 | String value) | |
1388 | { | |
1389 | 100 | if (value == null || value.isEmpty() || NO_VALUE.equals(value)) |
1390 | { | |
1391 | 8 | return true; |
1392 | } | |
1393 | 92 | VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId); |
1394 | 92 | if (infoHeader == null) |
1395 | { | |
1396 | 0 | Console.error("Field " + infoId + " has no INFO header"); |
1397 | 0 | return false; |
1398 | } | |
1399 | 92 | VCFHeaderLineType infoType = infoHeader.getType(); |
1400 | 92 | try |
1401 | { | |
1402 | 92 | if (infoType == VCFHeaderLineType.Integer) |
1403 | { | |
1404 | 27 | Integer.parseInt(value); |
1405 | } | |
1406 | 65 | else if (infoType == VCFHeaderLineType.Float) |
1407 | { | |
1408 | 65 | Float.parseFloat(value); |
1409 | } | |
1410 | } catch (NumberFormatException e) | |
1411 | { | |
1412 | 8 | logInvalidValue(variant, infoId, value); |
1413 | 8 | return false; |
1414 | } | |
1415 | 84 | return true; |
1416 | } | |
1417 | ||
1418 | /** | |
1419 | * Logs an error message for malformed data; duplicate messages (same id and | |
1420 | * value) are not logged | |
1421 | * | |
1422 | * @param variant | |
1423 | * @param infoId | |
1424 | * @param value | |
1425 | */ | |
1426 | 8 | private void logInvalidValue(VariantContext variant, String infoId, |
1427 | String value) | |
1428 | { | |
1429 | 8 | if (badData == null) |
1430 | { | |
1431 | 2 | badData = new HashSet<>(); |
1432 | } | |
1433 | 8 | String token = infoId + ":" + value; |
1434 | 8 | if (!badData.contains(token)) |
1435 | { | |
1436 | 4 | badData.add(token); |
1437 | 4 | Console.error(String.format("Invalid VCF data at %s:%d %s=%s", |
1438 | variant.getContig(), variant.getStart(), infoId, value)); | |
1439 | } | |
1440 | } | |
1441 | ||
1442 | /** | |
1443 | * Inspects CSQ data blocks (consequences) and adds attributes on the sequence | |
1444 | * feature. | |
1445 | * <p> | |
1446 | * If <code>myConsequence</code> is not null, then this is the specific | |
1447 | * consequence data (pipe-delimited fields) that is for the current allele and | |
1448 | * transcript (sequence) being processed) | |
1449 | * | |
1450 | * @param variant | |
1451 | * @param sf | |
1452 | * @param myConsequence | |
1453 | */ | |
1454 | 14 | protected void addConsequences(VariantContext variant, SequenceFeature sf, |
1455 | String myConsequence) | |
1456 | { | |
1457 | 14 | Object value = variant.getAttribute(CSQ_FIELD); |
1458 | ||
1459 | 14 | if (value == null || !(value instanceof List<?>)) |
1460 | { | |
1461 | 0 | return; |
1462 | } | |
1463 | ||
1464 | 14 | List<String> consequences = (List<String>) value; |
1465 | ||
1466 | /* | |
1467 | * inspect CSQ consequences; restrict to the consequence | |
1468 | * associated with the current transcript (Feature) | |
1469 | */ | |
1470 | 14 | Map<String, String> csqValues = new HashMap<>(); |
1471 | ||
1472 | 14 | for (String consequence : consequences) |
1473 | { | |
1474 | 50 | if (myConsequence == null || myConsequence.equals(consequence)) |
1475 | { | |
1476 | 31 | String[] csqFields = consequence.split(PIPE_REGEX); |
1477 | ||
1478 | /* | |
1479 | * inspect individual fields of this consequence, copying non-null | |
1480 | * values which are 'fields of interest' | |
1481 | */ | |
1482 | 31 | int i = 0; |
1483 | 31 | for (String field : csqFields) |
1484 | { | |
1485 | 279 | if (field != null && field.length() > 0) |
1486 | { | |
1487 | 279 | String id = vepFieldsOfInterest.get(i); |
1488 | 279 | if (id != null) |
1489 | { | |
1490 | /* | |
1491 | * VCF spec requires encoding of special characters e.g. '=' | |
1492 | * so decode them here before storing | |
1493 | */ | |
1494 | 279 | field = StringUtils.urlDecode(field, VCF_ENCODABLE); |
1495 | 279 | csqValues.put(id, field); |
1496 | } | |
1497 | } | |
1498 | 279 | i++; |
1499 | } | |
1500 | } | |
1501 | } | |
1502 | ||
1503 | 14 | if (!csqValues.isEmpty()) |
1504 | { | |
1505 | 14 | sf.setValue(CSQ_FIELD, csqValues); |
1506 | } | |
1507 | } | |
1508 | ||
1509 | /** | |
1510 | * A convenience method to complement a dna base and return the string value | |
1511 | * of its complement | |
1512 | * | |
1513 | * @param reference | |
1514 | * @return | |
1515 | */ | |
1516 | 0 | protected String complement(byte[] reference) |
1517 | { | |
1518 | 0 | return String.valueOf(Dna.getComplement((char) reference[0])); |
1519 | } | |
1520 | ||
1521 | /** | |
1522 | * Determines the location of the query range (chromosome positions) in a | |
1523 | * different reference assembly. | |
1524 | * <p> | |
1525 | * If the range is just a subregion of one for which we already have a mapping | |
1526 | * (for example, an exon sub-region of a gene), then the mapping is just | |
1527 | * computed arithmetically. | |
1528 | * <p> | |
1529 | * Otherwise, calls the Ensembl REST service that maps from one assembly | |
1530 | * reference's coordinates to another's | |
1531 | * | |
1532 | * @param queryRange | |
1533 | * start-end chromosomal range in 'fromRef' coordinates | |
1534 | * @param chromosome | |
1535 | * @param species | |
1536 | * @param fromRef | |
1537 | * assembly reference for the query coordinates | |
1538 | * @param toRef | |
1539 | * assembly reference we wish to translate to | |
1540 | * @return the start-end range in 'toRef' coordinates | |
1541 | */ | |
1542 | 0 | protected int[] mapReferenceRange(int[] queryRange, String chromosome, |
1543 | String species, String fromRef, String toRef) | |
1544 | { | |
1545 | /* | |
1546 | * first try shorcut of computing the mapping as a subregion of one | |
1547 | * we already have (e.g. for an exon, if we have the gene mapping) | |
1548 | */ | |
1549 | 0 | int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, |
1550 | species, fromRef, toRef); | |
1551 | 0 | if (mappedRange != null) |
1552 | { | |
1553 | 0 | return mappedRange; |
1554 | } | |
1555 | ||
1556 | /* | |
1557 | * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 | |
1558 | */ | |
1559 | 0 | EnsemblMap mapper = new EnsemblMap(); |
1560 | 0 | int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, |
1561 | toRef, queryRange); | |
1562 | ||
1563 | 0 | if (mapping == null) |
1564 | { | |
1565 | // mapping service failure | |
1566 | 0 | return null; |
1567 | } | |
1568 | ||
1569 | /* | |
1570 | * save mapping for possible future re-use | |
1571 | */ | |
1572 | 0 | String key = makeRangesKey(chromosome, species, fromRef, toRef); |
1573 | 0 | if (!assemblyMappings.containsKey(key)) |
1574 | { | |
1575 | 0 | assemblyMappings.put(key, new HashMap<int[], int[]>()); |
1576 | } | |
1577 | ||
1578 | 0 | assemblyMappings.get(key).put(queryRange, mapping); |
1579 | ||
1580 | 0 | return mapping; |
1581 | } | |
1582 | ||
1583 | /** | |
1584 | * If we already have a 1:1 contiguous mapping which subsumes the given query | |
1585 | * range, this method just calculates and returns the subset of that mapping, | |
1586 | * else it returns null. In practical terms, if a gene has a contiguous | |
1587 | * mapping between (for example) GRCh37 and GRCh38, then we assume that its | |
1588 | * subsidiary exons occupy unchanged relative positions, and just compute | |
1589 | * these as offsets, rather than do another lookup of the mapping. | |
1590 | * <p> | |
1591 | * If in future these assumptions prove invalid (e.g. for bacterial dna?!), | |
1592 | * simply remove this method or let it always return null. | |
1593 | * <p> | |
1594 | * Warning: many rapid calls to the /map service map result in a 429 overload | |
1595 | * error response | |
1596 | * | |
1597 | * @param queryRange | |
1598 | * @param chromosome | |
1599 | * @param species | |
1600 | * @param fromRef | |
1601 | * @param toRef | |
1602 | * @return | |
1603 | */ | |
1604 | 0 | protected int[] findSubsumedRangeMapping(int[] queryRange, |
1605 | String chromosome, String species, String fromRef, String toRef) | |
1606 | { | |
1607 | 0 | String key = makeRangesKey(chromosome, species, fromRef, toRef); |
1608 | 0 | if (assemblyMappings.containsKey(key)) |
1609 | { | |
1610 | 0 | Map<int[], int[]> mappedRanges = assemblyMappings.get(key); |
1611 | 0 | for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet()) |
1612 | { | |
1613 | 0 | int[] fromRange = mappedRange.getKey(); |
1614 | 0 | int[] toRange = mappedRange.getValue(); |
1615 | 0 | if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) |
1616 | { | |
1617 | /* | |
1618 | * mapping is 1:1 in length, so we trust it to have no discontinuities | |
1619 | */ | |
1620 | 0 | if (MappingUtils.rangeContains(fromRange, queryRange)) |
1621 | { | |
1622 | /* | |
1623 | * fromRange subsumes our query range | |
1624 | */ | |
1625 | 0 | int offset = queryRange[0] - fromRange[0]; |
1626 | 0 | int mappedRangeFrom = toRange[0] + offset; |
1627 | 0 | int mappedRangeTo = mappedRangeFrom |
1628 | + (queryRange[1] - queryRange[0]); | |
1629 | 0 | return new int[] { mappedRangeFrom, mappedRangeTo }; |
1630 | } | |
1631 | } | |
1632 | } | |
1633 | } | |
1634 | 0 | return null; |
1635 | } | |
1636 | ||
1637 | /** | |
1638 | * Transfers the sequence feature to the target sequence, locating its start | |
1639 | * and end range based on the mapping. Features which do not overlap the | |
1640 | * target sequence are ignored. | |
1641 | * | |
1642 | * @param sf | |
1643 | * @param targetSequence | |
1644 | * @param mapping | |
1645 | * mapping from the feature's coordinates to the target sequence | |
1646 | */ | |
1647 | 0 | protected void transferFeature(SequenceFeature sf, |
1648 | SequenceI targetSequence, MapList mapping) | |
1649 | { | |
1650 | 0 | int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd()); |
1651 | ||
1652 | 0 | if (mappedRange != null) |
1653 | { | |
1654 | 0 | String group = sf.getFeatureGroup(); |
1655 | 0 | int newBegin = Math.min(mappedRange[0], mappedRange[1]); |
1656 | 0 | int newEnd = Math.max(mappedRange[0], mappedRange[1]); |
1657 | 0 | SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, |
1658 | group, sf.getScore()); | |
1659 | 0 | targetSequence.addSequenceFeature(copy); |
1660 | } | |
1661 | } | |
1662 | ||
1663 | /** | |
1664 | * Formats a ranges map lookup key | |
1665 | * | |
1666 | * @param chromosome | |
1667 | * @param species | |
1668 | * @param fromRef | |
1669 | * @param toRef | |
1670 | * @return | |
1671 | */ | |
1672 | 0 | protected static String makeRangesKey(String chromosome, String species, |
1673 | String fromRef, String toRef) | |
1674 | { | |
1675 | 0 | return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; |
1676 | } | |
1677 | } |