Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
Gff3Helper | 40 | 92 | 28 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io.gff; | |
22 | ||
23 | import jalview.datamodel.AlignedCodonFrame; | |
24 | import jalview.datamodel.AlignmentI; | |
25 | import jalview.datamodel.MappingType; | |
26 | import jalview.datamodel.SequenceFeature; | |
27 | import jalview.datamodel.SequenceI; | |
28 | import jalview.util.MapList; | |
29 | import jalview.util.StringUtils; | |
30 | ||
31 | import java.io.IOException; | |
32 | import java.util.List; | |
33 | import java.util.Map; | |
34 | ||
35 | /** | |
36 | * Base class with generic / common functionality for processing GFF3 data. | |
37 | * Override this as required for any specialisations resulting from | |
38 | * peculiarities of GFF3 generated by particular tools. | |
39 | */ | |
40 | public class Gff3Helper extends GffHelperBase | |
41 | { | |
42 | public static final String ALLELES = "alleles"; | |
43 | ||
44 | protected static final String TARGET = "Target"; | |
45 | ||
46 | protected static final String ID = "ID"; | |
47 | ||
48 | private static final String NAME = "Name"; | |
49 | ||
50 | /** | |
51 | * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to | |
52 | * separate multiple values for a name | |
53 | * | |
54 | * @param text | |
55 | * @return | |
56 | */ | |
57 | 8 | public static Map<String, List<String>> parseNameValuePairs(String text) |
58 | { | |
59 | 8 | return parseNameValuePairs(text, ";", '=', ","); |
60 | } | |
61 | ||
62 | /** | |
63 | * Process one GFF feature line (as modelled by SequenceFeature) | |
64 | * | |
65 | * @param seq | |
66 | * the sequence with which this feature is associated | |
67 | * @param sf | |
68 | * the sequence feature with ATTRIBUTES property containing any | |
69 | * additional attributes | |
70 | * @param align | |
71 | * the alignment we are adding GFF to | |
72 | * @param newseqs | |
73 | * any new sequences referenced by the GFF | |
74 | * @param relaxedIdMatching | |
75 | * if true, match word tokens in sequence names | |
76 | * @return true if the sequence feature should be added to the sequence, else | |
77 | * false (i.e. it has been processed in another way e.g. to generate a | |
78 | * mapping) | |
79 | * @throws IOException | |
80 | */ | |
81 | 7 | @Override |
82 | public SequenceFeature processGff(SequenceI seq, String[] gff, | |
83 | AlignmentI align, List<SequenceI> newseqs, | |
84 | boolean relaxedIdMatching) throws IOException | |
85 | { | |
86 | 7 | SequenceFeature sf = null; |
87 | ||
88 | 7 | if (gff.length == 9) |
89 | { | |
90 | 7 | String soTerm = gff[TYPE_COL]; |
91 | 7 | String atts = gff[ATTRIBUTES_COL]; |
92 | 7 | Map<String, List<String>> attributes = parseNameValuePairs(atts); |
93 | ||
94 | 7 | SequenceOntologyI so = SequenceOntologyFactory.getInstance(); |
95 | 7 | if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH)) |
96 | { | |
97 | 0 | sf = processProteinMatch(attributes, seq, gff, align, newseqs, |
98 | relaxedIdMatching); | |
99 | } | |
100 | 7 | else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH)) |
101 | { | |
102 | 5 | sf = processNucleotideMatch(attributes, seq, gff, align, newseqs, |
103 | relaxedIdMatching); | |
104 | } | |
105 | else | |
106 | { | |
107 | 2 | sf = buildSequenceFeature(gff, attributes); |
108 | } | |
109 | } | |
110 | else | |
111 | { | |
112 | /* | |
113 | * fall back on generating a sequence feature with no special processing | |
114 | */ | |
115 | 0 | sf = buildSequenceFeature(gff, null); |
116 | } | |
117 | ||
118 | 7 | return sf; |
119 | } | |
120 | ||
121 | /** | |
122 | * Processes one GFF3 nucleotide (e.g. cDNA to genome) match. | |
123 | * | |
124 | * @param attributes | |
125 | * parsed GFF column 9 key/value(s) | |
126 | * @param seq | |
127 | * the sequence the GFF feature is on | |
128 | * @param gffColumns | |
129 | * the GFF column data | |
130 | * @param align | |
131 | * the alignment the sequence belongs to, where any new mappings | |
132 | * should be added | |
133 | * @param newseqs | |
134 | * a list of new 'virtual sequences' generated while parsing GFF | |
135 | * @param relaxedIdMatching | |
136 | * if true allow fuzzy search for a matching target sequence | |
137 | * @return a sequence feature, if one should be added to the sequence, else | |
138 | * null | |
139 | * @throws IOException | |
140 | */ | |
141 | 5 | protected SequenceFeature processNucleotideMatch( |
142 | Map<String, List<String>> attributes, SequenceI seq, | |
143 | String[] gffColumns, AlignmentI align, List<SequenceI> newseqs, | |
144 | boolean relaxedIdMatching) throws IOException | |
145 | { | |
146 | 5 | String strand = gffColumns[STRAND_COL]; |
147 | ||
148 | /* | |
149 | * (For now) we don't process mappings from reverse complement ; to do | |
150 | * this would require (a) creating a virtual sequence placeholder for | |
151 | * the reverse complement (b) resolving the sequence by its id from some | |
152 | * source (GFF ##FASTA or other) (c) creating the reverse complement | |
153 | * sequence (d) updating the mapping to be to the reverse complement | |
154 | */ | |
155 | 5 | if ("-".equals(strand)) |
156 | { | |
157 | 1 | jalview.bin.Console.errPrintln( |
158 | "Skipping mapping from reverse complement as not yet supported"); | |
159 | 1 | return null; |
160 | } | |
161 | ||
162 | 4 | List<String> targets = attributes.get(TARGET); |
163 | 4 | if (targets == null) |
164 | { | |
165 | 0 | jalview.bin.Console.errPrintln("'Target' missing in GFF"); |
166 | 0 | return null; |
167 | } | |
168 | ||
169 | /* | |
170 | * Typically we only expect one Target per GFF line, but this can handle | |
171 | * multiple matches, to the same or different sequences (e.g. dna variants) | |
172 | */ | |
173 | 4 | for (String target : targets) |
174 | { | |
175 | /* | |
176 | * Process "seqid start end [strand]" | |
177 | */ | |
178 | 4 | String[] tokens = target.split(" "); |
179 | 4 | if (tokens.length < 3) |
180 | { | |
181 | 0 | jalview.bin.Console.errPrintln("Incomplete Target: " + target); |
182 | 0 | continue; |
183 | } | |
184 | ||
185 | /* | |
186 | * Locate the mapped sequence in the alignment, or as a | |
187 | * (new or existing) virtual sequence in the newseqs list | |
188 | */ | |
189 | 4 | String targetId = findTargetId(tokens[0], attributes); |
190 | 4 | SequenceI mappedSequence1 = findSequence(targetId, align, newseqs, |
191 | relaxedIdMatching); | |
192 | 4 | SequenceI mappedSequence = mappedSequence1; |
193 | 4 | if (mappedSequence == null) |
194 | { | |
195 | 0 | continue; |
196 | } | |
197 | ||
198 | /* | |
199 | * get any existing mapping for these sequences (or start one), | |
200 | * and add this mapped range | |
201 | */ | |
202 | 4 | AlignedCodonFrame acf = getMapping(align, seq, mappedSequence); |
203 | ||
204 | 4 | try |
205 | { | |
206 | 4 | int toStart = Integer.parseInt(tokens[1]); |
207 | 4 | int toEnd = Integer.parseInt(tokens[2]); |
208 | 4 | if (tokens.length > 3 && "-".equals(tokens[3])) |
209 | { | |
210 | // mapping to reverse strand - swap start/end | |
211 | 1 | int temp = toStart; |
212 | 1 | toStart = toEnd; |
213 | 1 | toEnd = temp; |
214 | } | |
215 | ||
216 | 4 | int fromStart = Integer.parseInt(gffColumns[START_COL]); |
217 | 4 | int fromEnd = Integer.parseInt(gffColumns[END_COL]); |
218 | 4 | MapList mapping = constructMappingFromAlign(fromStart, fromEnd, |
219 | toStart, toEnd, MappingType.NucleotideToNucleotide); | |
220 | ||
221 | 4 | if (mapping != null) |
222 | { | |
223 | 4 | acf.addMap(seq, mappedSequence, mapping); |
224 | 4 | align.addCodonFrame(acf); |
225 | } | |
226 | } catch (NumberFormatException nfe) | |
227 | { | |
228 | 0 | jalview.bin.Console |
229 | .errPrintln("Invalid start or end in Target " + target); | |
230 | } | |
231 | } | |
232 | ||
233 | 4 | SequenceFeature sf = buildSequenceFeature(gffColumns, attributes); |
234 | 4 | return sf; |
235 | } | |
236 | ||
237 | /** | |
238 | * Returns the target sequence id extracted from the GFF name/value pairs. | |
239 | * Default (standard behaviour) is the first token for "Target". This may be | |
240 | * overridden where tools report this in a non-standard way. | |
241 | * | |
242 | * @param target | |
243 | * first token of a "Target" value from GFF column 9, typically | |
244 | * "seqid start end" | |
245 | * @param set | |
246 | * a map with all parsed column 9 attributes | |
247 | * @return | |
248 | */ | |
249 | 4 | @SuppressWarnings("unused") |
250 | protected String findTargetId(String target, | |
251 | Map<String, List<String>> set) | |
252 | { | |
253 | 4 | return target; |
254 | } | |
255 | ||
256 | /** | |
257 | * Processes one GFF 'protein_match'; fields of interest are | |
258 | * <ul> | |
259 | * <li>feature group - the database reporting a match e.g. Pfam</li> | |
260 | * <li>Name - the matched entry's accession id in the database</li> | |
261 | * <li>ID - a sequence identifier for the matched region (which may be | |
262 | * appended as FASTA in the GFF file)</li> | |
263 | * </ul> | |
264 | * | |
265 | * @param set | |
266 | * parsed GFF column 9 key/value(s) | |
267 | * @param seq | |
268 | * the sequence the GFF feature is on | |
269 | * @param gffColumns | |
270 | * the sequence feature holding GFF data | |
271 | * @param align | |
272 | * the alignment the sequence belongs to, where any new mappings | |
273 | * should be added | |
274 | * @param newseqs | |
275 | * a list of new 'virtual sequences' generated while parsing GFF | |
276 | * @param relaxedIdMatching | |
277 | * if true allow fuzzy search for a matching target sequence | |
278 | * @return the (real or virtual) sequence(s) mapped to by this match | |
279 | * @throws IOException | |
280 | */ | |
281 | 1 | protected SequenceFeature processProteinMatch( |
282 | Map<String, List<String>> set, SequenceI seq, String[] gffColumns, | |
283 | AlignmentI align, List<SequenceI> newseqs, | |
284 | boolean relaxedIdMatching) | |
285 | { | |
286 | // This is currently tailored to InterProScan GFF output: | |
287 | // ID holds the ID of the matched sequence, Target references the | |
288 | // query sequence; this looks wrong, as ID should just be the GFF internal | |
289 | // ID of the GFF feature, while Target would normally reference the matched | |
290 | // sequence. | |
291 | // TODO refactor as needed if other protein-protein GFF varies | |
292 | ||
293 | 1 | SequenceFeature sf = buildSequenceFeature(gffColumns, set); |
294 | ||
295 | /* | |
296 | * locate the mapped sequence in the alignment, or as a | |
297 | * (new or existing) virtual sequence in the newseqs list | |
298 | */ | |
299 | 1 | List<String> targets = set.get(TARGET); |
300 | 1 | if (targets != null) |
301 | { | |
302 | 1 | for (String target : targets) |
303 | { | |
304 | ||
305 | 1 | SequenceI mappedSequence1 = findSequence(findTargetId(target, set), |
306 | align, newseqs, relaxedIdMatching); | |
307 | 1 | SequenceI mappedSequence = mappedSequence1; |
308 | 1 | if (mappedSequence == null) |
309 | { | |
310 | 0 | continue; |
311 | } | |
312 | ||
313 | /* | |
314 | * give the mapped sequence a copy of the sequence feature, with | |
315 | * start/end range adjusted | |
316 | */ | |
317 | 1 | int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin(); |
318 | 1 | SequenceFeature sf2 = new SequenceFeature(sf, 1, |
319 | sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore()); | |
320 | 1 | mappedSequence.addSequenceFeature(sf2); |
321 | ||
322 | /* | |
323 | * add a property to the mapped sequence so that it can eventually be | |
324 | * renamed with its qualified accession id; renaming has to wait until | |
325 | * all sequence reference resolution is complete | |
326 | */ | |
327 | 1 | String accessionId = StringUtils |
328 | .listToDelimitedString(set.get(NAME), ","); | |
329 | 1 | if (accessionId.length() > 0) |
330 | { | |
331 | 1 | String database = sf.getType(); // TODO InterProScan only?? |
332 | 1 | String qualifiedAccId = database + "|" + accessionId; |
333 | 1 | sf2.setValue(RENAME_TOKEN, qualifiedAccId); |
334 | } | |
335 | ||
336 | /* | |
337 | * get any existing mapping for these sequences (or start one), | |
338 | * and add this mapped range | |
339 | */ | |
340 | 1 | AlignedCodonFrame alco = getMapping(align, seq, mappedSequence); |
341 | 1 | int[] from = new int[] { sf.getBegin(), sf.getEnd() }; |
342 | 1 | int[] to = new int[] { 1, sequenceFeatureLength }; |
343 | 1 | MapList mapping = new MapList(from, to, 1, 1); |
344 | ||
345 | 1 | alco.addMap(seq, mappedSequence, mapping); |
346 | 1 | align.addCodonFrame(alco); |
347 | } | |
348 | } | |
349 | ||
350 | 1 | return sf; |
351 | } | |
352 | ||
353 | /** | |
354 | * Modifies the default SequenceFeature in order to set the Target sequence id | |
355 | * as the description | |
356 | */ | |
357 | 7 | @Override |
358 | protected SequenceFeature buildSequenceFeature(String[] gff, | |
359 | int typeColumn, String group, | |
360 | Map<String, List<String>> attributes) | |
361 | { | |
362 | 7 | SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group, |
363 | attributes); | |
364 | 7 | String desc = getDescription(sf, attributes); |
365 | 7 | if (desc != null) |
366 | { | |
367 | 6 | sf.setDescription(desc); |
368 | } | |
369 | 7 | return sf; |
370 | } | |
371 | ||
372 | /** | |
373 | * Apply heuristic rules to try to get the most useful feature description | |
374 | * | |
375 | * @param sf | |
376 | * @param attributes | |
377 | * @return | |
378 | */ | |
379 | 15 | protected String getDescription(SequenceFeature sf, |
380 | Map<String, List<String>> attributes) | |
381 | { | |
382 | 15 | String desc = null; |
383 | 15 | String target = (String) sf.getValue(TARGET); |
384 | 15 | if (target != null) |
385 | { | |
386 | 6 | desc = target.split(" ")[0]; |
387 | } | |
388 | ||
389 | 15 | SequenceOntologyI so = SequenceOntologyFactory.getInstance(); |
390 | 15 | String type = sf.getType(); |
391 | 15 | if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) |
392 | { | |
393 | /* | |
394 | * Ensembl returns dna variants as 'alleles' | |
395 | */ | |
396 | 2 | desc = StringUtils.listToDelimitedString(attributes.get(ALLELES), |
397 | ","); | |
398 | } | |
399 | ||
400 | /* | |
401 | * extract 'Name' for a transcript (to show gene name) | |
402 | * or an exon (so 'colour by label' shows exon boundaries) | |
403 | */ | |
404 | 15 | if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type) |
405 | || so.isA(type, SequenceOntologyI.TRANSCRIPT) | |
406 | || so.isA(type, SequenceOntologyI.EXON)) | |
407 | { | |
408 | 4 | desc = StringUtils.listToDelimitedString(attributes.get("Name"), ","); |
409 | } | |
410 | ||
411 | /* | |
412 | * if the above fails, try ID | |
413 | */ | |
414 | 15 | if (desc == null) |
415 | { | |
416 | 4 | desc = (String) sf.getValue(ID); |
417 | } | |
418 | ||
419 | /* | |
420 | * and decode comma, equals, semi-colon as required by GFF3 spec | |
421 | */ | |
422 | 15 | desc = StringUtils.urlDecode(desc, GFF_ENCODABLE); |
423 | ||
424 | 15 | return desc; |
425 | } | |
426 | } |