Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
GffHelperBase | 44 | 135 | 53 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io.gff; | |
22 | ||
23 | import jalview.analysis.SequenceIdMatcher; | |
24 | import jalview.datamodel.AlignedCodonFrame; | |
25 | import jalview.datamodel.AlignmentI; | |
26 | import jalview.datamodel.MappingType; | |
27 | import jalview.datamodel.SequenceDummy; | |
28 | import jalview.datamodel.SequenceFeature; | |
29 | import jalview.datamodel.SequenceI; | |
30 | import jalview.util.MapList; | |
31 | import jalview.util.StringUtils; | |
32 | ||
33 | import java.util.ArrayList; | |
34 | import java.util.Arrays; | |
35 | import java.util.HashMap; | |
36 | import java.util.List; | |
37 | import java.util.Map; | |
38 | import java.util.Map.Entry; | |
39 | ||
40 | /** | |
41 | * Base class with common functionality for flavours of GFF handler (GFF2 or | |
42 | * GFF3) | |
43 | */ | |
44 | public abstract class GffHelperBase implements GffHelperI | |
45 | { | |
46 | private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: "; | |
47 | ||
48 | protected static final String COMMA = ","; | |
49 | ||
50 | protected static final String EQUALS = "="; | |
51 | ||
52 | protected static final String NOTE = "Note"; | |
53 | ||
54 | /* | |
55 | * GFF columns 1-9 (zero-indexed): | |
56 | */ | |
57 | protected static final int SEQID_COL = 0; | |
58 | ||
59 | protected static final int SOURCE_COL = 1; | |
60 | ||
61 | protected static final int TYPE_COL = 2; | |
62 | ||
63 | protected static final int START_COL = 3; | |
64 | ||
65 | protected static final int END_COL = 4; | |
66 | ||
67 | protected static final int SCORE_COL = 5; | |
68 | ||
69 | protected static final int STRAND_COL = 6; | |
70 | ||
71 | protected static final int PHASE_COL = 7; | |
72 | ||
73 | protected static final int ATTRIBUTES_COL = 8; | |
74 | ||
75 | private AlignmentI lastmatchedAl = null; | |
76 | ||
77 | private SequenceIdMatcher matcher = null; | |
78 | ||
79 | /** | |
80 | * Constructs and returns a mapping, or null if data appear invalid | |
81 | * | |
82 | * @param fromStart | |
83 | * @param fromEnd | |
84 | * @param toStart | |
85 | * @param toEnd | |
86 | * @param mappingType | |
87 | * type of mapping (e.g. protein to nucleotide) | |
88 | * @return | |
89 | */ | |
90 | 19 | protected MapList constructMappingFromAlign(int fromStart, int fromEnd, |
91 | int toStart, int toEnd, MappingType mappingType) | |
92 | { | |
93 | 19 | int[] from = new int[] { fromStart, fromEnd }; |
94 | 19 | int[] to = new int[] { toStart, toEnd }; |
95 | ||
96 | /* | |
97 | * Jalview always models from dna to protein, so switch values if the | |
98 | * GFF mapping is from protein to dna | |
99 | */ | |
100 | 19 | if (mappingType == MappingType.PeptideToNucleotide) |
101 | { | |
102 | 15 | int[] temp = from; |
103 | 15 | from = to; |
104 | 15 | to = temp; |
105 | 15 | mappingType = mappingType.getInverse(); |
106 | } | |
107 | ||
108 | 19 | int fromRatio = mappingType.getFromRatio(); |
109 | 19 | int toRatio = mappingType.getToRatio(); |
110 | ||
111 | /* | |
112 | * sanity check that mapped residue counts match | |
113 | * TODO understand why PASA generates such cases... | |
114 | */ | |
115 | 19 | if (!trimMapping(from, to, fromRatio, toRatio)) |
116 | { | |
117 | 0 | jalview.bin.Console.errPrintln( |
118 | "Ignoring mapping from " + Arrays.toString(from) + " to " | |
119 | + Arrays.toString(to) + " as counts don't match!"); | |
120 | 0 | return null; |
121 | } | |
122 | ||
123 | /* | |
124 | * If a codon has an intron gap, there will be contiguous 'toRanges'; | |
125 | * this is handled for us by the MapList constructor. | |
126 | * (It is not clear that exonerate ever generates this case) | |
127 | */ | |
128 | ||
129 | 19 | return new MapList(from, to, fromRatio, toRatio); |
130 | } | |
131 | ||
132 | /** | |
133 | * Checks that the 'from' and 'to' ranges have equivalent lengths. If not, | |
134 | * tries to trim the end of the longer so they do. Returns true if the | |
135 | * mappings could be made equivalent, else false. Note the range array values | |
136 | * may be modified by this method. | |
137 | * | |
138 | * @param from | |
139 | * @param to | |
140 | * @param fromRatio | |
141 | * @param toRatio | |
142 | * @return | |
143 | */ | |
144 | 36 | protected static boolean trimMapping(int[] from, int[] to, int fromRatio, |
145 | int toRatio) | |
146 | { | |
147 | 36 | int fromLength = Math.abs(from[1] - from[0]) + 1; |
148 | 36 | int toLength = Math.abs(to[1] - to[0]) + 1; |
149 | 36 | int fromOverlap = fromLength * toRatio - toLength * fromRatio; |
150 | 36 | if (fromOverlap == 0) |
151 | { | |
152 | 24 | return true; |
153 | } | |
154 | 12 | if (fromOverlap > 0 && fromOverlap % toRatio == 0) |
155 | { | |
156 | /* | |
157 | * restrict from range to make them match up | |
158 | * it's kind of arbitrary which end we truncate - here it is the end | |
159 | */ | |
160 | 6 | System.err.print( |
161 | "Truncating mapping from " + Arrays.toString(from) + " to "); | |
162 | 6 | if (from[1] > from[0]) |
163 | { | |
164 | 3 | from[1] -= fromOverlap / toRatio; |
165 | } | |
166 | else | |
167 | { | |
168 | 3 | from[1] += fromOverlap / toRatio; |
169 | } | |
170 | 6 | jalview.bin.Console.errPrintln(Arrays.toString(from)); |
171 | 6 | return true; |
172 | } | |
173 | 6 | else if (fromOverlap < 0 && fromOverlap % fromRatio == 0) |
174 | { | |
175 | 5 | fromOverlap = -fromOverlap; // > 0 |
176 | /* | |
177 | * restrict to range to make them match up | |
178 | */ | |
179 | 5 | System.err.print( |
180 | "Truncating mapping to " + Arrays.toString(to) + " to "); | |
181 | 5 | if (to[1] > to[0]) |
182 | { | |
183 | 2 | to[1] -= fromOverlap / fromRatio; |
184 | } | |
185 | else | |
186 | { | |
187 | 3 | to[1] += fromOverlap / fromRatio; |
188 | } | |
189 | 5 | jalview.bin.Console.errPrintln(Arrays.toString(to)); |
190 | 5 | return true; |
191 | } | |
192 | ||
193 | /* | |
194 | * Couldn't truncate to an exact match.. | |
195 | */ | |
196 | 1 | return false; |
197 | } | |
198 | ||
199 | /** | |
200 | * Returns a sequence matching the given id, as follows | |
201 | * <ul> | |
202 | * <li>strict matching is on exact sequence name</li> | |
203 | * <li>relaxed matching allows matching on a token within the sequence name, | |
204 | * or a dbxref</li> | |
205 | * <li>first tries to find a match in the alignment sequences</li> | |
206 | * <li>else tries to find a match in the new sequences already generated while | |
207 | * parsing the features file</li> | |
208 | * <li>else creates a new placeholder sequence, adds it to the new sequences | |
209 | * list, and returns it</li> | |
210 | * </ul> | |
211 | * | |
212 | * @param seqId | |
213 | * @param align | |
214 | * @param newseqs | |
215 | * @param relaxedIdMatching | |
216 | * | |
217 | * @return | |
218 | */ | |
219 | 16 | protected SequenceI findSequence(String seqId, AlignmentI align, |
220 | List<SequenceI> newseqs, boolean relaxedIdMatching) | |
221 | { | |
222 | 16 | if (seqId == null) |
223 | { | |
224 | 0 | return null; |
225 | } | |
226 | 16 | SequenceI match = null; |
227 | 16 | if (relaxedIdMatching) |
228 | { | |
229 | 3 | if (lastmatchedAl != align) |
230 | { | |
231 | 3 | lastmatchedAl = align; |
232 | 3 | matcher = new SequenceIdMatcher(align.getSequencesArray()); |
233 | 3 | if (newseqs != null) |
234 | { | |
235 | 3 | matcher.addAll(newseqs); |
236 | } | |
237 | } | |
238 | 3 | match = matcher.findIdMatch(seqId); |
239 | } | |
240 | else | |
241 | { | |
242 | 13 | match = align.findName(seqId, true); |
243 | 13 | if (match == null && newseqs != null) |
244 | { | |
245 | 12 | for (SequenceI m : newseqs) |
246 | { | |
247 | 3 | if (seqId.equals(m.getName())) |
248 | { | |
249 | 1 | return m; |
250 | } | |
251 | } | |
252 | } | |
253 | ||
254 | } | |
255 | 15 | if (match == null && newseqs != null) |
256 | { | |
257 | 14 | match = new SequenceDummy(seqId); |
258 | 14 | if (relaxedIdMatching) |
259 | { | |
260 | 3 | matcher.addAll(Arrays.asList(new SequenceI[] { match })); |
261 | } | |
262 | // add dummy sequence to the newseqs list | |
263 | 14 | newseqs.add(match); |
264 | } | |
265 | 15 | return match; |
266 | } | |
267 | ||
268 | /** | |
269 | * Parses the input line to a map of name / value(s) pairs. For example the | |
270 | * line | |
271 | * | |
272 | * <pre> | |
273 | * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal | |
274 | * </pre> | |
275 | * | |
276 | * if parsed with delimiter=";" and separators {' ', '='} <br> | |
277 | * would return a map with { Notes={Fe=S, Metal}, Method={manual curation, | |
278 | * prediction}, source={Pfam}} <br> | |
279 | * | |
280 | * This method supports parsing of either GFF2 format (which uses space ' ' as | |
281 | * the name/value delimiter, and allows multiple occurrences of the same | |
282 | * name), or GFF3 format (which uses '=' as the name/value delimiter, and | |
283 | * strictly does not allow repeat occurrences of the same name - but does | |
284 | * allow a comma-separated list of values). | |
285 | * <p> | |
286 | * Returns a (possibly empty) map of lists of values by attribute name. | |
287 | * | |
288 | * @param text | |
289 | * @param namesDelimiter | |
290 | * the major delimiter between name-value pairs | |
291 | * @param nameValueSeparator | |
292 | * separator used between name and value | |
293 | * @param valuesDelimiter | |
294 | * delimits a list of more than one value | |
295 | * @return | |
296 | */ | |
297 | 36 | public static Map<String, List<String>> parseNameValuePairs(String text, |
298 | String namesDelimiter, char nameValueSeparator, | |
299 | String valuesDelimiter) | |
300 | { | |
301 | 36 | Map<String, List<String>> map = new HashMap<>(); |
302 | 36 | if (text == null || text.trim().length() == 0) |
303 | { | |
304 | 2 | return map; |
305 | } | |
306 | ||
307 | /* | |
308 | * split by major delimiter (; for GFF3) | |
309 | */ | |
310 | 34 | for (String nameValuePair : text.trim().split(namesDelimiter)) |
311 | { | |
312 | 92 | nameValuePair = nameValuePair.trim(); |
313 | 92 | if (nameValuePair.length() == 0) |
314 | { | |
315 | 0 | continue; |
316 | } | |
317 | ||
318 | /* | |
319 | * find name/value separator (= for GFF3) | |
320 | */ | |
321 | 92 | int sepPos = nameValuePair.indexOf(nameValueSeparator); |
322 | 92 | if (sepPos == -1) |
323 | { | |
324 | // no name=value found | |
325 | 2 | continue; |
326 | } | |
327 | ||
328 | 90 | String name = nameValuePair.substring(0, sepPos).trim(); |
329 | 90 | String values = nameValuePair.substring(sepPos + 1).trim(); |
330 | 90 | if (values.isEmpty()) |
331 | { | |
332 | 1 | continue; |
333 | } | |
334 | ||
335 | 89 | List<String> vals = map.get(name); |
336 | 89 | if (vals == null) |
337 | { | |
338 | 82 | vals = new ArrayList<>(); |
339 | 82 | map.put(name, vals); |
340 | } | |
341 | ||
342 | /* | |
343 | * if 'values' contains more name/value separators, parse as a map | |
344 | * (nested sub-attribute values) | |
345 | */ | |
346 | 89 | if (values.indexOf(nameValueSeparator) != -1) |
347 | { | |
348 | 17 | vals.add(values); |
349 | } | |
350 | else | |
351 | { | |
352 | 72 | for (String val : values.split(valuesDelimiter)) |
353 | { | |
354 | 74 | vals.add(val); |
355 | } | |
356 | } | |
357 | } | |
358 | ||
359 | 34 | return map; |
360 | } | |
361 | ||
362 | /** | |
363 | * Constructs a SequenceFeature from the GFF column data. Subclasses may wish | |
364 | * to call this method then adjust the SequenceFeature depending on the | |
365 | * particular usage of different tools that generate GFF. | |
366 | * | |
367 | * @param gff | |
368 | * @param attributes | |
369 | * @return | |
370 | */ | |
371 | 23 | protected SequenceFeature buildSequenceFeature(String[] gff, |
372 | Map<String, List<String>> attributes) | |
373 | { | |
374 | 23 | return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes); |
375 | } | |
376 | ||
377 | /** | |
378 | * @param gff | |
379 | * @param typeColumn | |
380 | * @param group | |
381 | * @param attributes | |
382 | * @return | |
383 | */ | |
384 | 24 | protected SequenceFeature buildSequenceFeature(String[] gff, |
385 | int typeColumn, String group, | |
386 | Map<String, List<String>> attributes) | |
387 | { | |
388 | 24 | try |
389 | { | |
390 | 24 | int start = Integer.parseInt(gff[START_COL]); |
391 | 24 | int end = Integer.parseInt(gff[END_COL]); |
392 | ||
393 | /* | |
394 | * default 'score' is 0 rather than Float.NaN - see JAL-2554 | |
395 | */ | |
396 | 24 | float score = 0f; |
397 | 24 | try |
398 | { | |
399 | 24 | score = Float.parseFloat(gff[SCORE_COL]); |
400 | } catch (NumberFormatException nfe) | |
401 | { | |
402 | // e.g. '.' - leave as zero | |
403 | } | |
404 | ||
405 | 24 | SequenceFeature sf = new SequenceFeature(gff[typeColumn], |
406 | gff[SOURCE_COL], start, end, score, group); | |
407 | ||
408 | 24 | sf.setStrand(gff[STRAND_COL]); |
409 | ||
410 | 24 | sf.setPhase(gff[PHASE_COL]); |
411 | ||
412 | 24 | if (attributes != null) |
413 | { | |
414 | /* | |
415 | * Add attributes in column 9 to the sequence feature's | |
416 | * 'otherData' table; use Note as a best proxy for description; | |
417 | * decode any encoded comma, equals, semi-colon as per GFF3 spec | |
418 | */ | |
419 | 18 | for (Entry<String, List<String>> attr : attributes.entrySet()) |
420 | { | |
421 | 42 | String key = attr.getKey(); |
422 | 42 | List<String> values = attr.getValue(); |
423 | 42 | if (values.size() == 1 && values.get(0).contains(EQUALS)) |
424 | { | |
425 | /* | |
426 | * 'value' is actually nested subattributes as x=a,y=b,z=c | |
427 | */ | |
428 | 1 | Map<String, String> valueMap = parseAttributeMap(values.get(0)); |
429 | 1 | sf.setValue(key, valueMap); |
430 | } | |
431 | else | |
432 | { | |
433 | 41 | String csvValues = StringUtils.listToDelimitedString(values, |
434 | COMMA); | |
435 | 41 | csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE); |
436 | 41 | sf.setValue(key, csvValues); |
437 | 41 | if (NOTE.equals(key)) |
438 | { | |
439 | 2 | sf.setDescription(csvValues); |
440 | } | |
441 | } | |
442 | } | |
443 | } | |
444 | ||
445 | 24 | return sf; |
446 | } catch (NumberFormatException nfe) | |
447 | { | |
448 | 0 | jalview.bin.Console |
449 | .errPrintln("Invalid number in gff: " + nfe.getMessage()); | |
450 | 0 | return null; |
451 | } | |
452 | } | |
453 | ||
454 | /** | |
455 | * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map | |
456 | * of {@code key, | |
457 | * value} <br> | |
458 | * An input string like {@code a=b,c,d=e,f=g,h} is parsed to | |
459 | * | |
460 | * <pre> | |
461 | * a = "b,c" | |
462 | * d = "e" | |
463 | * f = "g,h" | |
464 | * </pre> | |
465 | * | |
466 | * @param s | |
467 | * | |
468 | * @return | |
469 | */ | |
470 | 17 | protected static Map<String, String> parseAttributeMap(String s) |
471 | { | |
472 | 17 | Map<String, String> map = new HashMap<>(); |
473 | 17 | String[] fields = s.split(EQUALS); |
474 | ||
475 | /* | |
476 | * format validation | |
477 | */ | |
478 | 16 | boolean valid = true; |
479 | 16 | if (fields.length < 2) |
480 | { | |
481 | /* | |
482 | * need at least A=B here | |
483 | */ | |
484 | 6 | valid = false; |
485 | } | |
486 | 10 | else if (fields[0].isEmpty() || fields[0].contains(COMMA)) |
487 | { | |
488 | /* | |
489 | * A,B=C is not a valid start, nor is =C | |
490 | */ | |
491 | 3 | valid = false; |
492 | } | |
493 | else | |
494 | { | |
495 | 13 | for (int i = 1; i < fields.length - 1; i++) |
496 | { | |
497 | 6 | if (fields[i].isEmpty() || !fields[i].contains(COMMA)) |
498 | { | |
499 | /* | |
500 | * intermediate tokens must include value,name | |
501 | */ | |
502 | 2 | valid = false; |
503 | } | |
504 | } | |
505 | } | |
506 | ||
507 | 16 | if (!valid) |
508 | { | |
509 | 11 | jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s); |
510 | 11 | return map; |
511 | } | |
512 | ||
513 | 5 | int i = 0; |
514 | 13 | while (i < fields.length - 1) |
515 | { | |
516 | 9 | boolean lastPair = i == fields.length - 2; |
517 | 9 | String before = fields[i]; |
518 | 9 | String after = fields[i + 1]; |
519 | ||
520 | /* | |
521 | * if 'key' looks like a,b,c then the last token is the | |
522 | * key | |
523 | */ | |
524 | 9 | String theKey = before.contains(COMMA) |
525 | ? before.substring(before.lastIndexOf(COMMA) + 1) | |
526 | : before; | |
527 | ||
528 | 9 | theKey = theKey.trim(); |
529 | 9 | if (theKey.isEmpty()) |
530 | { | |
531 | 1 | jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s); |
532 | 1 | map.clear(); |
533 | 1 | return map; |
534 | } | |
535 | ||
536 | /* | |
537 | * if 'value' looks like a,b,c then all but the last token is the value, | |
538 | * unless this is the last field (no more = to follow), in which case | |
539 | * all of it makes up the value | |
540 | */ | |
541 | 8 | String theValue = after.contains(COMMA) && !lastPair |
542 | ? after.substring(0, after.lastIndexOf(COMMA)) | |
543 | : after; | |
544 | 8 | map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE), |
545 | StringUtils.urlDecode(theValue, GFF_ENCODABLE)); | |
546 | 8 | i += 1; |
547 | } | |
548 | ||
549 | 4 | return map; |
550 | } | |
551 | ||
552 | /** | |
553 | * Returns any existing mapping held on the alignment between the given | |
554 | * dataset sequences, or a new one if none found. This is a convenience method | |
555 | * to facilitate processing multiple GFF lines that make up a single 'spliced' | |
556 | * mapping, by extending the first mapping as the others are read. | |
557 | * | |
558 | * @param align | |
559 | * @param fromSeq | |
560 | * @param toSeq | |
561 | * @return | |
562 | */ | |
563 | 16 | protected AlignedCodonFrame getMapping(AlignmentI align, |
564 | SequenceI fromSeq, SequenceI toSeq) | |
565 | { | |
566 | 16 | AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq); |
567 | 16 | if (acf == null) |
568 | { | |
569 | 15 | acf = new AlignedCodonFrame(); |
570 | } | |
571 | 16 | return acf; |
572 | } | |
573 | ||
574 | } |