1. Project Clover database Fri Dec 6 2024 13:47:14 GMT
  2. Package jalview.io.gff

File GffHelperBase.java

 

Coverage histogram

../../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

68
135
8
1
574
312
53
0.39
16.88
8
6.62

Classes

Class
Line #
Actions
GffHelperBase 44 135 53
0.9478673 94.8%
 

Contributing tests

This file is covered by 20 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.gff;
22   
23    import jalview.analysis.SequenceIdMatcher;
24    import jalview.datamodel.AlignedCodonFrame;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.MappingType;
27    import jalview.datamodel.SequenceDummy;
28    import jalview.datamodel.SequenceFeature;
29    import jalview.datamodel.SequenceI;
30    import jalview.util.MapList;
31    import jalview.util.StringUtils;
32   
33    import java.util.ArrayList;
34    import java.util.Arrays;
35    import java.util.HashMap;
36    import java.util.List;
37    import java.util.Map;
38    import java.util.Map.Entry;
39   
40    /**
41    * Base class with common functionality for flavours of GFF handler (GFF2 or
42    * GFF3)
43    */
 
44    public abstract class GffHelperBase implements GffHelperI
45    {
46    private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";
47   
48    protected static final String COMMA = ",";
49   
50    protected static final String EQUALS = "=";
51   
52    protected static final String NOTE = "Note";
53   
54    /*
55    * GFF columns 1-9 (zero-indexed):
56    */
57    protected static final int SEQID_COL = 0;
58   
59    protected static final int SOURCE_COL = 1;
60   
61    protected static final int TYPE_COL = 2;
62   
63    protected static final int START_COL = 3;
64   
65    protected static final int END_COL = 4;
66   
67    protected static final int SCORE_COL = 5;
68   
69    protected static final int STRAND_COL = 6;
70   
71    protected static final int PHASE_COL = 7;
72   
73    protected static final int ATTRIBUTES_COL = 8;
74   
75    private AlignmentI lastmatchedAl = null;
76   
77    private SequenceIdMatcher matcher = null;
78   
79    /**
80    * Constructs and returns a mapping, or null if data appear invalid
81    *
82    * @param fromStart
83    * @param fromEnd
84    * @param toStart
85    * @param toEnd
86    * @param mappingType
87    * type of mapping (e.g. protein to nucleotide)
88    * @return
89    */
 
90  19 toggle protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
91    int toStart, int toEnd, MappingType mappingType)
92    {
93  19 int[] from = new int[] { fromStart, fromEnd };
94  19 int[] to = new int[] { toStart, toEnd };
95   
96    /*
97    * Jalview always models from dna to protein, so switch values if the
98    * GFF mapping is from protein to dna
99    */
100  19 if (mappingType == MappingType.PeptideToNucleotide)
101    {
102  15 int[] temp = from;
103  15 from = to;
104  15 to = temp;
105  15 mappingType = mappingType.getInverse();
106    }
107   
108  19 int fromRatio = mappingType.getFromRatio();
109  19 int toRatio = mappingType.getToRatio();
110   
111    /*
112    * sanity check that mapped residue counts match
113    * TODO understand why PASA generates such cases...
114    */
115  19 if (!trimMapping(from, to, fromRatio, toRatio))
116    {
117  0 jalview.bin.Console.errPrintln(
118    "Ignoring mapping from " + Arrays.toString(from) + " to "
119    + Arrays.toString(to) + " as counts don't match!");
120  0 return null;
121    }
122   
123    /*
124    * If a codon has an intron gap, there will be contiguous 'toRanges';
125    * this is handled for us by the MapList constructor.
126    * (It is not clear that exonerate ever generates this case)
127    */
128   
129  19 return new MapList(from, to, fromRatio, toRatio);
130    }
131   
132    /**
133    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
134    * tries to trim the end of the longer so they do. Returns true if the
135    * mappings could be made equivalent, else false. Note the range array values
136    * may be modified by this method.
137    *
138    * @param from
139    * @param to
140    * @param fromRatio
141    * @param toRatio
142    * @return
143    */
 
144  36 toggle protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
145    int toRatio)
146    {
147  36 int fromLength = Math.abs(from[1] - from[0]) + 1;
148  36 int toLength = Math.abs(to[1] - to[0]) + 1;
149  36 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
150  36 if (fromOverlap == 0)
151    {
152  24 return true;
153    }
154  12 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
155    {
156    /*
157    * restrict the 'from' range to make them match up
158    * it's kind of arbitrary which end we truncate - here it is the end
159    */
160  6 System.err.print(
161    "Truncating mapping from " + Arrays.toString(from) + " to ");
162  6 if (from[1] > from[0])
163    {
164  3 from[1] -= fromOverlap / toRatio;
165    }
166    else
167    {
168  3 from[1] += fromOverlap / toRatio;
169    }
170  6 jalview.bin.Console.errPrintln(Arrays.toString(from));
171  6 return true;
172    }
173  6 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
174    {
175  5 fromOverlap = -fromOverlap; // > 0
176    /*
177    * restrict the 'to' range to make them match up
178    */
179  5 System.err.print(
180    "Truncating mapping to " + Arrays.toString(to) + " to ");
181  5 if (to[1] > to[0])
182    {
183  2 to[1] -= fromOverlap / fromRatio;
184    }
185    else
186    {
187  3 to[1] += fromOverlap / fromRatio;
188    }
189  5 jalview.bin.Console.errPrintln(Arrays.toString(to));
190  5 return true;
191    }
192   
193    /*
194    * Couldn't truncate to an exact match..
195    */
196  1 return false;
197    }
198   
199    /**
200    * Returns a sequence matching the given id, as follows
201    * <ul>
202    * <li>strict matching is on exact sequence name</li>
203    * <li>relaxed matching allows matching on a token within the sequence name,
204    * or a dbxref</li>
205    * <li>first tries to find a match in the alignment sequences</li>
206    * <li>else tries to find a match in the new sequences already generated while
207    * parsing the features file</li>
208    * <li>else creates a new placeholder sequence, adds it to the new sequences
209    * list, and returns it</li>
210    * </ul>
211    *
212    * @param seqId
213    * @param align
214    * @param newseqs
215    * @param relaxedIdMatching
216    *
217    * @return
218    */
 
219  16 toggle protected SequenceI findSequence(String seqId, AlignmentI align,
220    List<SequenceI> newseqs, boolean relaxedIdMatching)
221    {
222  16 if (seqId == null)
223    {
224  0 return null;
225    }
226  16 SequenceI match = null;
227  16 if (relaxedIdMatching)
228    {
229  3 if (lastmatchedAl != align)
230    {
231  3 lastmatchedAl = align;
232  3 matcher = new SequenceIdMatcher(align.getSequencesArray());
233  3 if (newseqs != null)
234    {
235  3 matcher.addAll(newseqs);
236    }
237    }
238  3 match = matcher.findIdMatch(seqId);
239    }
240    else
241    {
242  13 match = align.findName(seqId, true);
243  13 if (match == null && newseqs != null)
244    {
245  12 for (SequenceI m : newseqs)
246    {
247  3 if (seqId.equals(m.getName()))
248    {
249  1 return m;
250    }
251    }
252    }
253   
254    }
255  15 if (match == null && newseqs != null)
256    {
257  14 match = new SequenceDummy(seqId);
258  14 if (relaxedIdMatching)
259    {
260  3 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
261    }
262    // add dummy sequence to the newseqs list
263  14 newseqs.add(match);
264    }
265  15 return match;
266    }
267   
268    /**
269    * Parses the input line to a map of name / value(s) pairs. For example the
270    * line
271    *
272    * <pre>
273    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
274    * </pre>
275    *
276    * if parsed with delimiter=";", separator '=' and values delimiter "," <br>
277    * would return a map with { Notes={Fe-S, Metal}, Method={manual curation,
278    * prediction}, source={Pfam}} <br>
279    *
280    * This method supports parsing of either GFF2 format (which uses space ' ' as
281    * the name/value delimiter, and allows multiple occurrences of the same
282    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
283    * strictly does not allow repeat occurrences of the same name - but does
284    * allow a comma-separated list of values).
285    * <p>
286    * Returns a (possibly empty) map of lists of values by attribute name.
287    *
288    * @param text
289    * @param namesDelimiter
290    * the major delimiter between name-value pairs
291    * @param nameValueSeparator
292    * separator used between name and value
293    * @param valuesDelimiter
294    * delimits a list of more than one value
295    * @return
296    */
 
297  36 toggle public static Map<String, List<String>> parseNameValuePairs(String text,
298    String namesDelimiter, char nameValueSeparator,
299    String valuesDelimiter)
300    {
301  36 Map<String, List<String>> map = new HashMap<>();
302  36 if (text == null || text.trim().length() == 0)
303    {
304  2 return map;
305    }
306   
307    /*
308    * split by major delimiter (; for GFF3)
309    */
310  34 for (String nameValuePair : text.trim().split(namesDelimiter))
311    {
312  92 nameValuePair = nameValuePair.trim();
313  92 if (nameValuePair.length() == 0)
314    {
315  0 continue;
316    }
317   
318    /*
319    * find name/value separator (= for GFF3)
320    */
321  92 int sepPos = nameValuePair.indexOf(nameValueSeparator);
322  92 if (sepPos == -1)
323    {
324    // no name=value found
325  2 continue;
326    }
327   
328  90 String name = nameValuePair.substring(0, sepPos).trim();
329  90 String values = nameValuePair.substring(sepPos + 1).trim();
330  90 if (values.isEmpty())
331    {
332  1 continue;
333    }
334   
335  89 List<String> vals = map.get(name);
336  89 if (vals == null)
337    {
338  82 vals = new ArrayList<>();
339  82 map.put(name, vals);
340    }
341   
342    /*
343    * if 'values' contains more name/value separators, keep it unsplit as
344    * a single value (nested sub-attribute values)
345    */
346  89 if (values.indexOf(nameValueSeparator) != -1)
347    {
348  17 vals.add(values);
349    }
350    else
351    {
352  72 for (String val : values.split(valuesDelimiter))
353    {
354  74 vals.add(val);
355    }
356    }
357    }
358   
359  34 return map;
360    }
361   
362    /**
363    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
364    * to call this method then adjust the SequenceFeature depending on the
365    * particular usage of different tools that generate GFF.
366    *
367    * @param gff
368    * @param attributes
369    * @return
370    */
 
371  23 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
372    Map<String, List<String>> attributes)
373    {
374  23 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
375    }
376   
377    /**
378    * @param gff
379    * @param typeColumn
380    * @param group
381    * @param attributes
382    * @return
383    */
 
384  24 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
385    int typeColumn, String group,
386    Map<String, List<String>> attributes)
387    {
388  24 try
389    {
390  24 int start = Integer.parseInt(gff[START_COL]);
391  24 int end = Integer.parseInt(gff[END_COL]);
392   
393    /*
394    * default 'score' is 0 rather than Float.NaN - see JAL-2554
395    */
396  24 float score = 0f;
397  24 try
398    {
399  24 score = Float.parseFloat(gff[SCORE_COL]);
400    } catch (NumberFormatException nfe)
401    {
402    // e.g. '.' - leave as zero
403    }
404   
405  24 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
406    gff[SOURCE_COL], start, end, score, group);
407   
408  24 sf.setStrand(gff[STRAND_COL]);
409   
410  24 sf.setPhase(gff[PHASE_COL]);
411   
412  24 if (attributes != null)
413    {
414    /*
415    * Add attributes in column 9 to the sequence feature's
416    * 'otherData' table; use Note as a best proxy for description;
417    * decode any encoded comma, equals, semi-colon as per GFF3 spec
418    */
419  18 for (Entry<String, List<String>> attr : attributes.entrySet())
420    {
421  42 String key = attr.getKey();
422  42 List<String> values = attr.getValue();
423  42 if (values.size() == 1 && values.get(0).contains(EQUALS))
424    {
425    /*
426    * 'value' is actually nested subattributes as x=a,y=b,z=c
427    */
428  1 Map<String, String> valueMap = parseAttributeMap(values.get(0));
429  1 sf.setValue(key, valueMap);
430    }
431    else
432    {
433  41 String csvValues = StringUtils.listToDelimitedString(values,
434    COMMA);
435  41 csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
436  41 sf.setValue(key, csvValues);
437  41 if (NOTE.equals(key))
438    {
439  2 sf.setDescription(csvValues);
440    }
441    }
442    }
443    }
444   
445  24 return sf;
446    } catch (NumberFormatException nfe)
447    {
448  0 jalview.bin.Console
449    .errPrintln("Invalid number in gff: " + nfe.getMessage());
450  0 return null;
451    }
452    }
453   
454    /**
455    * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
456    * of {@code key,
457    * value} <br>
458    * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
459    *
460    * <pre>
461    * a = "b,c"
462    * d = "e"
463    * f = "g,h"
464    * </pre>
465    *
466    * @param s
467    *
468    * @return
469    */
 
470  17 toggle protected static Map<String, String> parseAttributeMap(String s)
471    {
472  17 Map<String, String> map = new HashMap<>();
473  17 String[] fields = s.split(EQUALS);
474   
475    /*
476    * format validation
477    */
478  16 boolean valid = true;
479  16 if (fields.length < 2)
480    {
481    /*
482    * need at least A=B here
483    */
484  6 valid = false;
485    }
486  10 else if (fields[0].isEmpty() || fields[0].contains(COMMA))
487    {
488    /*
489    * A,B=C is not a valid start, nor is =C
490    */
491  3 valid = false;
492    }
493    else
494    {
495  13 for (int i = 1; i < fields.length - 1; i++)
496    {
497  6 if (fields[i].isEmpty() || !fields[i].contains(COMMA))
498    {
499    /*
500    * intermediate tokens must include value,name
501    */
502  2 valid = false;
503    }
504    }
505    }
506   
507  16 if (!valid)
508    {
509  11 jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
510  11 return map;
511    }
512   
513  5 int i = 0;
514  13 while (i < fields.length - 1)
515    {
516  9 boolean lastPair = i == fields.length - 2;
517  9 String before = fields[i];
518  9 String after = fields[i + 1];
519   
520    /*
521    * if 'key' looks like a,b,c then the last token is the
522    * key
523    */
524  9 String theKey = before.contains(COMMA)
525    ? before.substring(before.lastIndexOf(COMMA) + 1)
526    : before;
527   
528  9 theKey = theKey.trim();
529  9 if (theKey.isEmpty())
530    {
531  1 jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
532  1 map.clear();
533  1 return map;
534    }
535   
536    /*
537    * if 'value' looks like a,b,c then all but the last token is the value,
538    * unless this is the last field (no more = to follow), in which case
539    * all of it makes up the value
540    */
541  8 String theValue = after.contains(COMMA) && !lastPair
542    ? after.substring(0, after.lastIndexOf(COMMA))
543    : after;
544  8 map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
545    StringUtils.urlDecode(theValue, GFF_ENCODABLE));
546  8 i += 1;
547    }
548   
549  4 return map;
550    }
551   
552    /**
553    * Returns any existing mapping held on the alignment between the given
554    * dataset sequences, or a new one if none found. This is a convenience method
555    * to facilitate processing multiple GFF lines that make up a single 'spliced'
556    * mapping, by extending the first mapping as the others are read.
557    *
558    * @param align
559    * @param fromSeq
560    * @param toSeq
561    * @return
562    */
 
563  16 toggle protected AlignedCodonFrame getMapping(AlignmentI align,
564    SequenceI fromSeq, SequenceI toSeq)
565    {
566  16 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
567  16 if (acf == null)
568    {
569  15 acf = new AlignedCodonFrame();
570    }
571  16 return acf;
572    }
573   
574    }