Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.io.gff

File GffHelperBase.java

 

Coverage histogram

../../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

68
135
8
1
571
309
53
0.39
16.88
8
6.62

Classes

Class Line # Actions
GffHelperBase 44 135 53
0.947867394.8%
 

Contributing tests

This file is covered by 20 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.gff;
22   
23    import jalview.analysis.SequenceIdMatcher;
24    import jalview.datamodel.AlignedCodonFrame;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.MappingType;
27    import jalview.datamodel.SequenceDummy;
28    import jalview.datamodel.SequenceFeature;
29    import jalview.datamodel.SequenceI;
30    import jalview.util.MapList;
31    import jalview.util.StringUtils;
32   
33    import java.util.ArrayList;
34    import java.util.Arrays;
35    import java.util.HashMap;
36    import java.util.List;
37    import java.util.Map;
38    import java.util.Map.Entry;
39   
40    /**
41    * Base class with common functionality for flavours of GFF handler (GFF2 or
42    * GFF3)
43    */
 
44    public abstract class GffHelperBase implements GffHelperI
45    {
46    private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";
47   
48    protected static final String COMMA = ",";
49   
50    protected static final String EQUALS = "=";
51   
52    protected static final String NOTE = "Note";
53   
54    /*
55    * GFF columns 1-9 (zero-indexed):
56    */
57    protected static final int SEQID_COL = 0;
58   
59    protected static final int SOURCE_COL = 1;
60   
61    protected static final int TYPE_COL = 2;
62   
63    protected static final int START_COL = 3;
64   
65    protected static final int END_COL = 4;
66   
67    protected static final int SCORE_COL = 5;
68   
69    protected static final int STRAND_COL = 6;
70   
71    protected static final int PHASE_COL = 7;
72   
73    protected static final int ATTRIBUTES_COL = 8;
74   
75    private AlignmentI lastmatchedAl = null;
76   
77    private SequenceIdMatcher matcher = null;
78   
79    /**
80    * Constructs and returns a mapping, or null if data appear invalid
81    *
82    * @param fromStart
83    * @param fromEnd
84    * @param toStart
85    * @param toEnd
86    * @param mappingType
87    * type of mapping (e.g. protein to nucleotide)
88    * @return
89    */
 
90  19 toggle protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
91    int toStart, int toEnd, MappingType mappingType)
92    {
93  19 int[] from = new int[] { fromStart, fromEnd };
94  19 int[] to = new int[] { toStart, toEnd };
95   
96    /*
97    * Jalview always models from dna to protein, so switch values if the
98    * GFF mapping is from protein to dna
99    */
100  19 if (mappingType == MappingType.PeptideToNucleotide)
101    {
102  15 int[] temp = from;
103  15 from = to;
104  15 to = temp;
105  15 mappingType = mappingType.getInverse();
106    }
107   
108  19 int fromRatio = mappingType.getFromRatio();
109  19 int toRatio = mappingType.getToRatio();
110   
111    /*
112    * sanity check that mapped residue counts match
113    * TODO understand why PASA generates such cases...
114    */
115  19 if (!trimMapping(from, to, fromRatio, toRatio))
116    {
117  0 System.err.println("Ignoring mapping from " + Arrays.toString(from)
118    + " to " + Arrays.toString(to) + " as counts don't match!");
119  0 return null;
120    }
121   
122    /*
123    * If a codon has an intron gap, there will be contiguous 'toRanges';
124    * this is handled for us by the MapList constructor.
125    * (It is not clear that exonerate ever generates this case)
126    */
127   
128  19 return new MapList(from, to, fromRatio, toRatio);
129    }
130   
131    /**
132    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
133    * tries to trim the end of the longer so they do. Returns true if the
134    * mappings could be made equivalent, else false. Note the range array values
135    * may be modified by this method.
136    *
137    * @param from
138    * @param to
139    * @param fromRatio
140    * @param toRatio
141    * @return
142    */
 
143  36 toggle protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
144    int toRatio)
145    {
146  36 int fromLength = Math.abs(from[1] - from[0]) + 1;
147  36 int toLength = Math.abs(to[1] - to[0]) + 1;
148  36 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
149  36 if (fromOverlap == 0)
150    {
151  24 return true;
152    }
153  12 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
154    {
155    /*
156    * restrict from range to make them match up
157    * it's kind of arbitrary which end we truncate - here it is the end
158    */
159  6 System.err.print(
160    "Truncating mapping from " + Arrays.toString(from) + " to ");
161  6 if (from[1] > from[0])
162    {
163  3 from[1] -= fromOverlap / toRatio;
164    }
165    else
166    {
167  3 from[1] += fromOverlap / toRatio;
168    }
169  6 System.err.println(Arrays.toString(from));
170  6 return true;
171    }
172  6 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
173    {
174  5 fromOverlap = -fromOverlap; // > 0
175    /*
176    * restrict to range to make them match up
177    */
178  5 System.err.print(
179    "Truncating mapping to " + Arrays.toString(to) + " to ");
180  5 if (to[1] > to[0])
181    {
182  2 to[1] -= fromOverlap / fromRatio;
183    }
184    else
185    {
186  3 to[1] += fromOverlap / fromRatio;
187    }
188  5 System.err.println(Arrays.toString(to));
189  5 return true;
190    }
191   
192    /*
193    * Couldn't truncate to an exact match..
194    */
195  1 return false;
196    }
197   
198    /**
199    * Returns a sequence matching the given id, as follows
200    * <ul>
201    * <li>strict matching is on exact sequence name</li>
202    * <li>relaxed matching allows matching on a token within the sequence name,
203    * or a dbxref</li>
204    * <li>first tries to find a match in the alignment sequences</li>
205    * <li>else tries to find a match in the new sequences already generated while
206    * parsing the features file</li>
207    * <li>else creates a new placeholder sequence, adds it to the new sequences
208    * list, and returns it</li>
209    * </ul>
210    *
211    * @param seqId
212    * @param align
213    * @param newseqs
214    * @param relaxedIdMatching
215    *
216    * @return
217    */
 
218  16 toggle protected SequenceI findSequence(String seqId, AlignmentI align,
219    List<SequenceI> newseqs, boolean relaxedIdMatching)
220    {
221  16 if (seqId == null)
222    {
223  0 return null;
224    }
225  16 SequenceI match = null;
226  16 if (relaxedIdMatching)
227    {
228  3 if (lastmatchedAl != align)
229    {
230  3 lastmatchedAl = align;
231  3 matcher = new SequenceIdMatcher(align.getSequencesArray());
232  3 if (newseqs != null)
233    {
234  3 matcher.addAll(newseqs);
235    }
236    }
237  3 match = matcher.findIdMatch(seqId);
238    }
239    else
240    {
241  13 match = align.findName(seqId, true);
242  13 if (match == null && newseqs != null)
243    {
244  12 for (SequenceI m : newseqs)
245    {
246  3 if (seqId.equals(m.getName()))
247    {
248  1 return m;
249    }
250    }
251    }
252   
253    }
254  15 if (match == null && newseqs != null)
255    {
256  14 match = new SequenceDummy(seqId);
257  14 if (relaxedIdMatching)
258    {
259  3 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
260    }
261    // add dummy sequence to the newseqs list
262  14 newseqs.add(match);
263    }
264  15 return match;
265    }
266   
267    /**
268    * Parses the input line to a map of name / value(s) pairs. For example the
269    * line
270    *
271    * <pre>
272    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
273    * </pre>
274    *
275    * if parsed with delimiter=";" and separators {' ', '='} <br>
276    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
277    * prediction}, source={Pfam}} <br>
278    *
279    * This method supports parsing of either GFF2 format (which uses space ' ' as
280    * the name/value delimiter, and allows multiple occurrences of the same
281    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
282    * strictly does not allow repeat occurrences of the same name - but does
283    * allow a comma-separated list of values).
284    * <p>
285    * Returns a (possibly empty) map of lists of values by attribute name.
286    *
287    * @param text
288    * @param namesDelimiter
289    * the major delimiter between name-value pairs
290    * @param nameValueSeparator
291    * separator used between name and value
292    * @param valuesDelimiter
293    * delimits a list of more than one value
294    * @return
295    */
 
296  36 toggle public static Map<String, List<String>> parseNameValuePairs(String text,
297    String namesDelimiter, char nameValueSeparator,
298    String valuesDelimiter)
299    {
300  36 Map<String, List<String>> map = new HashMap<>();
301  36 if (text == null || text.trim().length() == 0)
302    {
303  2 return map;
304    }
305   
306    /*
307    * split by major delimiter (; for GFF3)
308    */
309  34 for (String nameValuePair : text.trim().split(namesDelimiter))
310    {
311  92 nameValuePair = nameValuePair.trim();
312  92 if (nameValuePair.length() == 0)
313    {
314  0 continue;
315    }
316   
317    /*
318    * find name/value separator (= for GFF3)
319    */
320  92 int sepPos = nameValuePair.indexOf(nameValueSeparator);
321  92 if (sepPos == -1)
322    {
323    // no name=value found
324  2 continue;
325    }
326   
327  90 String name = nameValuePair.substring(0, sepPos).trim();
328  90 String values = nameValuePair.substring(sepPos + 1).trim();
329  90 if (values.isEmpty())
330    {
331  1 continue;
332    }
333   
334  89 List<String> vals = map.get(name);
335  89 if (vals == null)
336    {
337  82 vals = new ArrayList<>();
338  82 map.put(name, vals);
339    }
340   
341    /*
342    * if 'values' contains more name/value separators, parse as a map
343    * (nested sub-attribute values)
344    */
345  89 if (values.indexOf(nameValueSeparator) != -1)
346    {
347  17 vals.add(values);
348    }
349    else
350    {
351  72 for (String val : values.split(valuesDelimiter))
352    {
353  74 vals.add(val);
354    }
355    }
356    }
357   
358  34 return map;
359    }
360   
361    /**
362    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
363    * to call this method then adjust the SequenceFeature depending on the
364    * particular usage of different tools that generate GFF.
365    *
366    * @param gff
367    * @param attributes
368    * @return
369    */
 
370  23 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
371    Map<String, List<String>> attributes)
372    {
373  23 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
374    }
375   
376    /**
377    * @param gff
378    * @param typeColumn
379    * @param group
380    * @param attributes
381    * @return
382    */
 
383  24 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
384    int typeColumn, String group, Map<String, List<String>> attributes)
385    {
386  24 try
387    {
388  24 int start = Integer.parseInt(gff[START_COL]);
389  24 int end = Integer.parseInt(gff[END_COL]);
390   
391    /*
392    * default 'score' is 0 rather than Float.NaN - see JAL-2554
393    */
394  24 float score = 0f;
395  24 try
396    {
397  24 score = Float.parseFloat(gff[SCORE_COL]);
398    } catch (NumberFormatException nfe)
399    {
400    // e.g. '.' - leave as zero
401    }
402   
403  24 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
404    gff[SOURCE_COL], start, end, score, group);
405   
406  24 sf.setStrand(gff[STRAND_COL]);
407   
408  24 sf.setPhase(gff[PHASE_COL]);
409   
410  24 if (attributes != null)
411    {
412    /*
413    * Add attributes in column 9 to the sequence feature's
414    * 'otherData' table; use Note as a best proxy for description;
415    * decode any encoded comma, equals, semi-colon as per GFF3 spec
416    */
417  18 for (Entry<String, List<String>> attr : attributes.entrySet())
418    {
419  42 String key = attr.getKey();
420  42 List<String> values = attr.getValue();
421  42 if (values.size() == 1 && values.get(0).contains(EQUALS))
422    {
423    /*
424    * 'value' is actually nested subattributes as x=a,y=b,z=c
425    */
426  1 Map<String, String> valueMap = parseAttributeMap(values.get(0));
427  1 sf.setValue(key, valueMap);
428    }
429    else
430    {
431  41 String csvValues = StringUtils.listToDelimitedString(values,
432    COMMA);
433  41 csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
434  41 sf.setValue(key, csvValues);
435  41 if (NOTE.equals(key))
436    {
437  2 sf.setDescription(csvValues);
438    }
439    }
440    }
441    }
442   
443  24 return sf;
444    } catch (NumberFormatException nfe)
445    {
446  0 System.err.println("Invalid number in gff: " + nfe.getMessage());
447  0 return null;
448    }
449    }
450   
451    /**
452    * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
453    * of {@code key,
454    * value} <br>
455    * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
456    *
457    * <pre>
458    * a = "b,c"
459    * d = "e"
460    * f = "g,h"
461    * </pre>
462    *
463    * @param s
464    *
465    * @return
466    */
 
467  17 toggle protected static Map<String, String> parseAttributeMap(String s)
468    {
469  17 Map<String, String> map = new HashMap<>();
470  17 String[] fields = s.split(EQUALS);
471   
472    /*
473    * format validation
474    */
475  16 boolean valid = true;
476  16 if (fields.length < 2)
477    {
478    /*
479    * need at least A=B here
480    */
481  6 valid = false;
482    }
483  10 else if (fields[0].isEmpty() || fields[0].contains(COMMA))
484    {
485    /*
486    * A,B=C is not a valid start, nor is =C
487    */
488  3 valid = false;
489    }
490    else
491    {
492  13 for (int i = 1; i < fields.length - 1; i++)
493    {
494  6 if (fields[i].isEmpty() || !fields[i].contains(COMMA))
495    {
496    /*
497    * intermediate tokens must include value,name
498    */
499  2 valid = false;
500    }
501    }
502    }
503   
504  16 if (!valid)
505    {
506  11 System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s);
507  11 return map;
508    }
509   
510  5 int i = 0;
511  13 while (i < fields.length - 1)
512    {
513  9 boolean lastPair = i == fields.length - 2;
514  9 String before = fields[i];
515  9 String after = fields[i + 1];
516   
517    /*
518    * if 'key' looks like a,b,c then the last token is the
519    * key
520    */
521  9 String theKey = before.contains(COMMA)
522    ? before.substring(before.lastIndexOf(COMMA) + 1)
523    : before;
524   
525  9 theKey = theKey.trim();
526  9 if (theKey.isEmpty())
527    {
528  1 System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s);
529  1 map.clear();
530  1 return map;
531    }
532   
533    /*
534    * if 'value' looks like a,b,c then all but the last token is the value,
535    * unless this is the last field (no more = to follow), in which case
536    * all of it makes up the value
537    */
538  8 String theValue = after.contains(COMMA) && !lastPair
539    ? after.substring(0, after.lastIndexOf(COMMA))
540    : after;
541  8 map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
542    StringUtils.urlDecode(theValue, GFF_ENCODABLE));
543  8 i += 1;
544    }
545   
546  4 return map;
547    }
548   
549    /**
550    * Returns any existing mapping held on the alignment between the given
551    * dataset sequences, or a new one if none found. This is a convenience method
552    * to facilitate processing multiple GFF lines that make up a single 'spliced'
553    * mapping, by extending the first mapping as the others are read.
554    *
555    * @param align
556    * @param fromSeq
557    * @param toSeq
558    * @return
559    */
 
560  16 toggle protected AlignedCodonFrame getMapping(AlignmentI align,
561    SequenceI fromSeq, SequenceI toSeq)
562    {
563  16 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
564  16 if (acf == null)
565    {
566  15 acf = new AlignedCodonFrame();
567    }
568  16 return acf;
569    }
570   
571    }