Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.io.gff

File GffHelperBase.java

 

Coverage histogram

../../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

46
99
7
1
440
236
37
0.37
14.14
7
5.29

Classes

Class Line # Actions
GffHelperBase 44 99 37 11
0.92763156 92.8%
 

Contributing tests

This file is covered by 19 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.gff;
22   
23    import jalview.analysis.SequenceIdMatcher;
24    import jalview.datamodel.AlignedCodonFrame;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.MappingType;
27    import jalview.datamodel.SequenceDummy;
28    import jalview.datamodel.SequenceFeature;
29    import jalview.datamodel.SequenceI;
30    import jalview.util.MapList;
31    import jalview.util.StringUtils;
32   
33    import java.util.ArrayList;
34    import java.util.Arrays;
35    import java.util.HashMap;
36    import java.util.List;
37    import java.util.Map;
38    import java.util.Map.Entry;
39   
40    /**
41    * Base class with common functionality for flavours of GFF handler (GFF2 or
42    * GFF3)
43    */
 
44    public abstract class GffHelperBase implements GffHelperI
45    {
46    private static final String NOTE = "Note";
47   
48    /*
49    * GFF columns 1-9 (zero-indexed):
50    */
51    protected static final int SEQID_COL = 0;
52   
53    protected static final int SOURCE_COL = 1;
54   
55    protected static final int TYPE_COL = 2;
56   
57    protected static final int START_COL = 3;
58   
59    protected static final int END_COL = 4;
60   
61    protected static final int SCORE_COL = 5;
62   
63    protected static final int STRAND_COL = 6;
64   
65    protected static final int PHASE_COL = 7;
66   
67    protected static final int ATTRIBUTES_COL = 8;
68   
// Alignment for which 'matcher' was last built; findSequence compares the
// alignment it is given against this reference (by identity) and rebuilds the
// matcher when they differ.
69    private AlignmentI lastmatchedAl = null;
70   
// Id matcher used by findSequence when relaxed id matching is requested; it is
// (re)created there whenever the alignment passed in differs from
// lastmatchedAl, and placeholder sequences created in relaxed mode are added
// to it.
71    private SequenceIdMatcher matcher = null;
72   
73    /**
74    * Constructs and returns a mapping, or null if data appear invalid
75    *
76    * @param fromStart
77    * @param fromEnd
78    * @param toStart
79    * @param toEnd
80    * @param mappingType
81    * type of mapping (e.g. protein to nucleotide)
82    * @return
83    */
 
84  19 toggle protected MapList constructMappingFromAlign(int fromStart, int fromEnd,
85    int toStart, int toEnd, MappingType mappingType)
86    {
87  19 int[] from = new int[] { fromStart, fromEnd };
88  19 int[] to = new int[] { toStart, toEnd };
89   
90    /*
91    * Jalview always models from dna to protein, so switch values if the
92    * GFF mapping is from protein to dna
93    */
94  19 if (mappingType == MappingType.PeptideToNucleotide)
95    {
96  15 int[] temp = from;
97  15 from = to;
98  15 to = temp;
99  15 mappingType = mappingType.getInverse();
100    }
101   
102  19 int fromRatio = mappingType.getFromRatio();
103  19 int toRatio = mappingType.getToRatio();
104   
105    /*
106    * sanity check that mapped residue counts match
107    * TODO understand why PASA generates such cases...
108    */
109  19 if (!trimMapping(from, to, fromRatio, toRatio))
110    {
111  0 System.err.println("Ignoring mapping from " + Arrays.toString(from)
112    + " to " + Arrays.toString(to) + " as counts don't match!");
113  0 return null;
114    }
115   
116    /*
117    * If a codon has an intron gap, there will be contiguous 'toRanges';
118    * this is handled for us by the MapList constructor.
119    * (It is not clear that exonerate ever generates this case)
120    */
121   
122  19 return new MapList(from, to, fromRatio, toRatio);
123    }
124   
125    /**
126    * Checks that the 'from' and 'to' ranges have equivalent lengths. If not,
127    * tries to trim the end of the longer so they do. Returns true if the
128    * mappings could be made equivalent, else false. Note the range array values
129    * may be modified by this method.
130    *
131    * @param from
132    * @param to
133    * @param fromRatio
134    * @param toRatio
135    * @return
136    */
 
137  36 toggle protected static boolean trimMapping(int[] from, int[] to, int fromRatio,
138    int toRatio)
139    {
140  36 int fromLength = Math.abs(from[1] - from[0]) + 1;
141  36 int toLength = Math.abs(to[1] - to[0]) + 1;
142  36 int fromOverlap = fromLength * toRatio - toLength * fromRatio;
143  36 if (fromOverlap == 0)
144    {
145  24 return true;
146    }
147  12 if (fromOverlap > 0 && fromOverlap % toRatio == 0)
148    {
149    /*
150    * restrict from range to make them match up
151    * it's kind of arbitrary which end we truncate - here it is the end
152    */
153  6 System.err.print(
154    "Truncating mapping from " + Arrays.toString(from) + " to ");
155  6 if (from[1] > from[0])
156    {
157  3 from[1] -= fromOverlap / toRatio;
158    }
159    else
160    {
161  3 from[1] += fromOverlap / toRatio;
162    }
163  6 System.err.println(Arrays.toString(from));
164  6 return true;
165    }
166  6 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
167    {
168  5 fromOverlap = -fromOverlap; // > 0
169    /*
170    * restrict to range to make them match up
171    */
172  5 System.err.print(
173    "Truncating mapping to " + Arrays.toString(to) + " to ");
174  5 if (to[1] > to[0])
175    {
176  2 to[1] -= fromOverlap / fromRatio;
177    }
178    else
179    {
180  3 to[1] += fromOverlap / fromRatio;
181    }
182  5 System.err.println(Arrays.toString(to));
183  5 return true;
184    }
185   
186    /*
187    * Couldn't truncate to an exact match..
188    */
189  1 return false;
190    }
191   
192    /**
193    * Returns a sequence matching the given id, as follows
194    * <ul>
195    * <li>strict matching is on exact sequence name</li>
196    * <li>relaxed matching allows matching on a token within the sequence name,
197    * or a dbxref</li>
198    * <li>first tries to find a match in the alignment sequences</li>
199    * <li>else tries to find a match in the new sequences already generated while
200    * parsing the features file</li>
201    * <li>else creates a new placeholder sequence, adds it to the new sequences
202    * list, and returns it</li>
203    * </ul>
204    *
205    * @param seqId
206    * @param align
207    * @param newseqs
208    * @param relaxedIdMatching
209    *
210    * @return
211    */
 
212  16 toggle protected SequenceI findSequence(String seqId, AlignmentI align,
213    List<SequenceI> newseqs, boolean relaxedIdMatching)
214    {
215  16 if (seqId == null)
216    {
217  0 return null;
218    }
219  16 SequenceI match = null;
220  16 if (relaxedIdMatching)
221    {
222  3 if (lastmatchedAl != align)
223    {
224  3 lastmatchedAl = align;
225  3 matcher = new SequenceIdMatcher(align.getSequencesArray());
226  3 if (newseqs != null)
227    {
228  3 matcher.addAll(newseqs);
229    }
230    }
231  3 match = matcher.findIdMatch(seqId);
232    }
233    else
234    {
235  13 match = align.findName(seqId, true);
236  13 if (match == null && newseqs != null)
237    {
238  12 for (SequenceI m : newseqs)
239    {
240  3 if (seqId.equals(m.getName()))
241    {
242  1 return m;
243    }
244    }
245    }
246   
247    }
248  15 if (match == null && newseqs != null)
249    {
250  14 match = new SequenceDummy(seqId);
251  14 if (relaxedIdMatching)
252    {
253  3 matcher.addAll(Arrays.asList(new SequenceI[] { match }));
254    }
255    // add dummy sequence to the newseqs list
256  14 newseqs.add(match);
257    }
258  15 return match;
259    }
260   
261    /**
262    * Parses the input line to a map of name / value(s) pairs. For example the
263    * line <br>
264    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
265    * <br>
266    * if parsed with delimiter=";" and separators {' ', '='} <br>
267    * would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
268    * prediction}, source={Pfam}} <br>
269    *
270    * This method supports parsing of either GFF2 format (which uses space ' ' as
271    * the name/value delimiter, and allows multiple occurrences of the same
272    * name), or GFF3 format (which uses '=' as the name/value delimiter, and
273    * strictly does not allow repeat occurrences of the same name - but does
274    * allow a comma-separated list of values).
275    *
276    * @param text
277    * @param namesDelimiter
278    * the major delimiter between name-value pairs
279    * @param nameValueSeparator
280    * one or more separators used between name and value
281    * @param valuesDelimiter
282    * delimits a list of more than one value
283    * @return the name-values map (which may be empty but never null)
284    */
 
285  35 toggle public static Map<String, List<String>> parseNameValuePairs(String text,
286    String namesDelimiter, char nameValueSeparator,
287    String valuesDelimiter)
288    {
289  35 Map<String, List<String>> map = new HashMap<String, List<String>>();
290  35 if (text == null || text.trim().length() == 0)
291    {
292  2 return map;
293    }
294   
295  33 for (String pair : text.trim().split(namesDelimiter))
296    {
297  89 pair = pair.trim();
298  89 if (pair.length() == 0)
299    {
300  0 continue;
301    }
302   
303  89 int sepPos = pair.indexOf(nameValueSeparator);
304  89 if (sepPos == -1)
305    {
306    // no name=value present
307  2 continue;
308    }
309   
310  87 String key = pair.substring(0, sepPos).trim();
311  87 String values = pair.substring(sepPos + 1).trim();
312  87 if (values.length() > 0)
313    {
314  86 List<String> vals = map.get(key);
315  86 if (vals == null)
316    {
317  79 vals = new ArrayList<String>();
318  79 map.put(key, vals);
319    }
320  86 for (String val : values.split(valuesDelimiter))
321    {
322  87 vals.add(val);
323    }
324    }
325    }
326  33 return map;
327    }
328   
329    /**
330    * Constructs a SequenceFeature from the GFF column data. Subclasses may wish
331    * to call this method then adjust the SequenceFeature depending on the
332    * particular usage of different tools that generate GFF.
333    *
334    * @param gff
335    * @param attributes
336    * @return
337    */
 
338  23 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
339    Map<String, List<String>> attributes)
340    {
341  23 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
342    }
343   
344    /**
345    * @param gff
346    * @param typeColumn
347    * @param group
348    * @param attributes
349    * @return
350    */
 
351  24 toggle protected SequenceFeature buildSequenceFeature(String[] gff,
352    int typeColumn, String group, Map<String, List<String>> attributes)
353    {
354  24 try
355    {
356  24 int start = Integer.parseInt(gff[START_COL]);
357  24 int end = Integer.parseInt(gff[END_COL]);
358   
359    /*
360    * default 'score' is 0 rather than Float.NaN as the latter currently
361    * disables the 'graduated colour => colour by label' option
362    */
363  24 float score = 0f;
364  24 try
365    {
366  24 score = Float.parseFloat(gff[SCORE_COL]);
367    } catch (NumberFormatException nfe)
368    {
369    // e.g. '.' - leave as zero
370    }
371   
372  24 SequenceFeature sf = new SequenceFeature(gff[typeColumn],
373    gff[SOURCE_COL], start, end, score, group);
374   
375  24 sf.setStrand(gff[STRAND_COL]);
376   
377  24 sf.setPhase(gff[PHASE_COL]);
378   
379  24 if (attributes != null)
380    {
381    /*
382    * save 'raw' column 9 to allow roundtrip output as input
383    */
384  18 sf.setAttributes(gff[ATTRIBUTES_COL]);
385   
386    /*
387    * Add attributes in column 9 to the sequence feature's
388    * 'otherData' table; use Note as a best proxy for description
389    */
390  18 for (Entry<String, List<String>> attr : attributes.entrySet())
391    {
392  41 String values = StringUtils.listToDelimitedString(attr.getValue(),
393    ",");
394  41 sf.setValue(attr.getKey(), values);
395  41 if (NOTE.equals(attr.getKey()))
396    {
397  2 sf.setDescription(values);
398    }
399    }
400    }
401   
402  24 return sf;
403    } catch (NumberFormatException nfe)
404    {
405  0 System.err.println("Invalid number in gff: " + nfe.getMessage());
406  0 return null;
407    }
408    }
409   
410    /**
411    * Returns the character used to separate attribute names from values in GFF
412    * column 9. This is space for GFF2, '=' for GFF3.
413    *
414    * @return
415    */
416    protected abstract char getNameValueSeparator();
417   
418    /**
419    * Returns any existing mapping held on the alignment between the given
420    * dataset sequences, or a new one if none found. This is a convenience method
421    * to facilitate processing multiple GFF lines that make up a single 'spliced'
422    * mapping, by extending the first mapping as the others are read.
423    *
424    * @param align
425    * @param fromSeq
426    * @param toSeq
427    * @return
428    */
 
429  16 toggle protected AlignedCodonFrame getMapping(AlignmentI align,
430    SequenceI fromSeq, SequenceI toSeq)
431    {
432  16 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
433  16 if (acf == null)
434    {
435  15 acf = new AlignedCodonFrame();
436    }
437  16 return acf;
438    }
439   
440    }