Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.io.gff

File Gff3Helper.java

 

Coverage histogram

../../../img/srcFileCovDistChart9.png
12% of files have more coverage

Code metrics

34
92
8
1
429
210
29
0.32
11.5
8
3.62

Classes

Class Line # Actions
Gff3Helper 40 92 29 20
0.850746385.1%
 

Contributing tests

This file is covered by 7 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.gff;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.AlignmentI;
25    import jalview.datamodel.MappingType;
26    import jalview.datamodel.SequenceFeature;
27    import jalview.datamodel.SequenceI;
28    import jalview.util.MapList;
29    import jalview.util.StringUtils;
30   
31    import java.io.IOException;
32    import java.util.List;
33    import java.util.Map;
34   
35    /**
36    * Base class with generic / common functionality for processing GFF3 data.
37    * Override this as required for any specialisations resulting from
38    * peculiarities of GFF3 generated by particular tools.
39    */
 
40    public class Gff3Helper extends GffHelperBase
41    {
42    public static final String ALLELES = "alleles";
43   
44    protected static final String TARGET = "Target";
45   
46    protected static final String ID = "ID";
47   
48    private static final String NAME = "Name";
49   
50    /**
51    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
52    * separate multiple values for a name
53    *
54    * @param text
55    * @return
56    */
 
57  8 toggle public static Map<String, List<String>> parseNameValuePairs(String text)
58    {
59  8 return parseNameValuePairs(text, ";", '=', ",");
60    }
61   
62    /**
63    * Process one GFF feature line (as modelled by SequenceFeature)
64    *
65    * @param seq
66    * the sequence with which this feature is associated
67    * @param sf
68    * the sequence feature with ATTRIBUTES property containing any
69    * additional attributes
70    * @param align
71    * the alignment we are adding GFF to
72    * @param newseqs
73    * any new sequences referenced by the GFF
74    * @param relaxedIdMatching
75    * if true, match word tokens in sequence names
76    * @return true if the sequence feature should be added to the sequence, else
77    * false (i.e. it has been processed in another way e.g. to generate a
78    * mapping)
79    * @throws IOException
80    */
 
81  7 toggle @Override
82    public SequenceFeature processGff(SequenceI seq, String[] gff,
83    AlignmentI align, List<SequenceI> newseqs,
84    boolean relaxedIdMatching) throws IOException
85    {
86  7 SequenceFeature sf = null;
87   
88  7 if (gff.length == 9)
89    {
90  7 String soTerm = gff[TYPE_COL];
91  7 String atts = gff[ATTRIBUTES_COL];
92  7 Map<String, List<String>> attributes = parseNameValuePairs(atts);
93   
94  7 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
95  7 if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
96    {
97  0 sf = processProteinMatch(attributes, seq, gff, align, newseqs,
98    relaxedIdMatching);
99    }
100  7 else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
101    {
102  5 sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
103    relaxedIdMatching);
104    }
105    else
106    {
107  2 sf = buildSequenceFeature(gff, attributes);
108    }
109    }
110    else
111    {
112    /*
113    * fall back on generating a sequence feature with no special processing
114    */
115  0 sf = buildSequenceFeature(gff, null);
116    }
117   
118  7 return sf;
119    }
120   
121    /**
122    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
123    *
124    * @param attributes
125    * parsed GFF column 9 key/value(s)
126    * @param seq
127    * the sequence the GFF feature is on
128    * @param gffColumns
129    * the GFF column data
130    * @param align
131    * the alignment the sequence belongs to, where any new mappings
132    * should be added
133    * @param newseqs
134    * a list of new 'virtual sequences' generated while parsing GFF
135    * @param relaxedIdMatching
136    * if true allow fuzzy search for a matching target sequence
137    * @return a sequence feature, if one should be added to the sequence, else
138    * null
139    * @throws IOException
140    */
 
141  5 toggle protected SequenceFeature processNucleotideMatch(
142    Map<String, List<String>> attributes, SequenceI seq,
143    String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
144    boolean relaxedIdMatching) throws IOException
145    {
146  5 String strand = gffColumns[STRAND_COL];
147   
148    /*
149    * (For now) we don't process mappings from reverse complement ; to do
150    * this would require (a) creating a virtual sequence placeholder for
151    * the reverse complement (b) resolving the sequence by its id from some
152    * source (GFF ##FASTA or other) (c) creating the reverse complement
153    * sequence (d) updating the mapping to be to the reverse complement
154    */
155  5 if ("-".equals(strand))
156    {
157  1 System.err.println(
158    "Skipping mapping from reverse complement as not yet supported");
159  1 return null;
160    }
161   
162  4 List<String> targets = attributes.get(TARGET);
163  4 if (targets == null)
164    {
165  0 System.err.println("'Target' missing in GFF");
166  0 return null;
167    }
168   
169    /*
170    * Typically we only expect one Target per GFF line, but this can handle
171    * multiple matches, to the same or different sequences (e.g. dna variants)
172    */
173  4 for (String target : targets)
174    {
175    /*
176    * Process "seqid start end [strand]"
177    */
178  4 String[] tokens = target.split(" ");
179  4 if (tokens.length < 3)
180    {
181  0 System.err.println("Incomplete Target: " + target);
182  0 continue;
183    }
184   
185    /*
186    * Locate the mapped sequence in the alignment, or as a
187    * (new or existing) virtual sequence in the newseqs list
188    */
189  4 String targetId = findTargetId(tokens[0], attributes);
190  4 SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
191    relaxedIdMatching);
192  4 SequenceI mappedSequence = mappedSequence1;
193  4 if (mappedSequence == null)
194    {
195  0 continue;
196    }
197   
198    /*
199    * get any existing mapping for these sequences (or start one),
200    * and add this mapped range
201    */
202  4 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
203   
204  4 try
205    {
206  4 int toStart = Integer.parseInt(tokens[1]);
207  4 int toEnd = Integer.parseInt(tokens[2]);
208  4 if (tokens.length > 3 && "-".equals(tokens[3]))
209    {
210    // mapping to reverse strand - swap start/end
211  1 int temp = toStart;
212  1 toStart = toEnd;
213  1 toEnd = temp;
214    }
215   
216  4 int fromStart = Integer.parseInt(gffColumns[START_COL]);
217  4 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
218  4 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
219    toStart, toEnd, MappingType.NucleotideToNucleotide);
220   
221  4 if (mapping != null)
222    {
223  4 acf.addMap(seq, mappedSequence, mapping);
224  4 align.addCodonFrame(acf);
225    }
226    } catch (NumberFormatException nfe)
227    {
228  0 System.err.println("Invalid start or end in Target " + target);
229    }
230    }
231   
232  4 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
233  4 return sf;
234    }
235   
236    /**
237    * Returns the target sequence id extracted from the GFF name/value pairs.
238    * Default (standard behaviour) is the first token for "Target". This may be
239    * overridden where tools report this in a non-standard way.
240    *
241    * @param target
242    * first token of a "Target" value from GFF column 9, typically
243    * "seqid start end"
244    * @param set
245    * a map with all parsed column 9 attributes
246    * @return
247    */
 
248  4 toggle @SuppressWarnings("unused")
249    protected String findTargetId(String target,
250    Map<String, List<String>> set)
251    {
252  4 return target;
253    }
254   
255    /**
256    * Processes one GFF 'protein_match'; fields of interest are
257    * <ul>
258    * <li>feature group - the database reporting a match e.g. Pfam</li>
259    * <li>Name - the matched entry's accession id in the database</li>
260    * <li>ID - a sequence identifier for the matched region (which may be
261    * appended as FASTA in the GFF file)</li>
262    * </ul>
263    *
264    * @param set
265    * parsed GFF column 9 key/value(s)
266    * @param seq
267    * the sequence the GFF feature is on
268    * @param gffColumns
269    * the sequence feature holding GFF data
270    * @param align
271    * the alignment the sequence belongs to, where any new mappings
272    * should be added
273    * @param newseqs
274    * a list of new 'virtual sequences' generated while parsing GFF
275    * @param relaxedIdMatching
276    * if true allow fuzzy search for a matching target sequence
277    * @return the (real or virtual) sequence(s) mapped to by this match
278    * @throws IOException
279    */
 
280  1 toggle protected SequenceFeature processProteinMatch(
281    Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
282    AlignmentI align, List<SequenceI> newseqs,
283    boolean relaxedIdMatching)
284    {
285    // This is currently tailored to InterProScan GFF output:
286    // ID holds the ID of the matched sequence, Target references the
287    // query sequence; this looks wrong, as ID should just be the GFF internal
288    // ID of the GFF feature, while Target would normally reference the matched
289    // sequence.
290    // TODO refactor as needed if other protein-protein GFF varies
291   
292  1 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
293   
294    /*
295    * locate the mapped sequence in the alignment, or as a
296    * (new or existing) virtual sequence in the newseqs list
297    */
298  1 List<String> targets = set.get(TARGET);
299  1 if (targets != null)
300    {
301  1 for (String target : targets)
302    {
303   
304  1 SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
305    align, newseqs, relaxedIdMatching);
306  1 SequenceI mappedSequence = mappedSequence1;
307  1 if (mappedSequence == null)
308    {
309  0 continue;
310    }
311   
312    /*
313    * give the mapped sequence a copy of the sequence feature, with
314    * start/end range adjusted
315    */
316  1 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
317  1 SequenceFeature sf2 = new SequenceFeature(sf, 1,
318    sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
319  1 mappedSequence.addSequenceFeature(sf2);
320   
321    /*
322    * add a property to the mapped sequence so that it can eventually be
323    * renamed with its qualified accession id; renaming has to wait until
324    * all sequence reference resolution is complete
325    */
326  1 String accessionId = StringUtils
327    .listToDelimitedString(set.get(NAME), ",");
328  1 if (accessionId.length() > 0)
329    {
330  1 String database = sf.getType(); // TODO InterProScan only??
331  1 String qualifiedAccId = database + "|" + accessionId;
332  1 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
333    }
334   
335    /*
336    * get any existing mapping for these sequences (or start one),
337    * and add this mapped range
338    */
339  1 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
340  1 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
341  1 int[] to = new int[] { 1, sequenceFeatureLength };
342  1 MapList mapping = new MapList(from, to, 1, 1);
343   
344  1 alco.addMap(seq, mappedSequence, mapping);
345  1 align.addCodonFrame(alco);
346    }
347    }
348   
349  1 return sf;
350    }
351   
352    /**
353    * Return '=' as the name-value separator used in column 9 attributes.
354    */
 
355  0 toggle @Override
356    protected char getNameValueSeparator()
357    {
358  0 return '=';
359    }
360   
361    /**
362    * Modifies the default SequenceFeature in order to set the Target sequence id
363    * as the description
364    */
 
365  7 toggle @Override
366    protected SequenceFeature buildSequenceFeature(String[] gff,
367    int typeColumn, String group,
368    Map<String, List<String>> attributes)
369    {
370  7 SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
371    attributes);
372  7 String desc = getDescription(sf, attributes);
373  7 if (desc != null)
374    {
375  6 sf.setDescription(desc);
376    }
377  7 return sf;
378    }
379   
380    /**
381    * Apply heuristic rules to try to get the most useful feature description
382    *
383    * @param sf
384    * @param attributes
385    * @return
386    */
 
387  15 toggle protected String getDescription(SequenceFeature sf,
388    Map<String, List<String>> attributes)
389    {
390  15 String desc = null;
391  15 String target = (String) sf.getValue(TARGET);
392  15 if (target != null)
393    {
394  6 desc = target.split(" ")[0];
395    }
396   
397  15 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
398  15 String type = sf.getType();
399  15 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
400    {
401    /*
402    * Ensembl returns dna variants as 'alleles'
403    */
404  2 desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
405    ",");
406    }
407   
408    /*
409    * extract 'Name' for a transcript (to show gene name)
410    * or an exon (so 'colour by label' shows exon boundaries)
411    */
412  15 if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
413    || so.isA(type, SequenceOntologyI.TRANSCRIPT)
414    || so.isA(type, SequenceOntologyI.EXON))
415    {
416  4 desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
417    }
418   
419    /*
420    * if the above fails, try ID
421    */
422  15 if (desc == null)
423    {
424  4 desc = (String) sf.getValue(ID);
425    }
426   
427  15 return desc;
428    }
429    }