1. Project Clover database Fri Dec 6 2024 13:47:14 GMT
  2. Package jalview.io.gff

File Gff3Helper.java

 

Coverage histogram

../../../img/srcFileCovDistChart9.png
12% of files have more coverage

Code metrics

34
92
7
1
426
207
28
0.3
13.14
7
4

Classes

Class
Line #
Actions
Gff3Helper 40 92 28
0.8646616386.5%
 

Contributing tests

This file is covered by 7 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io.gff;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.AlignmentI;
25    import jalview.datamodel.MappingType;
26    import jalview.datamodel.SequenceFeature;
27    import jalview.datamodel.SequenceI;
28    import jalview.util.MapList;
29    import jalview.util.StringUtils;
30   
31    import java.io.IOException;
32    import java.util.List;
33    import java.util.Map;
34   
35    /**
36    * Base class with generic / common functionality for processing GFF3 data.
37    * Override this as required for any specialisations resulting from
38    * peculiarities of GFF3 generated by particular tools.
39    */
 
40    public class Gff3Helper extends GffHelperBase
41    {
42    public static final String ALLELES = "alleles";
43   
44    protected static final String TARGET = "Target";
45   
46    protected static final String ID = "ID";
47   
48    private static final String NAME = "Name";
49   
50    /**
51    * GFF3 uses '=' to delimit name/value pairs in column 9, and comma to
52    * separate multiple values for a name
53    *
54    * @param text
55    * @return
56    */
 
57  8 toggle public static Map<String, List<String>> parseNameValuePairs(String text)
58    {
59  8 return parseNameValuePairs(text, ";", '=', ",");
60    }
61   
62    /**
63    * Process one GFF feature line (as modelled by SequenceFeature)
64    *
65    * @param seq
66    * the sequence with which this feature is associated
67    * @param sf
68    * the sequence feature with ATTRIBUTES property containing any
69    * additional attributes
70    * @param align
71    * the alignment we are adding GFF to
72    * @param newseqs
73    * any new sequences referenced by the GFF
74    * @param relaxedIdMatching
75    * if true, match word tokens in sequence names
76    * @return true if the sequence feature should be added to the sequence, else
77    * false (i.e. it has been processed in another way e.g. to generate a
78    * mapping)
79    * @throws IOException
80    */
 
81  7 toggle @Override
82    public SequenceFeature processGff(SequenceI seq, String[] gff,
83    AlignmentI align, List<SequenceI> newseqs,
84    boolean relaxedIdMatching) throws IOException
85    {
86  7 SequenceFeature sf = null;
87   
88  7 if (gff.length == 9)
89    {
90  7 String soTerm = gff[TYPE_COL];
91  7 String atts = gff[ATTRIBUTES_COL];
92  7 Map<String, List<String>> attributes = parseNameValuePairs(atts);
93   
94  7 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
95  7 if (so.isA(soTerm, SequenceOntologyI.PROTEIN_MATCH))
96    {
97  0 sf = processProteinMatch(attributes, seq, gff, align, newseqs,
98    relaxedIdMatching);
99    }
100  7 else if (so.isA(soTerm, SequenceOntologyI.NUCLEOTIDE_MATCH))
101    {
102  5 sf = processNucleotideMatch(attributes, seq, gff, align, newseqs,
103    relaxedIdMatching);
104    }
105    else
106    {
107  2 sf = buildSequenceFeature(gff, attributes);
108    }
109    }
110    else
111    {
112    /*
113    * fall back on generating a sequence feature with no special processing
114    */
115  0 sf = buildSequenceFeature(gff, null);
116    }
117   
118  7 return sf;
119    }
120   
121    /**
122    * Processes one GFF3 nucleotide (e.g. cDNA to genome) match.
123    *
124    * @param attributes
125    * parsed GFF column 9 key/value(s)
126    * @param seq
127    * the sequence the GFF feature is on
128    * @param gffColumns
129    * the GFF column data
130    * @param align
131    * the alignment the sequence belongs to, where any new mappings
132    * should be added
133    * @param newseqs
134    * a list of new 'virtual sequences' generated while parsing GFF
135    * @param relaxedIdMatching
136    * if true allow fuzzy search for a matching target sequence
137    * @return a sequence feature, if one should be added to the sequence, else
138    * null
139    * @throws IOException
140    */
 
141  5 toggle protected SequenceFeature processNucleotideMatch(
142    Map<String, List<String>> attributes, SequenceI seq,
143    String[] gffColumns, AlignmentI align, List<SequenceI> newseqs,
144    boolean relaxedIdMatching) throws IOException
145    {
146  5 String strand = gffColumns[STRAND_COL];
147   
148    /*
149    * (For now) we don't process mappings from reverse complement ; to do
150    * this would require (a) creating a virtual sequence placeholder for
151    * the reverse complement (b) resolving the sequence by its id from some
152    * source (GFF ##FASTA or other) (c) creating the reverse complement
153    * sequence (d) updating the mapping to be to the reverse complement
154    */
155  5 if ("-".equals(strand))
156    {
157  1 jalview.bin.Console.errPrintln(
158    "Skipping mapping from reverse complement as not yet supported");
159  1 return null;
160    }
161   
162  4 List<String> targets = attributes.get(TARGET);
163  4 if (targets == null)
164    {
165  0 jalview.bin.Console.errPrintln("'Target' missing in GFF");
166  0 return null;
167    }
168   
169    /*
170    * Typically we only expect one Target per GFF line, but this can handle
171    * multiple matches, to the same or different sequences (e.g. dna variants)
172    */
173  4 for (String target : targets)
174    {
175    /*
176    * Process "seqid start end [strand]"
177    */
178  4 String[] tokens = target.split(" ");
179  4 if (tokens.length < 3)
180    {
181  0 jalview.bin.Console.errPrintln("Incomplete Target: " + target);
182  0 continue;
183    }
184   
185    /*
186    * Locate the mapped sequence in the alignment, or as a
187    * (new or existing) virtual sequence in the newseqs list
188    */
189  4 String targetId = findTargetId(tokens[0], attributes);
190  4 SequenceI mappedSequence1 = findSequence(targetId, align, newseqs,
191    relaxedIdMatching);
192  4 SequenceI mappedSequence = mappedSequence1;
193  4 if (mappedSequence == null)
194    {
195  0 continue;
196    }
197   
198    /*
199    * get any existing mapping for these sequences (or start one),
200    * and add this mapped range
201    */
202  4 AlignedCodonFrame acf = getMapping(align, seq, mappedSequence);
203   
204  4 try
205    {
206  4 int toStart = Integer.parseInt(tokens[1]);
207  4 int toEnd = Integer.parseInt(tokens[2]);
208  4 if (tokens.length > 3 && "-".equals(tokens[3]))
209    {
210    // mapping to reverse strand - swap start/end
211  1 int temp = toStart;
212  1 toStart = toEnd;
213  1 toEnd = temp;
214    }
215   
216  4 int fromStart = Integer.parseInt(gffColumns[START_COL]);
217  4 int fromEnd = Integer.parseInt(gffColumns[END_COL]);
218  4 MapList mapping = constructMappingFromAlign(fromStart, fromEnd,
219    toStart, toEnd, MappingType.NucleotideToNucleotide);
220   
221  4 if (mapping != null)
222    {
223  4 acf.addMap(seq, mappedSequence, mapping);
224  4 align.addCodonFrame(acf);
225    }
226    } catch (NumberFormatException nfe)
227    {
228  0 jalview.bin.Console
229    .errPrintln("Invalid start or end in Target " + target);
230    }
231    }
232   
233  4 SequenceFeature sf = buildSequenceFeature(gffColumns, attributes);
234  4 return sf;
235    }
236   
237    /**
238    * Returns the target sequence id extracted from the GFF name/value pairs.
239    * Default (standard behaviour) is the first token for "Target". This may be
240    * overridden where tools report this in a non-standard way.
241    *
242    * @param target
243    * first token of a "Target" value from GFF column 9, typically
244    * "seqid start end"
245    * @param set
246    * a map with all parsed column 9 attributes
247    * @return
248    */
 
249  4 toggle @SuppressWarnings("unused")
250    protected String findTargetId(String target,
251    Map<String, List<String>> set)
252    {
253  4 return target;
254    }
255   
256    /**
257    * Processes one GFF 'protein_match'; fields of interest are
258    * <ul>
259    * <li>feature group - the database reporting a match e.g. Pfam</li>
260    * <li>Name - the matched entry's accession id in the database</li>
261    * <li>ID - a sequence identifier for the matched region (which may be
262    * appended as FASTA in the GFF file)</li>
263    * </ul>
264    *
265    * @param set
266    * parsed GFF column 9 key/value(s)
267    * @param seq
268    * the sequence the GFF feature is on
269    * @param gffColumns
270    * the sequence feature holding GFF data
271    * @param align
272    * the alignment the sequence belongs to, where any new mappings
273    * should be added
274    * @param newseqs
275    * a list of new 'virtual sequences' generated while parsing GFF
276    * @param relaxedIdMatching
277    * if true allow fuzzy search for a matching target sequence
278    * @return the (real or virtual) sequence(s) mapped to by this match
279    * @throws IOException
280    */
 
281  1 toggle protected SequenceFeature processProteinMatch(
282    Map<String, List<String>> set, SequenceI seq, String[] gffColumns,
283    AlignmentI align, List<SequenceI> newseqs,
284    boolean relaxedIdMatching)
285    {
286    // This is currently tailored to InterProScan GFF output:
287    // ID holds the ID of the matched sequence, Target references the
288    // query sequence; this looks wrong, as ID should just be the GFF internal
289    // ID of the GFF feature, while Target would normally reference the matched
290    // sequence.
291    // TODO refactor as needed if other protein-protein GFF varies
292   
293  1 SequenceFeature sf = buildSequenceFeature(gffColumns, set);
294   
295    /*
296    * locate the mapped sequence in the alignment, or as a
297    * (new or existing) virtual sequence in the newseqs list
298    */
299  1 List<String> targets = set.get(TARGET);
300  1 if (targets != null)
301    {
302  1 for (String target : targets)
303    {
304   
305  1 SequenceI mappedSequence1 = findSequence(findTargetId(target, set),
306    align, newseqs, relaxedIdMatching);
307  1 SequenceI mappedSequence = mappedSequence1;
308  1 if (mappedSequence == null)
309    {
310  0 continue;
311    }
312   
313    /*
314    * give the mapped sequence a copy of the sequence feature, with
315    * start/end range adjusted
316    */
317  1 int sequenceFeatureLength = 1 + sf.getEnd() - sf.getBegin();
318  1 SequenceFeature sf2 = new SequenceFeature(sf, 1,
319    sequenceFeatureLength, sf.getFeatureGroup(), sf.getScore());
320  1 mappedSequence.addSequenceFeature(sf2);
321   
322    /*
323    * add a property to the mapped sequence so that it can eventually be
324    * renamed with its qualified accession id; renaming has to wait until
325    * all sequence reference resolution is complete
326    */
327  1 String accessionId = StringUtils
328    .listToDelimitedString(set.get(NAME), ",");
329  1 if (accessionId.length() > 0)
330    {
331  1 String database = sf.getType(); // TODO InterProScan only??
332  1 String qualifiedAccId = database + "|" + accessionId;
333  1 sf2.setValue(RENAME_TOKEN, qualifiedAccId);
334    }
335   
336    /*
337    * get any existing mapping for these sequences (or start one),
338    * and add this mapped range
339    */
340  1 AlignedCodonFrame alco = getMapping(align, seq, mappedSequence);
341  1 int[] from = new int[] { sf.getBegin(), sf.getEnd() };
342  1 int[] to = new int[] { 1, sequenceFeatureLength };
343  1 MapList mapping = new MapList(from, to, 1, 1);
344   
345  1 alco.addMap(seq, mappedSequence, mapping);
346  1 align.addCodonFrame(alco);
347    }
348    }
349   
350  1 return sf;
351    }
352   
353    /**
354    * Modifies the default SequenceFeature in order to set the Target sequence id
355    * as the description
356    */
 
357  7 toggle @Override
358    protected SequenceFeature buildSequenceFeature(String[] gff,
359    int typeColumn, String group,
360    Map<String, List<String>> attributes)
361    {
362  7 SequenceFeature sf = super.buildSequenceFeature(gff, typeColumn, group,
363    attributes);
364  7 String desc = getDescription(sf, attributes);
365  7 if (desc != null)
366    {
367  6 sf.setDescription(desc);
368    }
369  7 return sf;
370    }
371   
372    /**
373    * Apply heuristic rules to try to get the most useful feature description
374    *
375    * @param sf
376    * @param attributes
377    * @return
378    */
 
379  15 toggle protected String getDescription(SequenceFeature sf,
380    Map<String, List<String>> attributes)
381    {
382  15 String desc = null;
383  15 String target = (String) sf.getValue(TARGET);
384  15 if (target != null)
385    {
386  6 desc = target.split(" ")[0];
387    }
388   
389  15 SequenceOntologyI so = SequenceOntologyFactory.getInstance();
390  15 String type = sf.getType();
391  15 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT))
392    {
393    /*
394    * Ensembl returns dna variants as 'alleles'
395    */
396  2 desc = StringUtils.listToDelimitedString(attributes.get(ALLELES),
397    ",");
398    }
399   
400    /*
401    * extract 'Name' for a transcript (to show gene name)
402    * or an exon (so 'colour by label' shows exon boundaries)
403    */
404  15 if (SequenceOntologyI.NMD_TRANSCRIPT_VARIANT.equals(type)
405    || so.isA(type, SequenceOntologyI.TRANSCRIPT)
406    || so.isA(type, SequenceOntologyI.EXON))
407    {
408  4 desc = StringUtils.listToDelimitedString(attributes.get("Name"), ",");
409    }
410   
411    /*
412    * if the above fails, try ID
413    */
414  15 if (desc == null)
415    {
416  4 desc = (String) sf.getValue(ID);
417    }
418   
419    /*
420    * and decode comma, equals, semi-colon as required by GFF3 spec
421    */
422  15 desc = StringUtils.urlDecode(desc, GFF_ENCODABLE);
423   
424  15 return desc;
425    }
426    }