Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
ExonerateHelper | 39 | 88 | 37 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io.gff; | |
22 | ||
23 | import java.util.Locale; | |
24 | ||
25 | import jalview.datamodel.AlignedCodonFrame; | |
26 | import jalview.datamodel.AlignmentI; | |
27 | import jalview.datamodel.MappingType; | |
28 | import jalview.datamodel.SequenceFeature; | |
29 | import jalview.datamodel.SequenceI; | |
30 | import jalview.util.MapList; | |
31 | ||
32 | import java.io.IOException; | |
33 | import java.util.List; | |
34 | import java.util.Map; | |
35 | ||
36 | /** | |
37 | * A handler to parse GFF in the format generated by the exonerate tool | |
38 | */ | |
39 | public class ExonerateHelper extends Gff2Helper | |
40 | { | |
41 | private static final String SIMILARITY = "similarity"; | |
42 | ||
43 | private static final String GENOME2GENOME = "genome2genome"; | |
44 | ||
45 | private static final String CDNA2GENOME = "cdna2genome"; | |
46 | ||
47 | private static final String CODING2GENOME = "coding2genome"; | |
48 | ||
49 | private static final String CODING2CODING = "coding2coding"; | |
50 | ||
51 | private static final String PROTEIN2GENOME = "protein2genome"; | |
52 | ||
53 | private static final String PROTEIN2DNA = "protein2dna"; | |
54 | ||
55 | private static final String ALIGN = "Align"; | |
56 | ||
57 | private static final String QUERY = "Query"; | |
58 | ||
59 | private static final String TARGET = "Target"; | |
60 | ||
61 | /** | |
62 | * Process one GFF feature line (as modelled by SequenceFeature) | |
63 | * | |
64 | * @param seq | |
65 | * the sequence with which this feature is associated | |
66 | * @param gffColumns | |
67 | * the sequence feature with ATTRIBUTES property containing any | |
68 | * additional attributes | |
69 | * @param align | |
70 | * the alignment we are adding GFF to | |
71 | * @param newseqs | |
72 | * any new sequences referenced by the GFF | |
73 | * @param relaxedIdMatching | |
74 | * if true, match word tokens in sequence names | |
75 | * @return true if the sequence feature should be added to the sequence, else | |
76 | * false (i.e. it has been processed in another way e.g. to generate a | |
77 | * mapping) | |
78 | */ | |
79 | 7 | @Override |
80 | public SequenceFeature processGff(SequenceI seq, String[] gffColumns, | |
81 | AlignmentI align, List<SequenceI> newseqs, | |
82 | boolean relaxedIdMatching) | |
83 | { | |
84 | 7 | String attr = gffColumns[ATTRIBUTES_COL]; |
85 | 7 | Map<String, List<String>> set = parseNameValuePairs(attr); |
86 | ||
87 | 7 | try |
88 | { | |
89 | 7 | processGffSimilarity(set, seq, gffColumns, align, newseqs, |
90 | relaxedIdMatching); | |
91 | } catch (IOException ivfe) | |
92 | { | |
93 | 0 | jalview.bin.Console.errPrintln(ivfe); |
94 | } | |
95 | ||
96 | /* | |
97 | * return null to indicate we don't want to add a sequence feature for | |
98 | * similarity (only process it to create mappings) | |
99 | */ | |
100 | 7 | return null; |
101 | } | |
102 | ||
103 | /** | |
104 | * Processes the 'Query' (or 'Target') and 'Align' properties associated with | |
105 | * an exonerate GFF similarity feature; these properties define the mapping of | |
106 | * the annotated range to a related sequence. | |
107 | * | |
108 | * @param set | |
109 | * parsed GFF column 9 key/value(s) | |
110 | * @param seq | |
111 | * the sequence the GFF feature is on | |
112 | * @param gff | |
113 | * the GFF column data | |
114 | * @param align | |
115 | * the alignment the sequence belongs to, where any new mappings | |
116 | * should be added | |
117 | * @param newseqs | |
118 | * a list of new 'virtual sequences' generated while parsing GFF | |
119 | * @param relaxedIdMatching | |
120 | * if true allow fuzzy search for a matching target sequence | |
121 | * @throws IOException | |
122 | */ | |
123 | 11 | protected void processGffSimilarity(Map<String, List<String>> set, |
124 | SequenceI seq, String[] gff, AlignmentI align, | |
125 | List<SequenceI> newseqs, boolean relaxedIdMatching) | |
126 | throws IOException | |
127 | { | |
128 | /* | |
129 | * exonerate may be run with | |
130 | * --showquerygff - outputs 'features on the query' e.g. (protein2genome) | |
131 | * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount | |
132 | * --showtargetgff - outputs 'features on the target' e.g. (protein2genome) | |
133 | * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount | |
134 | * where the Align spec may repeat | |
135 | */ | |
136 | // TODO handle coding2coding and similar as well | |
137 | 11 | boolean featureIsOnTarget = true; |
138 | 11 | List<String> mapTo = set.get(QUERY); |
139 | 11 | if (mapTo == null) |
140 | { | |
141 | 3 | mapTo = set.get(TARGET); |
142 | 3 | featureIsOnTarget = false; |
143 | } | |
144 | 11 | MappingType type = getMappingType(gff[SOURCE_COL]); |
145 | ||
146 | 11 | if (type == null) |
147 | { | |
148 | 0 | throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]); |
149 | } | |
150 | ||
151 | 11 | if (mapTo == null || mapTo.size() != 1) |
152 | { | |
153 | 0 | throw new IOException( |
154 | "Expecting exactly one sequence in Query or Target field (got " | |
155 | + mapTo + ")"); | |
156 | } | |
157 | ||
158 | /* | |
159 | * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; | |
160 | */ | |
161 | 11 | SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, |
162 | relaxedIdMatching); | |
163 | ||
164 | /* | |
165 | * If mapping is from protein to dna, we store it as dna to protein instead | |
166 | */ | |
167 | 11 | SequenceI mapFromSequence = seq; |
168 | 11 | SequenceI mapToSequence = mappedSequence; |
169 | 11 | if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) |
170 | || (type == MappingType.PeptideToNucleotide | |
171 | && !featureIsOnTarget)) | |
172 | { | |
173 | 3 | mapFromSequence = mappedSequence; |
174 | 3 | mapToSequence = seq; |
175 | } | |
176 | ||
177 | /* | |
178 | * Process the Align maps and create mappings. | |
179 | * These may be cdna-genome, cdna-protein, genome-protein. | |
180 | * The mapped sequences may or may not be in the alignment | |
181 | * (they may be included later in the GFF file). | |
182 | */ | |
183 | ||
184 | /* | |
185 | * get any existing mapping for these sequences (or start one), | |
186 | * and add this mapped range | |
187 | */ | |
188 | 11 | AlignedCodonFrame acf = getMapping(align, mapFromSequence, |
189 | mapToSequence); | |
190 | ||
191 | /* | |
192 | * exonerate GFF has the strand of the target in column 7 | |
193 | * (differs from GFF3 which has it in the Target descriptor) | |
194 | */ | |
195 | 11 | String strand = gff[STRAND_COL]; |
196 | 11 | boolean forwardStrand = true; |
197 | 11 | if ("-".equals(strand)) |
198 | { | |
199 | 9 | forwardStrand = false; |
200 | } | |
201 | 2 | else if (!"+".equals(strand)) |
202 | { | |
203 | 0 | jalview.bin.Console |
204 | .errPrintln("Strand must be specified for alignment"); | |
205 | 0 | return; |
206 | } | |
207 | ||
208 | 11 | List<String> alignedRegions = set.get(ALIGN); |
209 | 11 | for (String region : alignedRegions) |
210 | { | |
211 | 15 | MapList mapping = buildMapping(region, type, forwardStrand, |
212 | featureIsOnTarget, gff); | |
213 | ||
214 | 15 | if (mapping == null) |
215 | { | |
216 | 0 | continue; |
217 | } | |
218 | ||
219 | 15 | acf.addMap(mapFromSequence, mapToSequence, mapping); |
220 | } | |
221 | 11 | align.addCodonFrame(acf); |
222 | } | |
223 | ||
224 | /** | |
225 | * Construct the mapping | |
226 | * | |
227 | * @param region | |
228 | * @param type | |
229 | * @param forwardStrand | |
230 | * @param featureIsOnTarget | |
231 | * @param gff | |
232 | * @return | |
233 | */ | |
234 | 15 | protected MapList buildMapping(String region, MappingType type, |
235 | boolean forwardStrand, boolean featureIsOnTarget, String[] gff) | |
236 | { | |
237 | /* | |
238 | * process one "fromStart toStart fromCount" descriptor | |
239 | */ | |
240 | 15 | String[] tokens = region.split(" "); |
241 | 15 | if (tokens.length != 3) |
242 | { | |
243 | 0 | jalview.bin.Console |
244 | .errPrintln("Malformed Align descriptor: " + region); | |
245 | 0 | return null; |
246 | } | |
247 | ||
248 | /* | |
249 | * get start/end of from/to mappings | |
250 | * if feature is on the target sequence we have to invert the sense | |
251 | */ | |
252 | 15 | int alignFromStart; |
253 | 15 | int alignToStart; |
254 | 15 | int alignCount; |
255 | 15 | try |
256 | { | |
257 | 15 | alignFromStart = Integer.parseInt(tokens[0]); |
258 | 15 | alignToStart = Integer.parseInt(tokens[1]); |
259 | 15 | alignCount = Integer.parseInt(tokens[2]); |
260 | } catch (NumberFormatException nfe) | |
261 | { | |
262 | 0 | jalview.bin.Console.errPrintln(nfe.toString()); |
263 | 0 | return null; |
264 | } | |
265 | ||
266 | 15 | int fromStart; |
267 | 15 | int fromEnd; |
268 | 15 | int toStart; |
269 | 15 | int toEnd; |
270 | ||
271 | 15 | if (featureIsOnTarget) |
272 | { | |
273 | 12 | fromStart = alignToStart; |
274 | 12 | toStart = alignFromStart; |
275 | 12 | toEnd = forwardStrand ? toStart + alignCount - 1 |
276 | : toStart - (alignCount - 1); | |
277 | 12 | int toLength = Math.abs(toEnd - toStart) + 1; |
278 | 12 | int fromLength = toLength * type.getFromRatio() / type.getToRatio(); |
279 | 12 | fromEnd = fromStart + fromLength - 1; |
280 | } | |
281 | else | |
282 | { | |
283 | // we use the 'Align' values here not the feature start/end | |
284 | // not clear why they may differ but it seems they can | |
285 | 3 | fromStart = alignFromStart; |
286 | 3 | fromEnd = alignFromStart + alignCount - 1; |
287 | 3 | int fromLength = fromEnd - fromStart + 1; |
288 | 3 | int toLength = fromLength * type.getToRatio() / type.getFromRatio(); |
289 | 3 | toStart = alignToStart; |
290 | 3 | if (forwardStrand) |
291 | { | |
292 | 1 | toEnd = toStart + toLength - 1; |
293 | } | |
294 | else | |
295 | { | |
296 | 2 | toEnd = toStart - (toLength - 1); |
297 | } | |
298 | } | |
299 | ||
300 | 15 | MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd, |
301 | toStart, toEnd, type); | |
302 | 15 | return codonmapping; |
303 | } | |
304 | ||
305 | /** | |
306 | * Returns a MappingType depending on the exonerate 'model' value. | |
307 | * | |
308 | * @param model | |
309 | * @return | |
310 | */ | |
311 | 18 | protected static MappingType getMappingType(String model) |
312 | { | |
313 | 18 | MappingType result = null; |
314 | ||
315 | 18 | if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME)) |
316 | { | |
317 | 13 | result = MappingType.PeptideToNucleotide; |
318 | } | |
319 | 5 | else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME) |
320 | || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME)) | |
321 | { | |
322 | 4 | result = MappingType.NucleotideToNucleotide; |
323 | } | |
324 | 18 | return result; |
325 | } | |
326 | ||
327 | /** | |
328 | * Tests whether the GFF data looks like it was generated by exonerate, and is | |
329 | * a format we are willing to handle | |
330 | * | |
331 | * @param columns | |
332 | * @return | |
333 | */ | |
334 | 38 | public static boolean recognises(String[] columns) |
335 | { | |
336 | 38 | if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL])) |
337 | { | |
338 | 22 | return false; |
339 | } | |
340 | ||
341 | /* | |
342 | * inspect alignment model | |
343 | */ | |
344 | 16 | String model = columns[SOURCE_COL]; |
345 | // e.g. exonerate:protein2genome:local | |
346 | 16 | if (model != null) |
347 | { | |
348 | 16 | String mdl = model.toLowerCase(Locale.ROOT); |
349 | 16 | if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME) |
350 | || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME) | |
351 | || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME)) | |
352 | { | |
353 | 14 | return true; |
354 | } | |
355 | } | |
356 | 2 | jalview.bin.Console |
357 | .errPrintln("Sorry, I don't handle exonerate model " + model); | |
358 | 2 | return false; |
359 | } | |
360 | ||
361 | /** | |
362 | * An override to set feature group to "exonerate" instead of the default GFF | |
363 | * source value (column 2) | |
364 | */ | |
365 | 0 | @Override |
366 | protected SequenceFeature buildSequenceFeature(String[] gff, | |
367 | Map<String, List<String>> set) | |
368 | { | |
369 | 0 | SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL, |
370 | "exonerate", set); | |
371 | ||
372 | 0 | return sf; |
373 | } | |
374 | ||
375 | } |