Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
FeaturesFile | 76 | 431 | 164 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import java.awt.Color; | |
24 | import java.io.IOException; | |
25 | import java.util.ArrayList; | |
26 | import java.util.Arrays; | |
27 | import java.util.Collections; | |
28 | import java.util.HashMap; | |
29 | import java.util.LinkedHashMap; | |
30 | import java.util.List; | |
31 | import java.util.Locale; | |
32 | import java.util.Map; | |
33 | import java.util.Map.Entry; | |
34 | import java.util.TreeMap; | |
35 | ||
36 | import jalview.analysis.AlignmentUtils; | |
37 | import jalview.analysis.SequenceIdMatcher; | |
38 | import jalview.api.AlignViewportI; | |
39 | import jalview.api.FeatureColourI; | |
40 | import jalview.api.FeatureRenderer; | |
41 | import jalview.api.FeaturesSourceI; | |
42 | import jalview.datamodel.AlignedCodonFrame; | |
43 | import jalview.datamodel.Alignment; | |
44 | import jalview.datamodel.AlignmentI; | |
45 | import jalview.datamodel.MappedFeatures; | |
46 | import jalview.datamodel.SequenceDummy; | |
47 | import jalview.datamodel.SequenceFeature; | |
48 | import jalview.datamodel.SequenceI; | |
49 | import jalview.datamodel.features.FeatureMatcherSet; | |
50 | import jalview.datamodel.features.FeatureMatcherSetI; | |
51 | import jalview.gui.Desktop; | |
52 | import jalview.io.gff.GffHelperFactory; | |
53 | import jalview.io.gff.GffHelperI; | |
54 | import jalview.schemes.FeatureColour; | |
55 | import jalview.util.ColorUtils; | |
56 | import jalview.util.MapList; | |
57 | import jalview.util.ParseHtmlBodyAndLinks; | |
58 | import jalview.util.StringUtils; | |
59 | ||
60 | /** | |
61 | * Parses and writes features files, which may be in Jalview, GFF2 or GFF3 | |
62 | * format. These are tab-delimited formats but with differences in the use of | |
63 | * columns. | |
64 | * | |
65 | * A Jalview feature file may define feature colours and then declare that the | |
66 | * remainder of the file is in GFF format with the line 'GFF'. | |
67 | * | |
68 | * GFF3 files may include alignment mappings for features, which Jalview will | |
69 | * attempt to model, and may include sequence data following a ##FASTA line. | |
70 | * | |
71 | * | |
72 | * @author AMW | |
73 | * @author jbprocter | |
74 | * @author gmcarstairs | |
75 | */ | |
76 | public class FeaturesFile extends AlignFile implements FeaturesSourceI | |
77 | { | |
78 | private static final String EQUALS = "="; | |
79 | ||
80 | private static final String TAB_REGEX = "\\t"; | |
81 | ||
82 | private static final String STARTGROUP = "STARTGROUP"; | |
83 | ||
84 | private static final String ENDGROUP = "ENDGROUP"; | |
85 | ||
86 | private static final String STARTFILTERS = "STARTFILTERS"; | |
87 | ||
88 | private static final String ENDFILTERS = "ENDFILTERS"; | |
89 | ||
90 | private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; | |
91 | ||
92 | protected static final String GFF_VERSION = "##gff-version"; | |
93 | ||
94 | private AlignmentI lastmatchedAl = null; | |
95 | ||
96 | private SequenceIdMatcher matcher = null; | |
97 | ||
98 | protected AlignmentI dataset; | |
99 | ||
100 | protected int gffVersion; | |
101 | ||
/**
 * Default constructor, which only invokes the superclass no-argument
 * constructor.
 */
public FeaturesFile()
{
  super();
}
108 | ||
/**
 * Constructs a features file reader for the given data source, passing
 * {@code false} as the first argument of the superclass constructor so that
 * the data is not parsed straight away.
 *
 * @param file
 *          File or String filename
 * @param paste
 *          the type of data source
 * @throws IOException
 *           if thrown by the superclass constructor
 */
public FeaturesFile(Object file, DataSourceType paste) throws IOException
{
  super(false, file, paste);
}
121 | ||
/**
 * Constructs a features file reader from an existing {@code FileParse},
 * which is handed unchanged to the superclass constructor.
 *
 * @param source
 *          the data source to read from
 * @throws IOException
 *           if thrown by the superclass constructor
 */
public FeaturesFile(FileParse source) throws IOException
{
  super(source);
}
130 | ||
/**
 * Constructs a features file reader, leaving it to the caller to decide
 * whether the data is parsed immediately. All arguments are passed straight
 * through to the superclass constructor.
 *
 * @param parseImmediately
 *          if true, parse the data source on construction
 * @param file
 *          File or String filename
 * @param type
 *          the type of data source
 * @throws IOException
 *           if thrown by the superclass constructor
 */
public FeaturesFile(boolean parseImmediately, Object file,
        DataSourceType type) throws IOException
{
  super(parseImmediately, file, type);
}
144 | ||
145 | /** | |
146 | * Parse GFF or sequence features file using case-independent matching, | |
147 | * discarding URLs | |
148 | * | |
149 | * @param align | |
150 | * - alignment/dataset containing sequences that are to be annotated | |
151 | * @param colours | |
152 | * - hashtable to store feature colour definitions | |
153 | * @param removeHTML | |
154 | * - process html strings into plain text | |
155 | * @return true if features were added | |
156 | */ | |
157 | 5 | public boolean parse(AlignmentI align, |
158 | Map<String, FeatureColourI> colours, boolean removeHTML) | |
159 | { | |
160 | 5 | return parse(align, colours, removeHTML, false); |
161 | } | |
162 | ||
163 | /** | |
164 | * Extends the default addProperties by also adding peptide-to-cDNA mappings | |
165 | * (if any) derived while parsing a GFF file | |
166 | */ | |
167 | 2 | @Override |
168 | public void addProperties(AlignmentI al) | |
169 | { | |
170 | 2 | super.addProperties(al); |
171 | 2 | if (dataset != null && dataset.getCodonFrames() != null) |
172 | { | |
173 | 2 | AlignmentI ds = (al.getDataset() == null) ? al : al.getDataset(); |
174 | 2 | for (AlignedCodonFrame codons : dataset.getCodonFrames()) |
175 | { | |
176 | 2 | ds.addCodonFrame(codons); |
177 | } | |
178 | } | |
179 | } | |
180 | ||
181 | /** | |
182 | * Parse GFF or Jalview format sequence features file | |
183 | * | |
184 | * @param align | |
185 | * - alignment/dataset containing sequences that are to be annotated | |
186 | * @param colours | |
187 | * - map to store feature colour definitions | |
188 | * @param removeHTML | |
189 | * - process html strings into plain text | |
190 | * @param relaxedIdmatching | |
191 | * - when true, ID matches to compound sequence IDs are allowed | |
192 | * @return true if features were added | |
193 | */ | |
194 | 9 | public boolean parse(AlignmentI align, |
195 | Map<String, FeatureColourI> colours, boolean removeHTML, | |
196 | boolean relaxedIdmatching) | |
197 | { | |
198 | 9 | return parse(align, colours, null, removeHTML, relaxedIdmatching); |
199 | } | |
200 | ||
201 | /** | |
202 | * Parse GFF or Jalview format sequence features file | |
203 | * | |
204 | * @param align | |
205 | * - alignment/dataset containing sequences that are to be annotated | |
206 | * @param colours | |
207 | * - map to store feature colour definitions | |
208 | * @param filters | |
209 | * - map to store feature filter definitions | |
210 | * @param removeHTML | |
211 | * - process html strings into plain text | |
212 | * @param relaxedIdmatching | |
213 | * - when true, ID matches to compound sequence IDs are allowed | |
214 | * @return true if features were added | |
215 | */ | |
216 | 13 | public boolean parse(AlignmentI align, |
217 | Map<String, FeatureColourI> colours, | |
218 | Map<String, FeatureMatcherSetI> filters, boolean removeHTML, | |
219 | boolean relaxedIdmatching) | |
220 | { | |
221 | 13 | Map<String, String> gffProps = new HashMap<>(); |
222 | /* | |
223 | * keep track of any sequences we try to create from the data | |
224 | */ | |
225 | 13 | List<SequenceI> newseqs = new ArrayList<>(); |
226 | ||
227 | 13 | String line = null; |
228 | 13 | try |
229 | { | |
230 | 13 | String[] gffColumns; |
231 | 13 | String featureGroup = null; |
232 | ||
233 | ? | while ((line = nextLine()) != null) |
234 | { | |
235 | // skip comments/process pragmas | |
236 | 730 | if (line.length() == 0 || line.startsWith("#")) |
237 | { | |
238 | 97 | if (line.toLowerCase(Locale.ROOT).startsWith("##")) |
239 | { | |
240 | 26 | processGffPragma(line, gffProps, align, newseqs); |
241 | } | |
242 | 97 | continue; |
243 | } | |
244 | ||
245 | 633 | gffColumns = line.split(TAB_REGEX); |
246 | 633 | if (gffColumns.length == 1) |
247 | { | |
248 | 2 | if (line.trim().equalsIgnoreCase("GFF")) |
249 | { | |
250 | /* | |
251 | * Jalview features file with appended GFF | |
252 | * assume GFF2 (though it may declare ##gff-version 3) | |
253 | */ | |
254 | 1 | gffVersion = 2; |
255 | 1 | continue; |
256 | } | |
257 | } | |
258 | ||
259 | 632 | if (gffColumns.length > 0 && gffColumns.length < 4) |
260 | { | |
261 | /* | |
262 | * if 2 or 3 tokens, we anticipate either 'startgroup', 'endgroup' or | |
263 | * a feature type colour specification | |
264 | */ | |
265 | 66 | String ft = gffColumns[0]; |
266 | 66 | if (ft.equalsIgnoreCase(STARTFILTERS)) |
267 | { | |
268 | 1 | parseFilters(filters); |
269 | 1 | continue; |
270 | } | |
271 | 65 | if (ft.equalsIgnoreCase(STARTGROUP)) |
272 | { | |
273 | 7 | featureGroup = gffColumns[1]; |
274 | } | |
275 | 58 | else if (ft.equalsIgnoreCase(ENDGROUP)) |
276 | { | |
277 | // We should check whether this is the current group, | |
278 | // but at present there's no way of showing more than 1 group | |
279 | 7 | featureGroup = null; |
280 | } | |
281 | else | |
282 | { | |
283 | 51 | String colscheme = gffColumns[1]; |
284 | 51 | FeatureColourI colour = FeatureColour |
285 | .parseJalviewFeatureColour(colscheme); | |
286 | 51 | if (colour != null) |
287 | { | |
288 | 51 | colours.put(ft, colour); |
289 | } | |
290 | } | |
291 | 65 | continue; |
292 | } | |
293 | ||
294 | /* | |
295 | * if not a comment, GFF pragma, startgroup, endgroup or feature | |
296 | * colour specification, that just leaves a feature details line | |
297 | * in either Jalview or GFF format | |
298 | */ | |
299 | 566 | if (gffVersion == 0) |
300 | { | |
301 | 540 | parseJalviewFeature(line, gffColumns, align, colours, removeHTML, |
302 | relaxedIdmatching, featureGroup); | |
303 | } | |
304 | else | |
305 | { | |
306 | 26 | parseGff(gffColumns, align, relaxedIdmatching, newseqs); |
307 | } | |
308 | } | |
309 | 13 | resetMatcher(); |
310 | } catch (Exception ex) | |
311 | { | |
312 | // should report somewhere useful for UI if necessary | |
313 | 0 | warningMessage = ((warningMessage == null) ? "" : warningMessage) |
314 | + "Parsing error at\n" + line; | |
315 | 0 | jalview.bin.Console.outPrintln( |
316 | "Error parsing feature file: " + ex + "\n" + line); | |
317 | 0 | ex.printStackTrace(System.err); |
318 | 0 | resetMatcher(); |
319 | 0 | return false; |
320 | } | |
321 | ||
322 | /* | |
323 | * experimental - add any dummy sequences with features to the alignment | |
324 | * - we need them for Ensembl feature extraction - though maybe not otherwise | |
325 | */ | |
326 | 13 | for (SequenceI newseq : newseqs) |
327 | { | |
328 | 3 | if (newseq.getFeatures().hasFeatures()) |
329 | { | |
330 | 1 | align.addSequence(newseq); |
331 | } | |
332 | } | |
333 | 13 | return true; |
334 | } | |
335 | ||
336 | /** | |
337 | * Reads input lines from STARTFILTERS to ENDFILTERS and adds a feature type | |
338 | * filter to the map for each line parsed. After exit from this method, | |
339 | * nextLine() should return the line after ENDFILTERS (or we are already at | |
340 | * end of file if ENDFILTERS was missing). | |
341 | * | |
342 | * @param filters | |
343 | * @throws IOException | |
344 | */ | |
345 | 2 | protected void parseFilters(Map<String, FeatureMatcherSetI> filters) |
346 | throws IOException | |
347 | { | |
348 | 2 | String line; |
349 | ? | while ((line = nextLine()) != null) |
350 | { | |
351 | 5 | if (line.toUpperCase(Locale.ROOT).startsWith(ENDFILTERS)) |
352 | { | |
353 | 1 | return; |
354 | } | |
355 | 4 | String[] tokens = line.split(TAB_REGEX); |
356 | 4 | if (tokens.length != 2) |
357 | { | |
358 | 0 | jalview.bin.Console.errPrintln(String.format( |
359 | "Invalid token count %d for %d", tokens.length, line)); | |
360 | } | |
361 | else | |
362 | { | |
363 | 4 | String featureType = tokens[0]; |
364 | 4 | FeatureMatcherSetI fm = FeatureMatcherSet.fromString(tokens[1]); |
365 | 4 | if (fm != null && filters != null) |
366 | { | |
367 | 2 | filters.put(featureType, fm); |
368 | } | |
369 | } | |
370 | } | |
371 | } | |
372 | ||
373 | /** | |
374 | * Try to parse a Jalview format feature specification and add it as a | |
375 | * sequence feature to any matching sequences in the alignment. Returns true | |
376 | * if successful (a feature was added), or false if not. | |
377 | * | |
378 | * @param line | |
379 | * @param gffColumns | |
380 | * @param alignment | |
381 | * @param featureColours | |
382 | * @param removeHTML | |
383 | * @param relaxedIdmatching | |
384 | * @param featureGroup | |
385 | */ | |
386 | 540 | protected boolean parseJalviewFeature(String line, String[] gffColumns, |
387 | AlignmentI alignment, Map<String, FeatureColourI> featureColours, | |
388 | boolean removeHTML, boolean relaxedIdMatching, | |
389 | String featureGroup) | |
390 | { | |
391 | /* | |
392 | * tokens: description seqid seqIndex start end type [score] | |
393 | */ | |
394 | 540 | if (gffColumns.length < 6) |
395 | { | |
396 | 0 | jalview.bin.Console.errPrintln("Ignoring feature line '" + line |
397 | + "' with too few columns (" + gffColumns.length + ")"); | |
398 | 0 | return false; |
399 | } | |
400 | 540 | String desc = gffColumns[0]; |
401 | 540 | String seqId = gffColumns[1]; |
402 | 540 | SequenceI seq = findSequence(seqId, alignment, null, relaxedIdMatching); |
403 | ||
404 | 540 | if (!ID_NOT_SPECIFIED.equals(seqId)) |
405 | { | |
406 | 539 | seq = findSequence(seqId, alignment, null, relaxedIdMatching); |
407 | } | |
408 | else | |
409 | { | |
410 | 1 | seqId = null; |
411 | 1 | seq = null; |
412 | 1 | String seqIndex = gffColumns[2]; |
413 | 1 | try |
414 | { | |
415 | 1 | int idx = Integer.parseInt(seqIndex); |
416 | 1 | seq = alignment.getSequenceAt(idx); |
417 | } catch (NumberFormatException ex) | |
418 | { | |
419 | 0 | jalview.bin.Console |
420 | .errPrintln("Invalid sequence index: " + seqIndex); | |
421 | } | |
422 | } | |
423 | ||
424 | 540 | if (seq == null) |
425 | { | |
426 | 0 | jalview.bin.Console.outPrintln("Sequence not found: " + line); |
427 | 0 | return false; |
428 | } | |
429 | ||
430 | 540 | int startPos = Integer.parseInt(gffColumns[3]); |
431 | 540 | int endPos = Integer.parseInt(gffColumns[4]); |
432 | ||
433 | 540 | String ft = gffColumns[5]; |
434 | ||
435 | 540 | if (!featureColours.containsKey(ft)) |
436 | { | |
437 | /* | |
438 | * Perhaps an old style groups file with no colours - | |
439 | * synthesize a colour from the feature type | |
440 | */ | |
441 | 3 | Color colour = ColorUtils.createColourFromName(ft); |
442 | 3 | featureColours.put(ft, new FeatureColour(colour)); |
443 | } | |
444 | 540 | SequenceFeature sf = null; |
445 | 540 | if (gffColumns.length > 6) |
446 | { | |
447 | 48 | float score = Float.NaN; |
448 | 48 | try |
449 | { | |
450 | 48 | score = Float.valueOf(gffColumns[6]).floatValue(); |
451 | } catch (NumberFormatException ex) | |
452 | { | |
453 | 0 | sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); |
454 | } | |
455 | 48 | sf = new SequenceFeature(ft, desc, startPos, endPos, score, |
456 | featureGroup); | |
457 | } | |
458 | else | |
459 | { | |
460 | 492 | sf = new SequenceFeature(ft, desc, startPos, endPos, featureGroup); |
461 | } | |
462 | ||
463 | 540 | parseDescriptionHTML(sf, removeHTML); |
464 | ||
465 | 540 | seq.addSequenceFeature(sf); |
466 | ||
467 | ? | while (seqId != null |
468 | && (seq = alignment.findName(seq, seqId, false)) != null) | |
469 | { | |
470 | 0 | seq.addSequenceFeature(new SequenceFeature(sf)); |
471 | } | |
472 | 540 | return true; |
473 | } | |
474 | ||
475 | /** | |
476 | * clear any temporary handles used to speed up ID matching | |
477 | */ | |
478 | 13 | protected void resetMatcher() |
479 | { | |
480 | 13 | lastmatchedAl = null; |
481 | 13 | matcher = null; |
482 | } | |
483 | ||
484 | /** | |
485 | * Returns a sequence matching the given id, as follows | |
486 | * <ul> | |
487 | * <li>strict matching is on exact sequence name</li> | |
488 | * <li>relaxed matching allows matching on a token within the sequence name, | |
489 | * or a dbxref</li> | |
490 | * <li>first tries to find a match in the alignment sequences</li> | |
491 | * <li>else tries to find a match in the new sequences already generated while | |
492 | * parsing the features file</li> | |
493 | * <li>else creates a new placeholder sequence, adds it to the new sequences | |
494 | * list, and returns it</li> | |
495 | * </ul> | |
496 | * | |
497 | * @param seqId | |
498 | * @param align | |
499 | * @param newseqs | |
500 | * @param relaxedIdMatching | |
501 | * | |
502 | * @return | |
503 | */ | |
504 | 1105 | protected SequenceI findSequence(String seqId, AlignmentI align, |
505 | List<SequenceI> newseqs, boolean relaxedIdMatching) | |
506 | { | |
507 | // TODO encapsulate in SequenceIdMatcher, share the matcher | |
508 | // with the GffHelper (removing code duplication) | |
509 | 1105 | SequenceI match = null; |
510 | 1105 | if (relaxedIdMatching) |
511 | { | |
512 | 12 | if (lastmatchedAl != align) |
513 | { | |
514 | 3 | lastmatchedAl = align; |
515 | 3 | matcher = new SequenceIdMatcher(align.getSequencesArray()); |
516 | 3 | if (newseqs != null) |
517 | { | |
518 | 3 | matcher.addAll(newseqs); |
519 | } | |
520 | } | |
521 | 12 | match = matcher.findIdMatch(seqId); |
522 | } | |
523 | else | |
524 | { | |
525 | 1093 | match = align.findName(seqId, true); |
526 | 1093 | if (match == null && newseqs != null) |
527 | { | |
528 | 9 | for (SequenceI m : newseqs) |
529 | { | |
530 | 7 | if (seqId.equals(m.getName())) |
531 | { | |
532 | 7 | return m; |
533 | } | |
534 | } | |
535 | } | |
536 | ||
537 | } | |
538 | 1098 | if (match == null && newseqs != null) |
539 | { | |
540 | 5 | match = new SequenceDummy(seqId); |
541 | 5 | if (relaxedIdMatching) |
542 | { | |
543 | 3 | matcher.addAll(Arrays.asList(new SequenceI[] { match })); |
544 | } | |
545 | // add dummy sequence to the newseqs list | |
546 | 5 | newseqs.add(match); |
547 | } | |
548 | 1098 | return match; |
549 | } | |
550 | ||
551 | 540 | public void parseDescriptionHTML(SequenceFeature sf, boolean removeHTML) |
552 | { | |
553 | 540 | if (sf.getDescription() == null) |
554 | { | |
555 | 0 | return; |
556 | } | |
557 | 540 | ParseHtmlBodyAndLinks parsed = new ParseHtmlBodyAndLinks( |
558 | sf.getDescription(), removeHTML, newline); | |
559 | ||
560 | 540 | if (removeHTML) |
561 | { | |
562 | 326 | sf.setDescription(parsed.getNonHtmlContent()); |
563 | } | |
564 | ||
565 | 540 | for (String link : parsed.getLinks()) |
566 | { | |
567 | 136 | sf.addLink(link); |
568 | } | |
569 | } | |
570 | ||
571 | /** | |
572 | * Returns contents of a Jalview format features file, for visible features, | |
573 | * as filtered by type and group. Features with a null group are displayed if | |
574 | * their feature type is visible. Non-positional features may optionally be | |
575 | * included (with no check on type or group). | |
576 | * | |
577 | * @param sequences | |
578 | * @param fr | |
579 | * @param includeNonPositional | |
580 | * if true, include non-positional features (regardless of group or | |
581 | * type) | |
582 | * @param includeComplement | |
583 | * if true, include visible complementary (CDS/protein) positional | |
584 | * features, with locations converted to local sequence coordinates | |
585 | * @return | |
586 | */ | |
587 | 10 | public String printJalviewFormat(SequenceI[] sequences, |
588 | FeatureRenderer fr, boolean includeNonPositional, | |
589 | boolean includeComplement) | |
590 | { | |
591 | 10 | Map<String, FeatureColourI> visibleColours = fr |
592 | .getDisplayedFeatureCols(); | |
593 | 10 | Map<String, FeatureMatcherSetI> featureFilters = fr.getFeatureFilters(); |
594 | ||
595 | /* | |
596 | * write out feature colours (if we know them) | |
597 | */ | |
598 | // TODO: decide if feature links should also be written here ? | |
599 | 10 | StringBuilder out = new StringBuilder(256); |
600 | 10 | if (visibleColours != null) |
601 | { | |
602 | 10 | for (Entry<String, FeatureColourI> featureColour : visibleColours |
603 | .entrySet()) | |
604 | { | |
605 | 15 | FeatureColourI colour = featureColour.getValue(); |
606 | 15 | out.append(colour.toJalviewFormat(featureColour.getKey())) |
607 | .append(newline); | |
608 | } | |
609 | } | |
610 | ||
611 | 10 | String[] types = visibleColours == null ? new String[0] |
612 | : visibleColours.keySet() | |
613 | .toArray(new String[visibleColours.keySet().size()]); | |
614 | ||
615 | /* | |
616 | * feature filters if any | |
617 | */ | |
618 | 10 | outputFeatureFilters(out, visibleColours, featureFilters); |
619 | ||
620 | /* | |
621 | * output features within groups | |
622 | */ | |
623 | 10 | int count = outputFeaturesByGroup(out, fr, types, sequences, |
624 | includeNonPositional); | |
625 | ||
626 | 10 | if (includeComplement) |
627 | { | |
628 | 0 | count += outputComplementFeatures(out, fr, sequences); |
629 | } | |
630 | ||
631 | 10 | return count > 0 ? out.toString() : "No Features Visible"; |
632 | } | |
633 | ||
634 | /** | |
635 | * Outputs any visible complementary (CDS/peptide) positional features as | |
636 | * Jalview format, within feature group. The coordinates of the linked | |
637 | * features are converted to the corresponding positions of the local | |
638 | * sequences. | |
639 | * | |
640 | * @param out | |
641 | * @param fr | |
642 | * @param sequences | |
643 | * @return | |
644 | */ | |
645 | 0 | private int outputComplementFeatures(StringBuilder out, |
646 | FeatureRenderer fr, SequenceI[] sequences) | |
647 | { | |
648 | 0 | AlignViewportI comp = fr.getViewport().getCodingComplement(); |
649 | 0 | FeatureRenderer fr2 = Desktop.getAlignFrameFor(comp) |
650 | .getFeatureRenderer(); | |
651 | ||
652 | /* | |
653 | * bin features by feature group and sequence | |
654 | */ | |
655 | 0 | Map<String, Map<String, List<SequenceFeature>>> map = new TreeMap<>( |
656 | String.CASE_INSENSITIVE_ORDER); | |
657 | 0 | int count = 0; |
658 | ||
659 | 0 | for (SequenceI seq : sequences) |
660 | { | |
661 | /* | |
662 | * find complementary features | |
663 | */ | |
664 | 0 | List<SequenceFeature> complementary = findComplementaryFeatures(seq, |
665 | fr2); | |
666 | 0 | String seqName = seq.getName(); |
667 | ||
668 | 0 | for (SequenceFeature sf : complementary) |
669 | { | |
670 | 0 | String group = sf.getFeatureGroup(); |
671 | 0 | if (!map.containsKey(group)) |
672 | { | |
673 | 0 | map.put(group, new LinkedHashMap<>()); // preserves sequence order |
674 | } | |
675 | 0 | Map<String, List<SequenceFeature>> groupFeatures = map.get(group); |
676 | 0 | if (!groupFeatures.containsKey(seqName)) |
677 | { | |
678 | 0 | groupFeatures.put(seqName, new ArrayList<>()); |
679 | } | |
680 | 0 | List<SequenceFeature> foundFeatures = groupFeatures.get(seqName); |
681 | 0 | foundFeatures.add(sf); |
682 | 0 | count++; |
683 | } | |
684 | } | |
685 | ||
686 | /* | |
687 | * output features by group | |
688 | */ | |
689 | 0 | for (Entry<String, Map<String, List<SequenceFeature>>> groupFeatures : map |
690 | .entrySet()) | |
691 | { | |
692 | 0 | out.append(newline); |
693 | 0 | String group = groupFeatures.getKey(); |
694 | 0 | if (!"".equals(group)) |
695 | { | |
696 | 0 | out.append(STARTGROUP).append(TAB).append(group).append(newline); |
697 | } | |
698 | 0 | Map<String, List<SequenceFeature>> seqFeaturesMap = groupFeatures |
699 | .getValue(); | |
700 | 0 | for (Entry<String, List<SequenceFeature>> seqFeatures : seqFeaturesMap |
701 | .entrySet()) | |
702 | { | |
703 | 0 | String sequenceName = seqFeatures.getKey(); |
704 | 0 | for (SequenceFeature sf : seqFeatures.getValue()) |
705 | { | |
706 | 0 | formatJalviewFeature(out, sequenceName, sf); |
707 | } | |
708 | } | |
709 | 0 | if (!"".equals(group)) |
710 | { | |
711 | 0 | out.append(ENDGROUP).append(TAB).append(group).append(newline); |
712 | } | |
713 | } | |
714 | ||
715 | 0 | return count; |
716 | } | |
717 | ||
718 | /** | |
719 | * Answers a list of mapped features visible in the (CDS/protein) complement, | |
720 | * with feature positions translated to local sequence coordinates | |
721 | * | |
722 | * @param seq | |
723 | * @param fr2 | |
724 | * @return | |
725 | */ | |
726 | 0 | protected List<SequenceFeature> findComplementaryFeatures(SequenceI seq, |
727 | FeatureRenderer fr2) | |
728 | { | |
729 | /* | |
730 | * avoid duplication of features (e.g. peptide feature | |
731 | * at all 3 mapped codon positions) | |
732 | */ | |
733 | 0 | List<SequenceFeature> found = new ArrayList<>(); |
734 | 0 | List<SequenceFeature> complementary = new ArrayList<>(); |
735 | ||
736 | 0 | for (int pos = seq.getStart(); pos <= seq.getEnd(); pos++) |
737 | { | |
738 | 0 | MappedFeatures mf = fr2.findComplementFeaturesAtResidue(seq, pos); |
739 | ||
740 | 0 | if (mf != null) |
741 | { | |
742 | 0 | for (SequenceFeature sf : mf.features) |
743 | { | |
744 | /* | |
745 | * make a virtual feature with local coordinates | |
746 | */ | |
747 | 0 | if (!found.contains(sf)) |
748 | { | |
749 | 0 | String group = sf.getFeatureGroup(); |
750 | 0 | if (group == null) |
751 | { | |
752 | 0 | group = ""; |
753 | } | |
754 | 0 | found.add(sf); |
755 | 0 | int begin = sf.getBegin(); |
756 | 0 | int end = sf.getEnd(); |
757 | 0 | int[] range = mf.getMappedPositions(begin, end); |
758 | 0 | SequenceFeature sf2 = new SequenceFeature(sf, range[0], |
759 | range[1], group, sf.getScore()); | |
760 | 0 | complementary.add(sf2); |
761 | } | |
762 | } | |
763 | } | |
764 | } | |
765 | ||
766 | 0 | return complementary; |
767 | } | |
768 | ||
769 | /** | |
770 | * Outputs any feature filters defined for visible feature types, sandwiched | |
771 | * by STARTFILTERS and ENDFILTERS lines | |
772 | * | |
773 | * @param out | |
774 | * @param visible | |
775 | * @param featureFilters | |
776 | */ | |
777 | 13 | void outputFeatureFilters(StringBuilder out, |
778 | Map<String, FeatureColourI> visible, | |
779 | Map<String, FeatureMatcherSetI> featureFilters) | |
780 | { | |
781 | 13 | if (visible == null || featureFilters == null |
782 | || featureFilters.isEmpty()) | |
783 | { | |
784 | 10 | return; |
785 | } | |
786 | ||
787 | 3 | boolean first = true; |
788 | 3 | for (String featureType : visible.keySet()) |
789 | { | |
790 | 4 | FeatureMatcherSetI filter = featureFilters.get(featureType); |
791 | 4 | if (filter != null) |
792 | { | |
793 | 3 | if (first) |
794 | { | |
795 | 2 | first = false; |
796 | 2 | out.append(newline).append(STARTFILTERS).append(newline); |
797 | } | |
798 | 3 | out.append(featureType).append(TAB).append(filter.toStableString()) |
799 | .append(newline); | |
800 | } | |
801 | } | |
802 | 3 | if (!first) |
803 | { | |
804 | 2 | out.append(ENDFILTERS).append(newline); |
805 | } | |
806 | ||
807 | } | |
808 | ||
809 | /** | |
810 | * Appends output of visible sequence features within feature groups to the | |
811 | * output buffer. Groups other than the null or empty group are sandwiched by | |
812 | * STARTGROUP and ENDGROUP lines. Answers the number of features written. | |
813 | * | |
814 | * @param out | |
815 | * @param fr | |
816 | * @param featureTypes | |
817 | * @param sequences | |
818 | * @param includeNonPositional | |
819 | * @return | |
820 | */ | |
821 | 10 | private int outputFeaturesByGroup(StringBuilder out, FeatureRenderer fr, |
822 | String[] featureTypes, SequenceI[] sequences, | |
823 | boolean includeNonPositional) | |
824 | { | |
825 | 10 | List<String> featureGroups = fr.getFeatureGroups(); |
826 | ||
827 | /* | |
828 | * sort groups alphabetically, and ensure that features with a | |
829 | * null or empty group are output after those in named groups | |
830 | */ | |
831 | 10 | List<String> sortedGroups = new ArrayList<>(featureGroups); |
832 | 10 | sortedGroups.remove(null); |
833 | 10 | sortedGroups.remove(""); |
834 | 10 | Collections.sort(sortedGroups); |
835 | 10 | sortedGroups.add(null); |
836 | 10 | sortedGroups.add(""); |
837 | ||
838 | 10 | int count = 0; |
839 | 10 | List<String> visibleGroups = fr.getDisplayedFeatureGroups(); |
840 | ||
841 | /* | |
842 | * loop over all groups (may be visible or not); | |
843 | * non-positional features are output even if group is not visible | |
844 | */ | |
845 | 10 | for (String group : sortedGroups) |
846 | { | |
847 | 33 | boolean firstInGroup = true; |
848 | 33 | boolean isNullGroup = group == null || "".equals(group); |
849 | ||
850 | 528 | for (int i = 0; i < sequences.length; i++) |
851 | { | |
852 | 495 | String sequenceName = sequences[i].getName(); |
853 | 495 | List<SequenceFeature> features = new ArrayList<>(); |
854 | ||
855 | /* | |
856 | * get any non-positional features in this group, if wanted | |
857 | * (for any feature type, whether visible or not) | |
858 | */ | |
859 | 495 | if (includeNonPositional) |
860 | { | |
861 | 90 | features.addAll(sequences[i].getFeatures() |
862 | .getFeaturesForGroup(false, group)); | |
863 | } | |
864 | ||
865 | /* | |
866 | * add positional features for visible feature types, but | |
867 | * (for named groups) only if feature group is visible | |
868 | */ | |
869 | 495 | if (featureTypes.length > 0 |
870 | && (isNullGroup || visibleGroups.contains(group))) | |
871 | { | |
872 | 390 | features.addAll(sequences[i].getFeatures() |
873 | .getFeaturesForGroup(true, group, featureTypes)); | |
874 | } | |
875 | ||
876 | 495 | for (SequenceFeature sf : features) |
877 | { | |
878 | 25 | if (sf.isNonPositional() || fr.isVisible(sf)) |
879 | { | |
880 | 23 | count++; |
881 | 23 | if (firstInGroup) |
882 | { | |
883 | 18 | out.append(newline); |
884 | 18 | if (!isNullGroup) |
885 | { | |
886 | 10 | out.append(STARTGROUP).append(TAB).append(group) |
887 | .append(newline); | |
888 | } | |
889 | } | |
890 | 23 | firstInGroup = false; |
891 | 23 | formatJalviewFeature(out, sequenceName, sf); |
892 | } | |
893 | } | |
894 | } | |
895 | ||
896 | 33 | if (!isNullGroup && !firstInGroup) |
897 | { | |
898 | 10 | out.append(ENDGROUP).append(TAB).append(group).append(newline); |
899 | } | |
900 | } | |
901 | 10 | return count; |
902 | } | |
903 | ||
904 | /** | |
905 | * Formats one feature in Jalview format and appends to the string buffer | |
906 | * | |
907 | * @param out | |
908 | * @param sequenceName | |
909 | * @param sequenceFeature | |
910 | */ | |
911 | 23 | protected void formatJalviewFeature(StringBuilder out, |
912 | String sequenceName, SequenceFeature sequenceFeature) | |
913 | { | |
914 | 23 | if (sequenceFeature.description == null |
915 | || sequenceFeature.description.equals("")) | |
916 | { | |
917 | 0 | out.append(sequenceFeature.type).append(TAB); |
918 | } | |
919 | else | |
920 | { | |
921 | 23 | if (sequenceFeature.links != null |
922 | && sequenceFeature.getDescription().indexOf("<html>") == -1) | |
923 | { | |
924 | 0 | out.append("<html>"); |
925 | } | |
926 | ||
927 | 23 | out.append(sequenceFeature.description); |
928 | 23 | if (sequenceFeature.links != null) |
929 | { | |
930 | 2 | for (int l = 0; l < sequenceFeature.links.size(); l++) |
931 | { | |
932 | 1 | String label = sequenceFeature.links.elementAt(l); |
933 | 1 | String href = label.substring(label.indexOf("|") + 1); |
934 | 1 | label = label.substring(0, label.indexOf("|")); |
935 | ||
936 | 1 | if (sequenceFeature.description.indexOf(href) == -1) |
937 | { | |
938 | 0 | out.append(" <a href=\"").append(href).append("\">") |
939 | .append(label).append("</a>"); | |
940 | } | |
941 | } | |
942 | ||
943 | 1 | if (sequenceFeature.getDescription().indexOf("</html>") == -1) |
944 | { | |
945 | 0 | out.append("</html>"); |
946 | } | |
947 | } | |
948 | ||
949 | 23 | out.append(TAB); |
950 | } | |
951 | 23 | out.append(sequenceName); |
952 | 23 | out.append("\t-1\t"); |
953 | 23 | out.append(sequenceFeature.begin); |
954 | 23 | out.append(TAB); |
955 | 23 | out.append(sequenceFeature.end); |
956 | 23 | out.append(TAB); |
957 | 23 | out.append(sequenceFeature.type); |
958 | 23 | if (!Float.isNaN(sequenceFeature.score)) |
959 | { | |
960 | 18 | out.append(TAB); |
961 | 18 | out.append(sequenceFeature.score); |
962 | } | |
963 | 23 | out.append(newline); |
964 | } | |
965 | ||
966 | /** | |
967 | * Parse method that is called when a GFF file is dragged to the desktop | |
968 | */ | |
969 | 2 | @Override |
970 | public void parse() | |
971 | { | |
972 | 2 | AlignViewportI av = getViewport(); |
973 | 2 | if (av != null) |
974 | { | |
975 | 0 | if (av.getAlignment() != null) |
976 | { | |
977 | 0 | dataset = av.getAlignment().getDataset(); |
978 | } | |
979 | 0 | if (dataset == null) |
980 | { | |
981 | // working in the applet context ? | |
982 | 0 | dataset = av.getAlignment(); |
983 | } | |
984 | } | |
985 | else | |
986 | { | |
987 | 2 | dataset = new Alignment(new SequenceI[] {}); |
988 | } | |
989 | ||
990 | 2 | Map<String, FeatureColourI> featureColours = new HashMap<>(); |
991 | 2 | boolean parseResult = parse(dataset, featureColours, false, true); |
992 | 2 | if (!parseResult) |
993 | { | |
994 | // pass error up somehow | |
995 | } | |
996 | 2 | if (av != null) |
997 | { | |
998 | // update viewport with the dataset data ? | |
999 | } | |
1000 | else | |
1001 | { | |
1002 | 2 | setSeqs(dataset.getSequencesArray()); |
1003 | } | |
1004 | } | |
1005 | ||
1006 | /** | |
1007 | * Implementation of unused abstract method | |
1008 | * | |
1009 | * @return error message | |
1010 | */ | |
1011 | 0 | @Override |
1012 | public String print(SequenceI[] sqs, boolean jvsuffix) | |
1013 | { | |
1014 | 0 | jalview.bin.Console |
1015 | .outPrintln("Use printGffFormat() or printJalviewFormat()"); | |
1016 | 0 | return null; |
1017 | } | |
1018 | ||
1019 | /** | |
1020 | * Returns features output in GFF2 format | |
1021 | * | |
1022 | * @param sequences | |
1023 | * the sequences whose features are to be output | |
1024 | * @param visible | |
1025 | * a map whose keys are the type names of visible features | |
1026 | * @param visibleFeatureGroups | |
1027 | * @param includeNonPositionalFeatures | |
1028 | * @param includeComplement | |
1029 | * @return | |
1030 | */ | |
1031 | 11 | public String printGffFormat(SequenceI[] sequences, FeatureRenderer fr, |
1032 | boolean includeNonPositionalFeatures, boolean includeComplement) | |
1033 | { | |
1034 | 11 | FeatureRenderer fr2 = null; |
1035 | 11 | if (includeComplement) |
1036 | { | |
1037 | 0 | AlignViewportI comp = fr.getViewport().getCodingComplement(); |
1038 | 0 | fr2 = Desktop.getAlignFrameFor(comp).getFeatureRenderer(); |
1039 | } | |
1040 | ||
1041 | 11 | Map<String, FeatureColourI> visibleColours = fr |
1042 | .getDisplayedFeatureCols(); | |
1043 | ||
1044 | 11 | StringBuilder out = new StringBuilder(256); |
1045 | ||
1046 | 11 | out.append(String.format("%s %d\n", GFF_VERSION, |
1047 | 11 | gffVersion == 0 ? 2 : gffVersion)); |
1048 | ||
1049 | 11 | String[] types = visibleColours == null ? new String[0] |
1050 | : visibleColours.keySet() | |
1051 | .toArray(new String[visibleColours.keySet().size()]); | |
1052 | ||
1053 | 11 | for (SequenceI seq : sequences) |
1054 | { | |
1055 | 165 | List<SequenceFeature> seqFeatures = new ArrayList<>(); |
1056 | 165 | List<SequenceFeature> features = new ArrayList<>(); |
1057 | 165 | if (includeNonPositionalFeatures) |
1058 | { | |
1059 | 30 | features.addAll(seq.getFeatures().getNonPositionalFeatures()); |
1060 | } | |
1061 | 165 | if (visibleColours != null && !visibleColours.isEmpty()) |
1062 | { | |
1063 | 105 | features.addAll(seq.getFeatures().getPositionalFeatures(types)); |
1064 | } | |
1065 | 165 | for (SequenceFeature sf : features) |
1066 | { | |
1067 | 16 | if (sf.isNonPositional() || fr.isVisible(sf)) |
1068 | { | |
1069 | /* | |
1070 | * drop features hidden by group visibility, colour threshold, | |
1071 | * or feature filter condition | |
1072 | */ | |
1073 | 13 | seqFeatures.add(sf); |
1074 | } | |
1075 | } | |
1076 | ||
1077 | 165 | if (includeComplement) |
1078 | { | |
1079 | 0 | seqFeatures.addAll(findComplementaryFeatures(seq, fr2)); |
1080 | } | |
1081 | ||
1082 | /* | |
1083 | * sort features here if wanted | |
1084 | */ | |
1085 | 165 | for (SequenceFeature sf : seqFeatures) |
1086 | { | |
1087 | 13 | formatGffFeature(out, seq, sf); |
1088 | 13 | out.append(newline); |
1089 | } | |
1090 | } | |
1091 | ||
1092 | 11 | return out.toString(); |
1093 | } | |
1094 | ||
1095 | /** | |
1096 | * Formats one feature as GFF and appends to the string buffer | |
1097 | */ | |
1098 | 13 | private void formatGffFeature(StringBuilder out, SequenceI seq, |
1099 | SequenceFeature sf) | |
1100 | { | |
1101 | 13 | String source = sf.featureGroup; |
1102 | 13 | if (source == null) |
1103 | { | |
1104 | 9 | source = sf.getDescription(); |
1105 | } | |
1106 | ||
1107 | 13 | out.append(seq.getName()); |
1108 | 13 | out.append(TAB); |
1109 | 13 | out.append(source); |
1110 | 13 | out.append(TAB); |
1111 | 13 | out.append(sf.type); |
1112 | 13 | out.append(TAB); |
1113 | 13 | out.append(sf.begin); |
1114 | 13 | out.append(TAB); |
1115 | 13 | out.append(sf.end); |
1116 | 13 | out.append(TAB); |
1117 | 13 | out.append(sf.score); |
1118 | 13 | out.append(TAB); |
1119 | ||
1120 | 13 | int strand = sf.getStrand(); |
1121 | 12 | out.append(strand == 1 ? "+" : (strand == -1 ? "-" : ".")); |
1122 | 13 | out.append(TAB); |
1123 | ||
1124 | 13 | String phase = sf.getPhase(); |
1125 | 13 | out.append(phase == null ? "." : phase); |
1126 | ||
1127 | 13 | if (sf.otherDetails != null && !sf.otherDetails.isEmpty()) |
1128 | { | |
1129 | 7 | Map<String, Object> map = sf.otherDetails; |
1130 | 7 | formatAttributes(out, map); |
1131 | } | |
1132 | } | |
1133 | ||
1134 | /** | |
1135 | * A helper method that outputs attributes stored in the map as | |
1136 | * semicolon-delimited values e.g. | |
1137 | * | |
1138 | * <pre> | |
1139 | * AC_Male=0;AF_NFE=0.00000e 00;Hom_FIN=0;GQ_MEDIAN=9 | |
1140 | * </pre> | |
1141 | * | |
1142 | * A map-valued attribute is formatted as a comma-delimited list within | |
1143 | * braces, for example | |
1144 | * | |
1145 | * <pre> | |
1146 | * jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561} | |
1147 | * </pre> | |
1148 | * | |
1149 | * The {@code jvmap_} prefix designates a values map and is removed if the | |
1150 | * value is parsed when read in. (The GFF3 specification allows | |
1151 | * 'semi-structured data' to be represented provided the attribute name begins | |
1152 | * with a lower case letter.) | |
1153 | * | |
1154 | * @param sb | |
1155 | * @param map | |
1156 | * @see http://gmod.org/wiki/GFF3#GFF3_Format | |
1157 | */ | |
1158 | 7 | void formatAttributes(StringBuilder sb, Map<String, Object> map) |
1159 | { | |
1160 | 7 | sb.append(TAB); |
1161 | 7 | boolean first = true; |
1162 | 7 | for (String key : map.keySet()) |
1163 | { | |
1164 | 17 | if (SequenceFeature.STRAND.equals(key) |
1165 | || SequenceFeature.PHASE.equals(key)) | |
1166 | { | |
1167 | /* | |
1168 | * values stashed in map but output to their own columns | |
1169 | */ | |
1170 | 2 | continue; |
1171 | } | |
1172 | { | |
1173 | 15 | if (!first) |
1174 | { | |
1175 | 8 | sb.append(";"); |
1176 | } | |
1177 | } | |
1178 | 15 | first = false; |
1179 | 15 | Object value = map.get(key); |
1180 | 15 | if (value instanceof Map<?, ?>) |
1181 | { | |
1182 | 1 | formatMapAttribute(sb, key, (Map<?, ?>) value); |
1183 | } | |
1184 | else | |
1185 | { | |
1186 | 14 | String formatted = StringUtils.urlEncode(value.toString(), |
1187 | GffHelperI.GFF_ENCODABLE); | |
1188 | 14 | sb.append(key).append(EQUALS).append(formatted); |
1189 | } | |
1190 | } | |
1191 | } | |
1192 | ||
1193 | /** | |
1194 | * Formats the map entries as | |
1195 | * | |
1196 | * <pre> | |
1197 | * key=key1=value1,key2=value2,... | |
1198 | * </pre> | |
1199 | * | |
1200 | * and appends this to the string buffer | |
1201 | * | |
1202 | * @param sb | |
1203 | * @param key | |
1204 | * @param map | |
1205 | */ | |
1206 | 1 | private void formatMapAttribute(StringBuilder sb, String key, |
1207 | Map<?, ?> map) | |
1208 | { | |
1209 | 1 | if (map == null || map.isEmpty()) |
1210 | { | |
1211 | 0 | return; |
1212 | } | |
1213 | ||
1214 | /* | |
1215 | * AbstractMap.toString would be a shortcut here, but more reliable | |
1216 | * to code the required format in case toString changes in future | |
1217 | */ | |
1218 | 1 | sb.append(key).append(EQUALS); |
1219 | 1 | boolean first = true; |
1220 | 1 | for (Entry<?, ?> entry : map.entrySet()) |
1221 | { | |
1222 | 2 | if (!first) |
1223 | { | |
1224 | 1 | sb.append(","); |
1225 | } | |
1226 | 2 | first = false; |
1227 | 2 | sb.append(entry.getKey().toString()).append(EQUALS); |
1228 | 2 | String formatted = StringUtils.urlEncode(entry.getValue().toString(), |
1229 | GffHelperI.GFF_ENCODABLE); | |
1230 | 2 | sb.append(formatted); |
1231 | } | |
1232 | } | |
1233 | ||
1234 | /** | |
1235 | * Returns a mapping given list of one or more Align descriptors (exonerate | |
1236 | * format) | |
1237 | * | |
1238 | * @param alignedRegions | |
1239 | * a list of "Align fromStart toStart fromCount" | |
1240 | * @param mapIsFromCdna | |
1241 | * if true, 'from' is dna, else 'from' is protein | |
1242 | * @param strand | |
1243 | * either 1 (forward) or -1 (reverse) | |
1244 | * @return | |
1245 | * @throws IOException | |
1246 | */ | |
1247 | 0 | protected MapList constructCodonMappingFromAlign( |
1248 | List<String> alignedRegions, boolean mapIsFromCdna, int strand) | |
1249 | throws IOException | |
1250 | { | |
1251 | 0 | if (strand == 0) |
1252 | { | |
1253 | 0 | throw new IOException( |
1254 | "Invalid strand for a codon mapping (cannot be 0)"); | |
1255 | } | |
1256 | 0 | int regions = alignedRegions.size(); |
1257 | // arrays to hold [start, end] for each aligned region | |
1258 | 0 | int[] fromRanges = new int[regions * 2]; // from dna |
1259 | 0 | int[] toRanges = new int[regions * 2]; // to protein |
1260 | 0 | int fromRangesIndex = 0; |
1261 | 0 | int toRangesIndex = 0; |
1262 | ||
1263 | 0 | for (String range : alignedRegions) |
1264 | { | |
1265 | /* | |
1266 | * Align mapFromStart mapToStart mapFromCount | |
1267 | * e.g. if mapIsFromCdna | |
1268 | * Align 11270 143 120 | |
1269 | * means: | |
1270 | * 120 bases from pos 11270 align to pos 143 in peptide | |
1271 | * if !mapIsFromCdna this would instead be | |
1272 | * Align 143 11270 40 | |
1273 | */ | |
1274 | 0 | String[] tokens = range.split(" "); |
1275 | 0 | if (tokens.length != 3) |
1276 | { | |
1277 | 0 | throw new IOException("Wrong number of fields for Align"); |
1278 | } | |
1279 | 0 | int fromStart = 0; |
1280 | 0 | int toStart = 0; |
1281 | 0 | int fromCount = 0; |
1282 | 0 | try |
1283 | { | |
1284 | 0 | fromStart = Integer.parseInt(tokens[0]); |
1285 | 0 | toStart = Integer.parseInt(tokens[1]); |
1286 | 0 | fromCount = Integer.parseInt(tokens[2]); |
1287 | } catch (NumberFormatException nfe) | |
1288 | { | |
1289 | 0 | throw new IOException( |
1290 | "Invalid number in Align field: " + nfe.getMessage()); | |
1291 | } | |
1292 | ||
1293 | /* | |
1294 | * Jalview always models from dna to protein, so adjust values if the | |
1295 | * GFF mapping is from protein to dna | |
1296 | */ | |
1297 | 0 | if (!mapIsFromCdna) |
1298 | { | |
1299 | 0 | fromCount *= 3; |
1300 | 0 | int temp = fromStart; |
1301 | 0 | fromStart = toStart; |
1302 | 0 | toStart = temp; |
1303 | } | |
1304 | 0 | fromRanges[fromRangesIndex++] = fromStart; |
1305 | 0 | fromRanges[fromRangesIndex++] = fromStart + strand * (fromCount - 1); |
1306 | ||
1307 | /* | |
1308 | * If a codon has an intron gap, there will be contiguous 'toRanges'; | |
1309 | * this is handled for us by the MapList constructor. | |
1310 | * (It is not clear that exonerate ever generates this case) | |
1311 | */ | |
1312 | 0 | toRanges[toRangesIndex++] = toStart; |
1313 | 0 | toRanges[toRangesIndex++] = toStart + (fromCount - 1) / 3; |
1314 | } | |
1315 | ||
1316 | 0 | return new MapList(fromRanges, toRanges, 3, 1); |
1317 | } | |
1318 | ||
1319 | /** | |
1320 | * Parse a GFF format feature. This may include creating a 'dummy' sequence to | |
1321 | * hold the feature, or for its mapped sequence, or both, to be resolved | |
1322 | * either later in the GFF file (##FASTA section), or when the user loads | |
1323 | * additional sequences. | |
1324 | * | |
1325 | * @param gffColumns | |
1326 | * @param alignment | |
1327 | * @param relaxedIdMatching | |
1328 | * @param newseqs | |
1329 | * @return | |
1330 | */ | |
1331 | 26 | protected SequenceI parseGff(String[] gffColumns, AlignmentI alignment, |
1332 | boolean relaxedIdMatching, List<SequenceI> newseqs) | |
1333 | { | |
1334 | /* | |
1335 | * GFF: seqid source type start end score strand phase [attributes] | |
1336 | */ | |
1337 | 26 | if (gffColumns.length < 5) |
1338 | { | |
1339 | 0 | jalview.bin.Console |
1340 | .errPrintln("Ignoring GFF feature line with too few columns (" | |
1341 | + gffColumns.length + ")"); | |
1342 | 0 | return null; |
1343 | } | |
1344 | ||
1345 | /* | |
1346 | * locate referenced sequence in alignment _or_ | |
1347 | * as a forward or external reference (SequenceDummy) | |
1348 | */ | |
1349 | 26 | String seqId = gffColumns[0]; |
1350 | 26 | SequenceI seq = findSequence(seqId, alignment, newseqs, |
1351 | relaxedIdMatching); | |
1352 | ||
1353 | 26 | SequenceFeature sf = null; |
1354 | 26 | GffHelperI helper = GffHelperFactory.getHelper(gffColumns); |
1355 | 26 | if (helper != null) |
1356 | { | |
1357 | 26 | try |
1358 | { | |
1359 | 26 | sf = helper.processGff(seq, gffColumns, alignment, newseqs, |
1360 | relaxedIdMatching); | |
1361 | 26 | if (sf != null) |
1362 | { | |
1363 | 19 | seq.addSequenceFeature(sf); |
1364 | ? | while ((seq = alignment.findName(seq, seqId, true)) != null) |
1365 | { | |
1366 | 0 | seq.addSequenceFeature(new SequenceFeature(sf)); |
1367 | } | |
1368 | } | |
1369 | } catch (IOException e) | |
1370 | { | |
1371 | 0 | jalview.bin.Console |
1372 | .errPrintln("GFF parsing failed with: " + e.getMessage()); | |
1373 | 0 | return null; |
1374 | } | |
1375 | } | |
1376 | ||
1377 | 26 | return seq; |
1378 | } | |
1379 | ||
1380 | /** | |
1381 | * After encountering ##fasta in a GFF3 file, process the remainder of the | |
1382 | * file as FAST sequence data. Any placeholder sequences created during | |
1383 | * feature parsing are updated with the actual sequences. | |
1384 | * | |
1385 | * @param align | |
1386 | * @param newseqs | |
1387 | * @throws IOException | |
1388 | */ | |
1389 | 4 | protected void processAsFasta(AlignmentI align, List<SequenceI> newseqs) |
1390 | throws IOException | |
1391 | { | |
1392 | 4 | try |
1393 | { | |
1394 | 4 | mark(); |
1395 | } catch (IOException q) | |
1396 | { | |
1397 | } | |
1398 | // Opening a FastaFile object with the remainder of this object's dataIn. | |
1399 | // Tell the constructor to NOT close the dataIn when finished. | |
1400 | 4 | FastaFile parser = new FastaFile(this, false); |
1401 | 4 | List<SequenceI> includedseqs = parser.getSeqs(); |
1402 | ||
1403 | 4 | SequenceIdMatcher smatcher = new SequenceIdMatcher(newseqs); |
1404 | ||
1405 | /* | |
1406 | * iterate over includedseqs, and replacing matching ones with newseqs | |
1407 | * sequences. Generic iterator not used here because we modify | |
1408 | * includedseqs as we go | |
1409 | */ | |
1410 | 12 | for (int p = 0, pSize = includedseqs.size(); p < pSize; p++) |
1411 | { | |
1412 | // search for any dummy seqs that this sequence can be used to update | |
1413 | 8 | SequenceI includedSeq = includedseqs.get(p); |
1414 | 8 | SequenceI dummyseq = smatcher.findIdMatch(includedSeq); |
1415 | 8 | if (dummyseq != null && dummyseq instanceof SequenceDummy) |
1416 | { | |
1417 | // probably have the pattern wrong | |
1418 | // idea is that a flyweight proxy for a sequence ID can be created for | |
1419 | // 1. stable reference creation | |
1420 | // 2. addition of annotation | |
1421 | // 3. future replacement by a real sequence | |
1422 | // current pattern is to create SequenceDummy objects - a convenience | |
1423 | // constructor for a Sequence. | |
1424 | // problem is that when promoted to a real sequence, all references | |
1425 | // need to be updated somehow. We avoid that by keeping the same object. | |
1426 | 8 | ((SequenceDummy) dummyseq).become(includedSeq); |
1427 | 8 | dummyseq.createDatasetSequence(); |
1428 | ||
1429 | /* | |
1430 | * Update mappings so they are now to the dataset sequence | |
1431 | */ | |
1432 | 8 | for (AlignedCodonFrame mapping : align.getCodonFrames()) |
1433 | { | |
1434 | 8 | mapping.updateToDataset(dummyseq); |
1435 | } | |
1436 | ||
1437 | /* | |
1438 | * replace parsed sequence with the realised forward reference | |
1439 | */ | |
1440 | 8 | includedseqs.set(p, dummyseq); |
1441 | ||
1442 | /* | |
1443 | * and remove from the newseqs list | |
1444 | */ | |
1445 | 8 | newseqs.remove(dummyseq); |
1446 | } | |
1447 | } | |
1448 | ||
1449 | /* | |
1450 | * finally add sequences to the dataset | |
1451 | */ | |
1452 | 4 | for (SequenceI seq : includedseqs) |
1453 | { | |
1454 | // experimental: mapping-based 'alignment' to query sequence | |
1455 | 8 | AlignmentUtils.alignSequenceAs(seq, align, |
1456 | String.valueOf(align.getGapCharacter()), false, true); | |
1457 | ||
1458 | // rename sequences if GFF handler requested this | |
1459 | // TODO a more elegant way e.g. gffHelper.postProcess(newseqs) ? | |
1460 | 8 | List<SequenceFeature> sfs = seq.getFeatures().getPositionalFeatures(); |
1461 | 8 | if (!sfs.isEmpty()) |
1462 | { | |
1463 | 4 | String newName = (String) sfs.get(0) |
1464 | .getValue(GffHelperI.RENAME_TOKEN); | |
1465 | 4 | if (newName != null) |
1466 | { | |
1467 | 0 | seq.setName(newName); |
1468 | } | |
1469 | } | |
1470 | 8 | align.addSequence(seq); |
1471 | } | |
1472 | } | |
1473 | ||
1474 | /** | |
1475 | * Process a ## directive | |
1476 | * | |
1477 | * @param line | |
1478 | * @param gffProps | |
1479 | * @param align | |
1480 | * @param newseqs | |
1481 | * @throws IOException | |
1482 | */ | |
1483 | 26 | protected void processGffPragma(String line, Map<String, String> gffProps, |
1484 | AlignmentI align, List<SequenceI> newseqs) throws IOException | |
1485 | { | |
1486 | 26 | line = line.trim(); |
1487 | 26 | if ("###".equals(line)) |
1488 | { | |
1489 | // close off any open 'forward references' | |
1490 | 0 | return; |
1491 | } | |
1492 | ||
1493 | 26 | String[] tokens = line.substring(2).split(" "); |
1494 | 26 | String pragma = tokens[0]; |
1495 | 26 | String value = tokens.length == 1 ? null : tokens[1]; |
1496 | ||
1497 | 26 | if ("gff-version".equalsIgnoreCase(pragma)) |
1498 | { | |
1499 | 7 | if (value != null) |
1500 | { | |
1501 | 7 | try |
1502 | { | |
1503 | // value may be e.g. "3.1.2" | |
1504 | 7 | gffVersion = Integer.parseInt(value.split("\\.")[0]); |
1505 | } catch (NumberFormatException e) | |
1506 | { | |
1507 | // ignore | |
1508 | } | |
1509 | } | |
1510 | } | |
1511 | 19 | else if ("sequence-region".equalsIgnoreCase(pragma)) |
1512 | { | |
1513 | // could capture <seqid start end> if wanted here | |
1514 | } | |
1515 | 19 | else if ("feature-ontology".equalsIgnoreCase(pragma)) |
1516 | { | |
1517 | // should resolve against the specified feature ontology URI | |
1518 | } | |
1519 | 19 | else if ("attribute-ontology".equalsIgnoreCase(pragma)) |
1520 | { | |
1521 | // URI of attribute ontology - not currently used in GFF3 | |
1522 | } | |
1523 | 19 | else if ("source-ontology".equalsIgnoreCase(pragma)) |
1524 | { | |
1525 | // URI of source ontology - not currently used in GFF3 | |
1526 | } | |
1527 | 19 | else if ("species-build".equalsIgnoreCase(pragma)) |
1528 | { | |
1529 | // save URI of specific NCBI taxon version of annotations | |
1530 | 0 | gffProps.put("species-build", value); |
1531 | } | |
1532 | 19 | else if ("fasta".equalsIgnoreCase(pragma)) |
1533 | { | |
1534 | // process the rest of the file as a fasta file and replace any dummy | |
1535 | // sequence IDs | |
1536 | 4 | processAsFasta(align, newseqs); |
1537 | } | |
1538 | else | |
1539 | { | |
1540 | 15 | jalview.bin.Console.errPrintln("Ignoring unknown pragma: " + line); |
1541 | } | |
1542 | } | |
1543 | } |