Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
EmblFlatFile | 48 | 53 | 24 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import java.io.IOException; | |
24 | ||
25 | import jalview.bin.Console; | |
26 | import jalview.datamodel.DBRefEntry; | |
27 | import jalview.util.DBRefUtils; | |
28 | ||
29 | /** | |
30 | * A class that provides selective parsing of the EMBL flatfile format. | |
31 | * <p> | |
32 | * The initial implementation is limited to extracting fields used by Jalview | |
33 | * after fetching an EMBL or EMBLCDS entry: | |
34 | * | |
35 | * <pre> | |
36 | * accession, version, sequence, xref | |
37 | * and (for CDS feature) location, protein_id, product, codon_start, translation | |
38 | * </pre> | |
39 | * | |
40 | * For a complete parser, it may be best to adopt that provided in | |
41 | * https://github.com/enasequence/sequencetools/tree/master/src/main/java/uk/ac/ebi/embl/flatfile | |
42 | * (but note this has a dependency on the Apache Commons library) | |
43 | * | |
44 | * @author gmcarstairs | |
45 | * @see ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt | |
46 | * @see ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html | |
47 | */ | |
48 | public class EmblFlatFile extends EMBLLikeFlatFile | |
49 | { | |
50 | /** | |
51 | * Constructor given a data source and the id of the source database | |
52 | * | |
53 | * @param fp | |
54 | * @param sourceId | |
55 | * @throws IOException | |
56 | */ | |
57 | 3 | public EmblFlatFile(FileParse fp, String sourceId) throws IOException |
58 | { | |
59 | 3 | super(fp, sourceId); |
60 | } | |
61 | ||
62 | /** | |
63 | * Parses the flatfile, and if successful, saves as an annotated sequence | |
64 | * which may be retrieved by calling {@code getSequence()} | |
65 | * | |
66 | * @throws IOException | |
67 | */ | |
68 | 3 | @Override |
69 | public void parse() throws IOException | |
70 | { | |
71 | 3 | String line = nextLine(); |
72 | 121 | while (line != null) |
73 | { | |
74 | 118 | if (line.startsWith("ID")) |
75 | { | |
76 | 3 | line = parseID(line); |
77 | } | |
78 | 115 | else if (line.startsWith("DE")) |
79 | { | |
80 | 3 | line = parseDE(line); |
81 | } | |
82 | 112 | else if (line.startsWith("DR")) |
83 | { | |
84 | 8 | line = parseDR(line); |
85 | } | |
86 | 104 | else if (line.startsWith("SQ")) |
87 | { | |
88 | 3 | line = parseSequence(); |
89 | } | |
90 | 101 | else if (line.startsWith("FT")) |
91 | { | |
92 | 23 | line = parseFeature(line.substring(2)); |
93 | } | |
94 | else | |
95 | { | |
96 | 78 | line = nextLine(); |
97 | } | |
98 | } | |
99 | 3 | buildSequence(); |
100 | } | |
101 | ||
102 | /** | |
103 | * Extracts and saves the primary accession and version (SV value) from an ID | |
104 | * line, or null if not found. Returns the next line after the one processed. | |
105 | * | |
106 | * @param line | |
107 | * @throws IOException | |
108 | */ | |
109 | 3 | String parseID(String line) throws IOException |
110 | { | |
111 | 3 | String[] tokens = line.substring(2).split(";"); |
112 | ||
113 | /* | |
114 | * first is primary accession | |
115 | */ | |
116 | 3 | String token = tokens[0].trim(); |
117 | 3 | if (!token.isEmpty()) |
118 | { | |
119 | 3 | this.accession = token; |
120 | } | |
121 | ||
122 | /* | |
123 | * second token is 'SV versionNo' | |
124 | */ | |
125 | 3 | if (tokens.length > 1) |
126 | { | |
127 | 3 | token = tokens[1].trim(); |
128 | 3 | if (token.startsWith("SV")) |
129 | { | |
130 | 3 | String[] bits = token.trim().split(WHITESPACE); |
131 | 3 | this.version = bits[bits.length - 1]; |
132 | } | |
133 | } | |
134 | ||
135 | /* | |
136 | * seventh token is 'length BP' | |
137 | */ | |
138 | 3 | if (tokens.length > 6) |
139 | { | |
140 | 3 | token = tokens[6].trim(); |
141 | 3 | String[] bits = token.trim().split(WHITESPACE); |
142 | 3 | try |
143 | { | |
144 | 3 | this.length = Integer.valueOf(bits[0]); |
145 | } catch (NumberFormatException e) | |
146 | { | |
147 | 0 | Console.error("bad length read in flatfile, line: " + line); |
148 | } | |
149 | } | |
150 | ||
151 | 3 | return nextLine(); |
152 | } | |
153 | ||
154 | /** | |
155 | * Reads sequence description from the first DE line found. Any trailing | |
156 | * period is discarded. If there are multiple DE lines, only the first (short | |
157 | * description) is read, the rest are ignored. | |
158 | * | |
159 | * @param line | |
160 | * @return | |
161 | * @throws IOException | |
162 | */ | |
163 | 3 | String parseDE(String line) throws IOException |
164 | { | |
165 | 3 | String desc = line.substring(2).trim(); |
166 | 3 | if (desc.endsWith(".")) |
167 | { | |
168 | 2 | desc = desc.substring(0, desc.length() - 1); |
169 | } | |
170 | 3 | this.description = desc; |
171 | ||
172 | /* | |
173 | * pass over any additional DE lines | |
174 | */ | |
175 | ? | while ((line = nextLine()) != null) |
176 | { | |
177 | 3 | if (!line.startsWith("DE")) |
178 | { | |
179 | 3 | break; |
180 | } | |
181 | } | |
182 | ||
183 | 3 | return line; |
184 | } | |
185 | ||
186 | /** | |
187 | * Processes one DR line and saves as a DBRefEntry cross-reference. Returns | |
188 | * the line following the line processed. | |
189 | * | |
190 | * @param line | |
191 | * @throws IOException | |
192 | */ | |
193 | 8 | String parseDR(String line) throws IOException |
194 | { | |
195 | 8 | String[] tokens = line.substring(2).split(";"); |
196 | 8 | if (tokens.length > 1) |
197 | { | |
198 | /* | |
199 | * ensure UniProtKB/Swiss-Prot converted to UNIPROT | |
200 | */ | |
201 | 8 | String db = tokens[0].trim(); |
202 | 8 | db = DBRefUtils.getCanonicalName(db); |
203 | 8 | String acc = tokens[1].trim(); |
204 | 8 | if (acc.endsWith(".")) |
205 | { | |
206 | 4 | acc = acc.substring(0, acc.length() - 1); |
207 | } | |
208 | 8 | String version = "0"; |
209 | 8 | if (tokens.length > 2) |
210 | { | |
211 | 4 | String secondaryId = tokens[2].trim(); |
212 | 4 | if (!secondaryId.isEmpty()) |
213 | { | |
214 | // todo: is this right? secondary id is not a version number | |
215 | // version = secondaryId; | |
216 | } | |
217 | } | |
218 | 8 | this.dbrefs.add(new DBRefEntry(db, version, acc)); |
219 | } | |
220 | ||
221 | 8 | return nextLine(); |
222 | } | |
223 | ||
224 | 447 | @Override |
225 | protected boolean isFeatureContinuationLine(String line) | |
226 | { | |
227 | 447 | return line.startsWith("FT "); // 4 spaces |
228 | } | |
229 | } |