Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
IdentifyFile | 36 | 194 | 96 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import java.io.File; | |
24 | import java.io.FileNotFoundException; | |
25 | import java.io.IOException; | |
26 | import java.util.Locale; | |
27 | ||
28 | import jalview.bin.Console; | |
29 | ||
30 | /** | |
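31 | * Identifies the format of a data source by reading its lines and matching them against known file format signatures. | |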
32 | * | |
33 | * @author $author$ | |
34 | * @version $Revision$ | |
35 | */ | |
36 | public class IdentifyFile | |
37 | { | |
38 | ||
39 | private static final String XMLHEADER = "<?XML VERSION=\"1.0\" ENCODING=\"UTF-8\" STANDALONE=\"YES\"?>"; | |
40 | ||
41 | 2 | public FileFormatI identify(Object file, DataSourceType protocol) |
42 | throws FileFormatException, FileNotFoundException | |
43 | { | |
44 | // BH 2018 | |
45 | 2 | return (file instanceof File ? identify((File) file, protocol) |
46 | : identify((String) file, protocol)); | |
47 | ||
48 | } | |
49 | ||
50 | 6 | public FileFormatI identify(File file, DataSourceType sourceType) |
51 | throws FileFormatException | |
52 | { | |
53 | // BH 2018 | |
54 | 6 | String emessage = "UNIDENTIFIED FILE PARSING ERROR"; |
55 | 6 | FileParse parser = null; |
56 | 6 | try |
57 | { | |
58 | 6 | parser = new FileParse(file, sourceType); |
59 | 6 | if (parser.isValid()) |
60 | { | |
61 | 6 | return identify(parser); |
62 | } | |
63 | } catch (Exception e) | |
64 | { | |
65 | 0 | Console.error("Error whilst identifying " + file, e); |
66 | 0 | emessage = e.getMessage(); |
67 | } | |
68 | 0 | if (parser != null) |
69 | { | |
70 | 0 | throw new FileFormatException(parser.errormessage); |
71 | } | |
72 | 0 | throw new FileFormatException(emessage); |
73 | } | |
74 | ||
75 | /** | |
76 | * Identify a datasource's file content. | |
77 | * | |
78 | * @note Do not use this method for stream sources - create a FileParse object | |
79 | * instead. | |
80 | * | |
81 | * @param file | |
82 | * @param sourceType | |
83 | * @return | |
84 | * @throws FileFormatException | |
85 | */ | |
86 | 410 | public FileFormatI identify(String file, DataSourceType sourceType) |
87 | throws FileFormatException, FileNotFoundException | |
88 | { | |
89 | 410 | String emessage = "UNIDENTIFIED FILE PARSING ERROR"; |
90 | 410 | FileParse parser = null; |
91 | 410 | FileNotFoundException fnf = null; |
92 | 410 | try |
93 | { | |
94 | 410 | parser = new FileParse(file, sourceType); |
95 | 410 | if (parser.isValid()) |
96 | { | |
97 | 410 | return identify(parser); |
98 | } | |
99 | } catch (FileNotFoundException e) | |
100 | { | |
101 | 0 | fnf = e; |
102 | 0 | emessage = "Could not find '" + file + "'"; |
103 | 0 | Console.error("Could not find '" + file + "'", e); |
104 | } catch (IOException e) | |
105 | { | |
106 | 0 | Console.error("Error whilst trying to read " + file, e); |
107 | } catch (Exception e) | |
108 | { | |
109 | 0 | Console.error("Error whilst identifying " + file, e); |
110 | 0 | emessage = e.getMessage(); |
111 | } | |
112 | 0 | if (parser != null) |
113 | { | |
114 | 0 | throw new FileFormatException(parser.errormessage); |
115 | } | |
116 | 0 | if (fnf != null) |
117 | { | |
118 | 0 | throw (fnf); |
119 | } | |
120 | 0 | throw new FileFormatException(emessage); |
121 | } | |
122 | ||
123 | 422 | public FileFormatI identify(FileParse source) throws FileFormatException |
124 | { | |
125 | 422 | return identify(source, true); |
126 | // preserves original behaviour prior to version 2.3 | |
127 | } | |
128 | ||
129 | 0 | public FileFormatI identify(AlignmentFileReaderI file, |
130 | boolean closeSource) throws IOException | |
131 | { | |
132 | 0 | FileParse fp = new FileParse(file.getInFile(), |
133 | file.getDataSourceType()); | |
134 | 0 | return identify(fp, closeSource); |
135 | } | |
136 | ||
137 | /** | |
138 | * Identify contents of source, closing it or resetting source to start | |
139 | * afterwards. | |
140 | * | |
141 | * @param source | |
142 | * @param closeSource | |
143 | * @return (best guess at) file format | |
144 | * @throws FileFormatException | |
145 | */ | |
146 | 422 | public FileFormatI identify(FileParse source, boolean closeSource) |
147 | throws FileFormatException | |
148 | { | |
149 | 422 | FileFormatI reply = FileFormat.Pfam; |
150 | 422 | String data; |
151 | 422 | int bytesRead = 0; |
152 | 422 | int trimmedLength = 0; |
153 | 422 | boolean isXml = false; // set true if first line is XMLHEADER |
154 | 422 | boolean lineswereskipped = false; |
155 | 422 | boolean isBinary = false; // true if length is non-zero and non-printable |
156 | // characters are encountered | |
157 | ||
158 | 422 | try |
159 | { | |
160 | 422 | if (!closeSource) |
161 | { | |
162 | 0 | source.mark(); |
163 | } | |
164 | 422 | boolean aaIndexHeaderRead = false; |
165 | ||
166 | ? | while ((data = source.nextLine()) != null) |
167 | { | |
168 | 527 | bytesRead += data.length(); |
169 | 527 | trimmedLength += data.trim().length(); |
170 | 527 | if (!lineswereskipped) |
171 | { | |
172 | 63124 | for (int i = 0; !isBinary && i < data.length(); i++) |
173 | { | |
174 | 62702 | char c = data.charAt(i); |
175 | 62702 | isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r' |
176 | && c != 5 && c != 27); // nominal binary character filter | |
177 | // excluding CR, LF, tab,DEL and ^E | |
178 | // for certain blast ids | |
179 | } | |
180 | } | |
181 | 527 | if (isBinary) |
182 | { | |
183 | // jar files are special - since they contain all sorts of random | |
184 | // characters. | |
185 | 31 | if (source.inFile != null) |
186 | { | |
187 | 31 | String fileStr = source.inFile.getName(); |
188 | 31 | if (fileStr.contains(".jar") || fileStr.contains(".zip") |
189 | || fileStr.contains(".jvp")) | |
190 | { | |
191 | // possibly a Jalview archive (but check further) | |
192 | 28 | reply = FileFormat.Jalview; |
193 | } | |
194 | } | |
195 | 31 | if (!lineswereskipped && data.startsWith("PK")) |
196 | { | |
197 | 31 | reply = FileFormat.Jalview; // archive |
198 | 31 | break; |
199 | } | |
200 | } | |
201 | 496 | data = data.toUpperCase(Locale.ROOT); |
202 | ||
203 | 496 | if (data.startsWith(ScoreMatrixFile.SCOREMATRIX)) |
204 | { | |
205 | 1 | reply = FileFormat.ScoreMatrix; |
206 | 1 | break; |
207 | } | |
208 | 495 | if (data.startsWith(XMLHEADER) && !lineswereskipped) |
209 | { | |
210 | 2 | isXml = true; |
211 | } | |
212 | 495 | if (data.startsWith("LOCUS")) |
213 | { | |
214 | 1 | reply = FileFormat.GenBank; |
215 | 1 | break; |
216 | } | |
217 | 494 | if (data.startsWith("ID ")) |
218 | { | |
219 | 1 | if (data.substring(2).trim().split(";").length == 7) |
220 | { | |
221 | 1 | reply = FileFormat.Embl; |
222 | 1 | break; |
223 | } | |
224 | } | |
225 | 493 | if (data.startsWith("H ") && !aaIndexHeaderRead) |
226 | { | |
227 | 1 | aaIndexHeaderRead = true; |
228 | } | |
229 | 493 | if (data.startsWith("D ") && aaIndexHeaderRead) |
230 | { | |
231 | 1 | reply = FileFormat.ScoreMatrix; |
232 | 1 | break; |
233 | } | |
234 | 492 | if (data.startsWith("##GFF-VERSION")) |
235 | { | |
236 | // GFF - possibly embedded in a Jalview features file! | |
237 | 4 | reply = FileFormat.Features; |
238 | 4 | break; |
239 | } | |
240 | 488 | if (looksLikeFeatureData(data)) |
241 | { | |
242 | 4 | reply = FileFormat.Features; |
243 | 4 | break; |
244 | } | |
245 | 484 | if (data.indexOf("# STOCKHOLM") > -1) |
246 | { | |
247 | 8 | reply = FileFormat.Stockholm; |
248 | 8 | break; |
249 | } | |
250 | 476 | if (data.indexOf("_ENTRY.ID") > -1 |
251 | || data.indexOf("_AUDIT_AUTHOR.NAME") > -1 | |
252 | || data.indexOf("_ATOM_SITE.") > -1) | |
253 | { | |
254 | 1 | reply = FileFormat.MMCif; |
255 | 1 | break; |
256 | } | |
257 | // if (data.indexOf(">") > -1) | |
258 | 475 | if (data.startsWith(">")) |
259 | { | |
260 | // FASTA, PIR file or BLC file | |
261 | 332 | boolean checkPIR = false, starterm = false; |
262 | 332 | if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1)) |
263 | { | |
264 | // watch for PIR file attributes | |
265 | 1 | checkPIR = true; |
266 | 1 | reply = FileFormat.PIR; |
267 | } | |
268 | // could also be BLC file, read next line to confirm | |
269 | 332 | data = source.nextLine(); |
270 | ||
271 | 332 | if (data.indexOf(">") > -1) |
272 | { | |
273 | 1 | reply = FileFormat.BLC; |
274 | } | |
275 | else | |
276 | { | |
277 | // Is this a single line BLC file? | |
278 | 331 | String data1 = source.nextLine(); |
279 | 331 | String data2 = source.nextLine(); |
280 | 331 | int c1; |
281 | 331 | if (checkPIR) |
282 | { | |
283 | 1 | starterm = (data1 != null && data1.indexOf("*") > -1) |
284 | || (data2 != null && data2.indexOf("*") > -1); | |
285 | } | |
286 | ? | if (data2 != null && (c1 = data.indexOf("*")) > -1) |
287 | { | |
288 | 1 | if (c1 == 0 && c1 == data2.indexOf("*")) |
289 | { | |
290 | 0 | reply = FileFormat.BLC; |
291 | } | |
292 | else | |
293 | { | |
294 | 1 | reply = FileFormat.Fasta; // possibly a bad choice - may be |
295 | // recognised as | |
296 | // PIR | |
297 | } | |
298 | // otherwise can still possibly be a PIR file | |
299 | } | |
300 | else | |
301 | { | |
302 | 330 | reply = FileFormat.Fasta; |
303 | // TODO : AMSA File is indicated if there is annotation in the | |
304 | // FASTA file - but FASTA will automatically generate this at the | |
305 | // mo. | |
306 | 330 | if (!checkPIR) |
307 | { | |
308 | 329 | break; |
309 | } | |
310 | } | |
311 | } | |
312 | // final check for PIR content. require | |
313 | // >P1;title\n<blah>\nterminated sequence to occur at least once. | |
314 | ||
315 | // TODO the PIR/fasta ambiguity may be the use case that is needed to | |
316 | // have | |
317 | // a 'Parse as type XXX' parameter for the applet/application. | |
318 | 3 | if (checkPIR) |
319 | { | |
320 | 1 | String dta = null; |
321 | 1 | if (!starterm) |
322 | { | |
323 | 1 | do |
324 | { | |
325 | 1 | try |
326 | { | |
327 | 1 | dta = source.nextLine(); |
328 | } catch (IOException ex) | |
329 | { | |
330 | } | |
331 | 1 | if (dta != null && dta.indexOf("*") > -1) |
332 | { | |
333 | 1 | starterm = true; |
334 | } | |
335 | 1 | } while (dta != null && !starterm); |
336 | } | |
337 | 1 | if (starterm) |
338 | { | |
339 | 1 | reply = FileFormat.PIR; |
340 | 1 | break; |
341 | } | |
342 | else | |
343 | { | |
344 | 0 | reply = FileFormat.Fasta; // probably a bad choice! |
345 | } | |
346 | } | |
347 | // read as a FASTA (probably) | |
348 | 2 | break; |
349 | } | |
350 | 143 | if (data.indexOf("{\"") > -1) |
351 | { | |
352 | 1 | reply = FileFormat.Json; |
353 | 1 | break; |
354 | } | |
355 | 142 | int lessThan = data.indexOf("<"); |
356 | 142 | if ((lessThan > -1)) // possible Markup Language data i.e HTML, |
357 | // RNAML, XML | |
358 | { | |
359 | 13 | String upper = data.toUpperCase(Locale.ROOT); |
360 | 13 | if (upper.substring(lessThan).startsWith("<HTML")) |
361 | { | |
362 | 1 | reply = FileFormat.Html; |
363 | 1 | break; |
364 | } | |
365 | 12 | if (upper.substring(lessThan).startsWith("<RNAML")) |
366 | { | |
367 | 2 | reply = FileFormat.Rnaml; |
368 | 2 | break; |
369 | } | |
370 | 10 | if (isXml && data.contains( |
371 | "<NS2:JALVIEWUSERCOLOURS SCHEMENAME=\"SEQUENCE FEATURES\" XMLNS:NS2=\"WWW.JALVIEW.ORG/COLOURS\">")) | |
372 | { | |
373 | 2 | reply = FileFormat.FeatureSettings; |
374 | 2 | break; |
375 | } | |
376 | } | |
377 | ||
378 | 137 | if ((data.length() < 1) || (data.indexOf("#") == 0)) |
379 | { | |
380 | 13 | lineswereskipped = true; |
381 | 13 | continue; |
382 | } | |
383 | ||
384 | 124 | if (data.indexOf("PILEUP") > -1) |
385 | { | |
386 | 1 | reply = FileFormat.Pileup; |
387 | ||
388 | 1 | break; |
389 | } | |
390 | ||
391 | 123 | if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data |
392 | .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT ")))) | |
393 | { | |
394 | 1 | reply = FileFormat.MSF; |
395 | ||
396 | 1 | break; |
397 | } | |
398 | 122 | else if (data.indexOf("CLUSTAL") > -1) |
399 | { | |
400 | 1 | reply = FileFormat.Clustal; |
401 | ||
402 | 1 | break; |
403 | } | |
404 | ||
405 | 121 | else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0) |
406 | { | |
407 | 19 | reply = FileFormat.PDB; |
408 | 19 | break; |
409 | } | |
410 | 102 | else if (data.matches("\\s*\\d+\\s+\\d+\\s*")) |
411 | { | |
412 | 1 | reply = FileFormat.Phylip; |
413 | 1 | break; |
414 | } | |
415 | else | |
416 | { | |
417 | 101 | if (!lineswereskipped && looksLikeJnetData(data)) |
418 | { | |
419 | 0 | reply = FileFormat.Jnet; |
420 | 0 | break; |
421 | } | |
422 | } | |
423 | ||
424 | 101 | lineswereskipped = true; // this means there was some junk before any |
425 | // key file signature | |
426 | } | |
427 | 422 | if (closeSource) |
428 | { | |
429 | 422 | source.close(); |
430 | } | |
431 | else | |
432 | { | |
433 | 0 | source.reset(bytesRead); // so the file can be parsed from the mark |
434 | } | |
435 | } catch (Exception ex) | |
436 | { | |
437 | 0 | Console.error("File Identification failed!\n" + ex); |
438 | 0 | throw new FileFormatException(source.errormessage); |
439 | } | |
440 | 422 | if (trimmedLength == 0) |
441 | { | |
442 | 0 | Console.error("File Identification failed! - Empty file was read."); |
443 | 0 | throw new FileFormatException("EMPTY DATA FILE"); |
444 | } | |
445 | 422 | Console.debug("File format identified as " + reply.toString()); |
446 | 422 | return reply; |
447 | } | |
448 | ||
449 | /** | |
450 | * Returns true if the data appears to be Jnet concise annotation format | |
451 | * | |
452 | * @param data | |
453 | * @return | |
454 | */ | |
455 | 16 | protected boolean looksLikeJnetData(String data) |
456 | { | |
457 | 16 | char firstChar = data.charAt(0); |
458 | 16 | int colonPos = data.indexOf(":"); |
459 | 16 | int commaPos = data.indexOf(","); |
460 | 16 | boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1 |
461 | && commaPos > -1 && colonPos < commaPos; | |
462 | // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ?? | |
463 | 16 | return isJnet; |
464 | } | |
465 | ||
466 | /** | |
467 | * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4 | |
468 | * and 5 are integer (start/end) | |
469 | * | |
470 | * @param data | |
471 | * @return | |
472 | */ | |
473 | 495 | protected boolean looksLikeFeatureData(String data) |
474 | { | |
475 | 495 | if (data == null) |
476 | { | |
477 | 1 | return false; |
478 | } | |
479 | 494 | String[] columns = data.split("\t"); |
480 | 494 | if (columns.length < 6) |
481 | { | |
482 | 486 | return false; |
483 | } | |
484 | 21 | for (int col = 3; col < 5; col++) |
485 | { | |
486 | 15 | try |
487 | { | |
488 | 15 | Integer.parseInt(columns[col]); |
489 | } catch (NumberFormatException e) | |
490 | { | |
491 | 2 | return false; |
492 | } | |
493 | } | |
494 | 6 | return true; |
495 | } | |
496 | ||
497 | /** | |
498 | * | |
499 | * @param args | |
500 | * @j2sIgnore | |
501 | */ | |
502 | 0 | public static void main(String[] args) |
503 | { | |
504 | 0 | for (int i = 0; args != null && i < args.length; i++) |
505 | { | |
506 | 0 | IdentifyFile ider = new IdentifyFile(); |
507 | 0 | FileFormatI type = null; |
508 | 0 | try |
509 | { | |
510 | 0 | type = ider.identify(args[i], DataSourceType.FILE); |
511 | } catch (FileNotFoundException e) | |
512 | { | |
513 | 0 | Console.error(String.format("Error '%s' fetching file %s", args[i], |
514 | e.getMessage())); | |
515 | } catch (FileFormatException e) | |
516 | { | |
517 | 0 | Console.error( |
518 | String.format("Error '%s' identifying file type for %s", | |
519 | args[i], e.getMessage())); | |
520 | } | |
521 | 0 | Console.debug("Type of " + args[i] + " is " + type); |
522 | } | |
523 | 0 | if (args == null || args.length == 0) |
524 | { | |
525 | 0 | Console.error("Usage: <Filename> [<Filename> ...]"); |
526 | } | |
527 | } | |
528 | ||
529 | } |