1. Project Clover database Fri Dec 6 2024 13:47:14 GMT
  2. Package jalview.io

File IdentifyFile.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
21% of files have more coverage

Code metrics

112
194
9
1
529
407
96
0.49
21.56
9
10.67

Classes

Class
Line #
Actions
IdentifyFile 36 194 96
0.7809524 78.1%
 

Contributing tests

This file is covered by 190 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.File;
24    import java.io.FileNotFoundException;
25    import java.io.IOException;
26    import java.util.Locale;
27   
28    import jalview.bin.Console;
29   
30    /**
31    * Identifies the file format of a data source by inspecting its content.
32    *
33    * @author $author$
34    * @version $Revision$
35    */
 
36    public class IdentifyFile
37    {
38   
39    private static final String XMLHEADER = "<?XML VERSION=\"1.0\" ENCODING=\"UTF-8\" STANDALONE=\"YES\"?>";
40   
 
41  2 toggle public FileFormatI identify(Object file, DataSourceType protocol)
42    throws FileFormatException, FileNotFoundException
43    {
44    // BH 2018
45  2 return (file instanceof File ? identify((File) file, protocol)
46    : identify((String) file, protocol));
47   
48    }
49   
 
50  6 toggle public FileFormatI identify(File file, DataSourceType sourceType)
51    throws FileFormatException
52    {
53    // BH 2018
54  6 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
55  6 FileParse parser = null;
56  6 try
57    {
58  6 parser = new FileParse(file, sourceType);
59  6 if (parser.isValid())
60    {
61  6 return identify(parser);
62    }
63    } catch (Exception e)
64    {
65  0 Console.error("Error whilst identifying " + file, e);
66  0 emessage = e.getMessage();
67    }
68  0 if (parser != null)
69    {
70  0 throw new FileFormatException(parser.errormessage);
71    }
72  0 throw new FileFormatException(emessage);
73    }
74   
75    /**
76    * Identify a datasource's file content.
77    *
78    * @note Do not use this method for stream sources - create a FileParse object
79    * instead.
80    *
81    * @param file
82    * @param sourceType
83    * @return
84    * @throws FileFormatException
85    */
 
86  410 toggle public FileFormatI identify(String file, DataSourceType sourceType)
87    throws FileFormatException, FileNotFoundException
88    {
89  410 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
90  410 FileParse parser = null;
91  410 FileNotFoundException fnf = null;
92  410 try
93    {
94  410 parser = new FileParse(file, sourceType);
95  410 if (parser.isValid())
96    {
97  410 return identify(parser);
98    }
99    } catch (FileNotFoundException e)
100    {
101  0 fnf = e;
102  0 emessage = "Could not find '" + file + "'";
103  0 Console.error("Could not find '" + file + "'", e);
104    } catch (IOException e)
105    {
106  0 Console.error("Error whilst trying to read " + file, e);
107    } catch (Exception e)
108    {
109  0 Console.error("Error whilst identifying " + file, e);
110  0 emessage = e.getMessage();
111    }
112  0 if (parser != null)
113    {
114  0 throw new FileFormatException(parser.errormessage);
115    }
116  0 if (fnf != null)
117    {
118  0 throw (fnf);
119    }
120  0 throw new FileFormatException(emessage);
121    }
122   
 
123  422 toggle public FileFormatI identify(FileParse source) throws FileFormatException
124    {
125  422 return identify(source, true);
126    // preserves original behaviour prior to version 2.3
127    }
128   
 
129  0 toggle public FileFormatI identify(AlignmentFileReaderI file,
130    boolean closeSource) throws IOException
131    {
132  0 FileParse fp = new FileParse(file.getInFile(),
133    file.getDataSourceType());
134  0 return identify(fp, closeSource);
135    }
136   
137    /**
138    * Identify contents of source, closing it or resetting source to start
139    * afterwards.
140    *
141    * @param source
142    * @param closeSource
143    * @return (best guess at) file format
144    * @throws FileFormatException
145    */
 
146  422 toggle public FileFormatI identify(FileParse source, boolean closeSource)
147    throws FileFormatException
148    {
149  422 FileFormatI reply = FileFormat.Pfam;
150  422 String data;
151  422 int bytesRead = 0;
152  422 int trimmedLength = 0;
153  422 boolean isXml = false; // set true if first line is XMLHEADER
154  422 boolean lineswereskipped = false;
155  422 boolean isBinary = false; // true if length is non-zero and non-printable
156    // characters are encountered
157   
158  422 try
159    {
160  422 if (!closeSource)
161    {
162  0 source.mark();
163    }
164  422 boolean aaIndexHeaderRead = false;
165   
166  ? while ((data = source.nextLine()) != null)
167    {
168  527 bytesRead += data.length();
169  527 trimmedLength += data.trim().length();
170  527 if (!lineswereskipped)
171    {
172  63124 for (int i = 0; !isBinary && i < data.length(); i++)
173    {
174  62702 char c = data.charAt(i);
175  62702 isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
176    && c != 5 && c != 27); // nominal binary character filter
177    // excluding CR, LF, tab,DEL and ^E
178    // for certain blast ids
179    }
180    }
181  527 if (isBinary)
182    {
183    // jar files are special - since they contain all sorts of random
184    // characters.
185  31 if (source.inFile != null)
186    {
187  31 String fileStr = source.inFile.getName();
188  31 if (fileStr.contains(".jar") || fileStr.contains(".zip")
189    || fileStr.contains(".jvp"))
190    {
191    // possibly a Jalview archive (but check further)
192  28 reply = FileFormat.Jalview;
193    }
194    }
195  31 if (!lineswereskipped && data.startsWith("PK"))
196    {
197  31 reply = FileFormat.Jalview; // archive
198  31 break;
199    }
200    }
201  496 data = data.toUpperCase(Locale.ROOT);
202   
203  496 if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
204    {
205  1 reply = FileFormat.ScoreMatrix;
206  1 break;
207    }
208  495 if (data.startsWith(XMLHEADER) && !lineswereskipped)
209    {
210  2 isXml = true;
211    }
212  495 if (data.startsWith("LOCUS"))
213    {
214  1 reply = FileFormat.GenBank;
215  1 break;
216    }
217  494 if (data.startsWith("ID "))
218    {
219  1 if (data.substring(2).trim().split(";").length == 7)
220    {
221  1 reply = FileFormat.Embl;
222  1 break;
223    }
224    }
225  493 if (data.startsWith("H ") && !aaIndexHeaderRead)
226    {
227  1 aaIndexHeaderRead = true;
228    }
229  493 if (data.startsWith("D ") && aaIndexHeaderRead)
230    {
231  1 reply = FileFormat.ScoreMatrix;
232  1 break;
233    }
234  492 if (data.startsWith("##GFF-VERSION"))
235    {
236    // GFF - possibly embedded in a Jalview features file!
237  4 reply = FileFormat.Features;
238  4 break;
239    }
240  488 if (looksLikeFeatureData(data))
241    {
242  4 reply = FileFormat.Features;
243  4 break;
244    }
245  484 if (data.indexOf("# STOCKHOLM") > -1)
246    {
247  8 reply = FileFormat.Stockholm;
248  8 break;
249    }
250  476 if (data.indexOf("_ENTRY.ID") > -1
251    || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
252    || data.indexOf("_ATOM_SITE.") > -1)
253    {
254  1 reply = FileFormat.MMCif;
255  1 break;
256    }
257    // if (data.indexOf(">") > -1)
258  475 if (data.startsWith(">"))
259    {
260    // FASTA, PIR file or BLC file
261  332 boolean checkPIR = false, starterm = false;
262  332 if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
263    {
264    // watch for PIR file attributes
265  1 checkPIR = true;
266  1 reply = FileFormat.PIR;
267    }
268    // could also be BLC file, read next line to confirm
269  332 data = source.nextLine();
270   
271  332 if (data.indexOf(">") > -1)
272    {
273  1 reply = FileFormat.BLC;
274    }
275    else
276    {
277    // Is this a single line BLC file?
278  331 String data1 = source.nextLine();
279  331 String data2 = source.nextLine();
280  331 int c1;
281  331 if (checkPIR)
282    {
283  1 starterm = (data1 != null && data1.indexOf("*") > -1)
284    || (data2 != null && data2.indexOf("*") > -1);
285    }
286  ? if (data2 != null && (c1 = data.indexOf("*")) > -1)
287    {
288  1 if (c1 == 0 && c1 == data2.indexOf("*"))
289    {
290  0 reply = FileFormat.BLC;
291    }
292    else
293    {
294  1 reply = FileFormat.Fasta; // possibly a bad choice - may be
295    // recognised as
296    // PIR
297    }
298    // otherwise can still possibly be a PIR file
299    }
300    else
301    {
302  330 reply = FileFormat.Fasta;
303    // TODO : AMSA File is indicated if there is annotation in the
304    // FASTA file - but FASTA will automatically generate this at the
305    // mo.
306  330 if (!checkPIR)
307    {
308  329 break;
309    }
310    }
311    }
312    // final check for PIR content. require
313    // >P1;title\n<blah>\nterminated sequence to occur at least once.
314   
315    // TODO the PIR/fasta ambiguity may be the use case that is needed to
316    // have
317    // a 'Parse as type XXX' parameter for the applet/application.
318  3 if (checkPIR)
319    {
320  1 String dta = null;
321  1 if (!starterm)
322    {
323  1 do
324    {
325  1 try
326    {
327  1 dta = source.nextLine();
328    } catch (IOException ex)
329    {
330    }
331  1 if (dta != null && dta.indexOf("*") > -1)
332    {
333  1 starterm = true;
334    }
335  1 } while (dta != null && !starterm);
336    }
337  1 if (starterm)
338    {
339  1 reply = FileFormat.PIR;
340  1 break;
341    }
342    else
343    {
344  0 reply = FileFormat.Fasta; // probably a bad choice!
345    }
346    }
347    // read as a FASTA (probably)
348  2 break;
349    }
350  143 if (data.indexOf("{\"") > -1)
351    {
352  1 reply = FileFormat.Json;
353  1 break;
354    }
355  142 int lessThan = data.indexOf("<");
356  142 if ((lessThan > -1)) // possible Markup Language data i.e HTML,
357    // RNAML, XML
358    {
359  13 String upper = data.toUpperCase(Locale.ROOT);
360  13 if (upper.substring(lessThan).startsWith("<HTML"))
361    {
362  1 reply = FileFormat.Html;
363  1 break;
364    }
365  12 if (upper.substring(lessThan).startsWith("<RNAML"))
366    {
367  2 reply = FileFormat.Rnaml;
368  2 break;
369    }
370  10 if (isXml && data.contains(
371    "<NS2:JALVIEWUSERCOLOURS SCHEMENAME=\"SEQUENCE FEATURES\" XMLNS:NS2=\"WWW.JALVIEW.ORG/COLOURS\">"))
372    {
373  2 reply = FileFormat.FeatureSettings;
374  2 break;
375    }
376    }
377   
378  137 if ((data.length() < 1) || (data.indexOf("#") == 0))
379    {
380  13 lineswereskipped = true;
381  13 continue;
382    }
383   
384  124 if (data.indexOf("PILEUP") > -1)
385    {
386  1 reply = FileFormat.Pileup;
387   
388  1 break;
389    }
390   
391  123 if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
392    .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
393    {
394  1 reply = FileFormat.MSF;
395   
396  1 break;
397    }
398  122 else if (data.indexOf("CLUSTAL") > -1)
399    {
400  1 reply = FileFormat.Clustal;
401   
402  1 break;
403    }
404   
405  121 else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
406    {
407  19 reply = FileFormat.PDB;
408  19 break;
409    }
410  102 else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
411    {
412  1 reply = FileFormat.Phylip;
413  1 break;
414    }
415    else
416    {
417  101 if (!lineswereskipped && looksLikeJnetData(data))
418    {
419  0 reply = FileFormat.Jnet;
420  0 break;
421    }
422    }
423   
424  101 lineswereskipped = true; // this means there was some junk before any
425    // key file signature
426    }
427  422 if (closeSource)
428    {
429  422 source.close();
430    }
431    else
432    {
433  0 source.reset(bytesRead); // so the file can be parsed from the mark
434    }
435    } catch (Exception ex)
436    {
437  0 Console.error("File Identification failed!\n" + ex);
438  0 throw new FileFormatException(source.errormessage);
439    }
440  422 if (trimmedLength == 0)
441    {
442  0 Console.error("File Identification failed! - Empty file was read.");
443  0 throw new FileFormatException("EMPTY DATA FILE");
444    }
445  422 Console.debug("File format identified as " + reply.toString());
446  422 return reply;
447    }
448   
449    /**
450    * Returns true if the data appears to be Jnet concise annotation format
451    *
452    * @param data
453    * @return
454    */
 
455  16 toggle protected boolean looksLikeJnetData(String data)
456    {
457  16 char firstChar = data.charAt(0);
458  16 int colonPos = data.indexOf(":");
459  16 int commaPos = data.indexOf(",");
460  16 boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
461    && commaPos > -1 && colonPos < commaPos;
462    // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
463  16 return isJnet;
464    }
465   
466    /**
467    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
468    * and 5 are integer (start/end)
469    *
470    * @param data
471    * @return
472    */
 
473  495 toggle protected boolean looksLikeFeatureData(String data)
474    {
475  495 if (data == null)
476    {
477  1 return false;
478    }
479  494 String[] columns = data.split("\t");
480  494 if (columns.length < 6)
481    {
482  486 return false;
483    }
484  21 for (int col = 3; col < 5; col++)
485    {
486  15 try
487    {
488  15 Integer.parseInt(columns[col]);
489    } catch (NumberFormatException e)
490    {
491  2 return false;
492    }
493    }
494  6 return true;
495    }
496   
497    /**
498    *
499    * @param args
500    * @j2sIgnore
501    */
 
502  0 toggle public static void main(String[] args)
503    {
504  0 for (int i = 0; args != null && i < args.length; i++)
505    {
506  0 IdentifyFile ider = new IdentifyFile();
507  0 FileFormatI type = null;
508  0 try
509    {
510  0 type = ider.identify(args[i], DataSourceType.FILE);
511    } catch (FileNotFoundException e)
512    {
513  0 Console.error(String.format("Error '%s' fetching file %s", args[i],
514    e.getMessage()));
515    } catch (FileFormatException e)
516    {
517  0 Console.error(
518    String.format("Error '%s' identifying file type for %s",
519    args[i], e.getMessage()));
520    }
521  0 Console.debug("Type of " + args[i] + " is " + type);
522    }
523  0 if (args == null || args.length == 0)
524    {
525  0 Console.error("Usage: <Filename> [<Filename> ...]");
526    }
527    }
528   
529    }