Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.io

File IdentifyFile.java

 
testIdentify: Problem opening rf00031_folded.stk : FILE CANNOT BE OPENE...
 

Coverage histogram

../../img/srcFileCovDistChart8.png
19% of files have more coverage

Code metrics

100
175
9
1
479
363
84
0.48
19.44
9
9.33

Classes

Class Line # Actions
IdentifyFile 32 175 84 69
0.7570422 (75.7%)
 

Contributing tests

This file is covered by 95 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.File;
24    import java.io.IOException;
25   
26    /**
27    * Identifies the file format of a data source by inspecting its content.
28    *
29    * @author $author$
30    * @version $Revision$
31    */
 
32    public class IdentifyFile
33    {
34   
 
35  2 toggle public FileFormatI identify(Object file, DataSourceType protocol) throws FileFormatException
36    {
37    // BH 2018
38  2 return (file instanceof File ? identify((File) file, protocol) : identify((String) file, protocol));
39   
40    }
41   
 
42  0 toggle public FileFormatI identify(File file, DataSourceType sourceType)
43    throws FileFormatException
44    {
45    // BH 2018
46  0 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
47  0 FileParse parser = null;
48  0 try
49    {
50  0 parser = new FileParse(file, sourceType);
51  0 if (parser.isValid())
52    {
53  0 return identify(parser);
54    }
55    } catch (Exception e)
56    {
57  0 System.err.println("Error whilst identifying " + file);
58  0 e.printStackTrace(System.err);
59  0 emessage = e.getMessage();
60    }
61  0 if (parser != null)
62    {
63  0 throw new FileFormatException(parser.errormessage);
64    }
65  0 throw new FileFormatException(emessage);
66    }
67   
68    /**
69    * Identify a datasource's file content.
70    *
71    * @note Do not use this method for stream sources - create a FileParse object
72    * instead.
73    *
74    * @param file
75    * @param sourceType
76    * @return
77    * @throws FileFormatException
78    */
 
79  132 toggle public FileFormatI identify(String file, DataSourceType sourceType)
80    throws FileFormatException
81    {
82  132 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
83  132 FileParse parser = null;
84  132 try
85    {
86  132 parser = new FileParse(file, sourceType);
87  130 if (parser.isValid())
88    {
89  130 return identify(parser);
90    }
91    } catch (Exception e)
92    {
93  0 System.err.println("Error whilst identifying " + file);
94  0 e.printStackTrace(System.err);
95  0 emessage = e.getMessage();
96    }
97  0 if (parser != null)
98    {
99  0 throw new FileFormatException(parser.errormessage);
100    }
101  0 Test failure here throw new FileFormatException(emessage);
102    }
103   
 
104  136 toggle public FileFormatI identify(FileParse source) throws FileFormatException
105    {
106  136 return identify(source, true);
107    // preserves original behaviour prior to version 2.3
108    }
109   
 
110  0 toggle public FileFormatI identify(AlignmentFileReaderI file,
111    boolean closeSource) throws IOException
112    {
113  0 FileParse fp = new FileParse(file.getInFile(),
114    file.getDataSourceType());
115  0 return identify(fp, closeSource);
116    }
117   
118    /**
119    * Identify contents of source, closing it or resetting source to start
120    * afterwards.
121    *
122    * @param source
123    * @param closeSource
124    * @return (best guess at) file format
125    * @throws FileFormatException
126    */
 
127  136 toggle public FileFormatI identify(FileParse source, boolean closeSource)
128    throws FileFormatException
129    {
130  136 FileFormatI reply = FileFormat.Pfam;
131  136 String data;
132  136 int bytesRead = 0;
133  136 int trimmedLength = 0;
134  136 boolean lineswereskipped = false;
135  136 boolean isBinary = false; // true if length is non-zero and non-printable
136    // characters are encountered
137   
138  136 try
139    {
140  136 if (!closeSource)
141    {
142  0 source.mark();
143    }
144  136 boolean aaIndexHeaderRead = false;
145   
146  ? while ((data = source.nextLine()) != null)
147    {
148  240 bytesRead += data.length();
149  240 trimmedLength += data.trim().length();
150  240 if (!lineswereskipped)
151    {
152  7230 for (int i = 0; !isBinary && i < data.length(); i++)
153    {
154  7094 char c = data.charAt(i);
155  7094 isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
156    && c != 5 && c != 27); // nominal binary character filter
157    // excluding CR, LF, tab,DEL and ^E
158    // for certain blast ids
159    }
160    }
161  240 if (isBinary)
162    {
163    // jar files are special - since they contain all sorts of random
164    // characters.
165  17 if (source.inFile != null)
166    {
167  17 String fileStr = source.inFile.getName();
168    // possibly a Jalview archive.
169  17 if (fileStr.lastIndexOf(".jar") > -1
170    || fileStr.lastIndexOf(".zip") > -1)
171    {
172  6 reply = FileFormat.Jalview;
173    }
174    }
175  17 if (!lineswereskipped && data.startsWith("PK"))
176    {
177  17 reply = FileFormat.Jalview; // archive.
178  17 break;
179    }
180    }
181  223 data = data.toUpperCase();
182   
183  223 if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
184    {
185  1 reply = FileFormat.ScoreMatrix;
186  1 break;
187    }
188  222 if (data.startsWith("H ") && !aaIndexHeaderRead)
189    {
190  1 aaIndexHeaderRead = true;
191    }
192  222 if (data.startsWith("D ") && aaIndexHeaderRead)
193    {
194  1 reply = FileFormat.ScoreMatrix;
195  1 break;
196    }
197  221 if (data.startsWith("##GFF-VERSION"))
198    {
199    // GFF - possibly embedded in a Jalview features file!
200  4 reply = FileFormat.Features;
201  4 break;
202    }
203  217 if (looksLikeFeatureData(data))
204    {
205  4 reply = FileFormat.Features;
206  4 break;
207    }
208  213 if (data.indexOf("# STOCKHOLM") > -1)
209    {
210  7 reply = FileFormat.Stockholm;
211  7 break;
212    }
213  206 if (data.indexOf("_ENTRY.ID") > -1
214    || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
215    || data.indexOf("_ATOM_SITE.") > -1)
216    {
217  1 reply = FileFormat.MMCif;
218  1 break;
219    }
220    // if (data.indexOf(">") > -1)
221  205 if (data.startsWith(">"))
222    {
223    // FASTA, PIR file or BLC file
224  76 boolean checkPIR = false, starterm = false;
225  76 if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
226    {
227    // watch for PIR file attributes
228  1 checkPIR = true;
229  1 reply = FileFormat.PIR;
230    }
231    // could also be BLC file, read next line to confirm
232  76 data = source.nextLine();
233   
234  76 if (data.indexOf(">") > -1)
235    {
236  1 reply = FileFormat.BLC;
237    }
238    else
239    {
240    // Is this a single line BLC file?
241  75 String data1 = source.nextLine();
242  75 String data2 = source.nextLine();
243  75 int c1;
244  75 if (checkPIR)
245    {
246  1 starterm = (data1 != null && data1.indexOf("*") > -1)
247    || (data2 != null && data2.indexOf("*") > -1);
248    }
249  ? if (data2 != null && (c1 = data.indexOf("*")) > -1)
250    {
251  1 if (c1 == 0 && c1 == data2.indexOf("*"))
252    {
253  0 reply = FileFormat.BLC;
254    }
255    else
256    {
257  1 reply = FileFormat.Fasta; // possibly a bad choice - may be
258    // recognised as
259    // PIR
260    }
261    // otherwise can still possibly be a PIR file
262    }
263    else
264    {
265  74 reply = FileFormat.Fasta;
266    // TODO : AMSA File is indicated if there is annotation in the
267    // FASTA file - but FASTA will automatically generate this at the
268    // mo.
269  74 if (!checkPIR)
270    {
271  73 break;
272    }
273    }
274    }
275    // final check for PIR content. require
276    // >P1;title\n<blah>\nterminated sequence to occur at least once.
277   
278    // TODO the PIR/fasta ambiguity may be the use case that is needed to
279    // have
280    // a 'Parse as type XXX' parameter for the applet/application.
281  3 if (checkPIR)
282    {
283  1 String dta = null;
284  1 if (!starterm)
285    {
286  1 do
287    {
288  1 try
289    {
290  1 dta = source.nextLine();
291    } catch (IOException ex)
292    {
293    }
294  1 if (dta != null && dta.indexOf("*") > -1)
295    {
296  1 starterm = true;
297    }
298  1 } while (dta != null && !starterm);
299    }
300  1 if (starterm)
301    {
302  1 reply = FileFormat.PIR;
303  1 break;
304    }
305    else
306    {
307  0 reply = FileFormat.Fasta; // probably a bad choice!
308    }
309    }
310    // read as a FASTA (probably)
311  2 break;
312    }
313  129 if (data.indexOf("{\"") > -1)
314    {
315  1 reply = FileFormat.Json;
316  1 break;
317    }
318  128 int lessThan = data.indexOf("<");
319  128 if ((lessThan > -1)) // possible Markup Language data i.e HTML,
320    // RNAML, XML
321    {
322  10 String upper = data.toUpperCase();
323  10 if (upper.substring(lessThan).startsWith("<HTML"))
324    {
325  1 reply = FileFormat.Html;
326  1 break;
327    }
328  9 if (upper.substring(lessThan).startsWith("<RNAML"))
329    {
330  2 reply = FileFormat.Rnaml;
331  2 break;
332    }
333    }
334   
335  125 if ((data.length() < 1) || (data.indexOf("#") == 0))
336    {
337  13 lineswereskipped = true;
338  13 continue;
339    }
340   
341  112 if (data.indexOf("PILEUP") > -1)
342    {
343  1 reply = FileFormat.Pileup;
344   
345  1 break;
346    }
347   
348  111 if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
349    .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
350    {
351  1 reply = FileFormat.MSF;
352   
353  1 break;
354    }
355  110 else if (data.indexOf("CLUSTAL") > -1)
356    {
357  1 reply = FileFormat.Clustal;
358   
359  1 break;
360    }
361   
362  109 else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
363    {
364  10 reply = FileFormat.PDB;
365  10 break;
366    }
367  99 else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
368    {
369  1 reply = FileFormat.Phylip;
370  1 break;
371    }
372    else
373    {
374  98 if (!lineswereskipped && looksLikeJnetData(data))
375    {
376  0 reply = FileFormat.Jnet;
377  0 break;
378    }
379    }
380   
381  98 lineswereskipped = true; // this means there was some junk before any
382    // key file signature
383    }
384  136 if (closeSource)
385    {
386  136 source.close();
387    }
388    else
389    {
390  0 source.reset(bytesRead); // so the file can be parsed from the mark
391    }
392    } catch (Exception ex)
393    {
394  0 System.err.println("File Identification failed!\n" + ex);
395  0 throw new FileFormatException(source.errormessage);
396    }
397  136 if (trimmedLength == 0)
398    {
399  0 System.err.println(
400    "File Identification failed! - Empty file was read.");
401  0 throw new FileFormatException("EMPTY DATA FILE");
402    }
403  136 System.out.println("File format identified as " + reply.toString());
404  136 return reply;
405    }
406   
407    /**
408    * Returns true if the data appears to be Jnet concise annotation format
409    *
410    * @param data
411    * @return
412    */
 
413  13 toggle protected boolean looksLikeJnetData(String data)
414    {
415  13 char firstChar = data.charAt(0);
416  13 int colonPos = data.indexOf(":");
417  13 int commaPos = data.indexOf(",");
418  13 boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
419    && commaPos > -1 && colonPos < commaPos;
420    // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
421  13 return isJnet;
422    }
423   
424    /**
425    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
426    * and 5 are integer (start/end)
427    *
428    * @param data
429    * @return
430    */
 
431  224 toggle protected boolean looksLikeFeatureData(String data)
432    {
433  224 if (data == null)
434    {
435  1 return false;
436    }
437  223 String[] columns = data.split("\t");
438  223 if (columns.length < 6)
439    {
440  215 return false;
441    }
442  21 for (int col = 3; col < 5; col++)
443    {
444  15 try
445    {
446  15 Integer.parseInt(columns[col]);
447    } catch (NumberFormatException e)
448    {
449  2 return false;
450    }
451    }
452  6 return true;
453    }
454   
 
455  0 toggle public static void main(String[] args)
456    {
457  0 for (int i = 0; args != null && i < args.length; i++)
458    {
459  0 IdentifyFile ider = new IdentifyFile();
460  0 FileFormatI type = null;
461  0 try
462    {
463  0 type = ider.identify(args[i], DataSourceType.FILE);
464    } catch (FileFormatException e)
465    {
466  0 System.err.println(
467    String.format("Error '%s' identifying file type for %s",
468    args[i], e.getMessage()));
469    }
470  0 System.out.println("Type of " + args[i] + " is " + type);
471    }
472  0 if (args == null || args.length == 0)
473    {
474  0 System.err.println("Usage: <Filename> [<Filename> ...]");
475    }
476    }
477   
478   
479    }