Clover icon

Coverage Report

  1. Project Clover database Thu Dec 4 2025 16:11:35 GMT
  2. Package jalview.io

File IdentifyFile.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

122
213
12
1
578
452
109
0.51
17.75
12
9.08

Classes

Class Line # Actions
IdentifyFile 36 213 109
0.778098 (77.8%)
 

Contributing tests

This file is covered by 197 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.File;
24    import java.io.FileNotFoundException;
25    import java.io.IOException;
26    import java.util.Locale;
27   
28    import jalview.bin.Console;
29   
30    /**
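31    * Identifies the format of a data source by inspecting its leading lines of content (and, for some formats, its file name).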
32    *
33    * @author $author$
34    * @version $Revision$
35    */
 
36    public class IdentifyFile
37    {
38   
39    private static final String XMLHEADER = "<?XML VERSION=\"1.0\" ENCODING=\"UTF-8\" STANDALONE=\"YES\"?>";
40   
 
41  2 toggle public FileFormatI identify(Object file, DataSourceType protocol)
42    throws FileFormatException, FileNotFoundException
43    {
44  2 return identify(file, protocol, false);
45    }
46   
 
47  13 toggle public FileFormatI identify(Object file, DataSourceType protocol,
48    boolean checkForAnnotations)
49    throws FileFormatException, FileNotFoundException
50    {
51    // BH 2018
52  13 return (file instanceof File
53    ? identify((File) file, protocol, checkForAnnotations)
54    : identify((String) file, protocol, checkForAnnotations));
55   
56    }
57   
 
58  6 toggle public FileFormatI identify(File file, DataSourceType sourceType)
59    throws FileFormatException
60    {
61  6 return identify(file, sourceType, false);
62    }
63   
 
64  6 toggle public FileFormatI identify(File file, DataSourceType sourceType,
65    boolean checkForAnnotations) throws FileFormatException
66    {
67    // BH 2018
68  6 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
69  6 FileParse parser = null;
70  6 try
71    {
72   
73  6 parser = new FileParse(file, sourceType);
74  6 if (parser.isValid())
75    {
76  6 FileFormatI format = identify(parser, true, checkForAnnotations);
77  6 if (file.getName()!=null && file.getName().toLowerCase(Locale.ROOT).endsWith(".a3m"))
78    {
79  0 Console.debug("Identified an A3m file! ("+file.getName()+")");
80  0 format = FileFormat.A3M;
81    }
82  6 return format;
83    }
84    } catch (Exception e)
85    {
86  0 Console.error("Error whilst identifying " + file, e);
87  0 emessage = e.getMessage();
88    }
89  0 if (parser != null)
90    {
91  0 throw new FileFormatException(parser.errormessage);
92    }
93  0 throw new FileFormatException(emessage);
94    }
95   
 
96  425 toggle public FileFormatI identify(String file, DataSourceType sourceType)
97    throws FileFormatException, FileNotFoundException
98    {
99  425 return identify(file, sourceType, false);
100    }
101   
102    /**
103    * Identify a datasource's file content.
104    *
105    * @note Do not use this method for stream sources - create a FileParse object
106    * instead.
107    *
108    * @param file
109    * @param sourceType
110    * @return
111    * @throws FileFormatException
112    */
 
113  438 toggle public FileFormatI identify(String file, DataSourceType sourceType,
114    boolean checkForAnnotations)
115    throws FileFormatException, FileNotFoundException
116    {
117  438 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
118  438 FileParse parser = null;
119  438 FileNotFoundException fnf = null;
120  438 try
121    {
122  438 parser = new FileParse(file, sourceType);
123  438 if (parser.isValid())
124    {
125  438 FileFormatI format = identify(parser);
126  438 if (DataSourceType.FILE.equals(sourceType) && file!=null && file.toLowerCase(Locale.ROOT).endsWith(".a3m"))
127    {
128  1 Console.debug("Identified an A3m file! ("+file+")");
129  1 format = FileFormat.A3M;
130    }
131  438 return format;
132   
133    }
134    } catch (FileNotFoundException e)
135    {
136  0 fnf = e;
137  0 emessage = "Could not find '" + file + "'";
138  0 Console.error("Could not find '" + file + "'", e);
139    } catch (IOException e)
140    {
141  0 Console.error("Error whilst trying to read " + file, e);
142    } catch (Exception e)
143    {
144  0 Console.error("Error whilst identifying " + file, e);
145  0 emessage = e.getMessage();
146    }
147  0 if (parser != null)
148    {
149  0 throw new FileFormatException(parser.errormessage);
150    }
151  0 if (fnf != null)
152    {
153  0 throw (fnf);
154    }
155  0 throw new FileFormatException(emessage);
156    }
157   
 
158  444 toggle public FileFormatI identify(FileParse source) throws FileFormatException
159    {
160  444 return identify(source, true, false);
161    // preserves original behaviour prior to version 2.3
162    }
163   
 
164  0 toggle public FileFormatI identify(AlignmentFileReaderI file,
165    boolean closeSource) throws IOException
166    {
167  0 FileParse fp = new FileParse(file.getInFile(),
168    file.getDataSourceType());
169  0 return identify(fp, closeSource, false);
170    }
171   
172    /**
173    * Identify contents of source, closing it or resetting source to start
174    * afterwards.
175    *
176    * @param source
177    * @param closeSource
178    * @return (best guess at) file format
179    * @throws FileFormatException
180    */
 
181  450 toggle public FileFormatI identify(FileParse source, boolean closeSource,
182    boolean checkForAnnotations) throws FileFormatException
183    {
184  450 FileFormatI reply = FileFormat.Pfam;
185  450 String data;
186  450 int bytesRead = 0;
187  450 int trimmedLength = 0;
188  450 boolean isXml = false; // set true if first line is XMLHEADER
189  450 boolean lineswereskipped = false;
190  450 boolean isBinary = false; // true if length is non-zero and non-printable
191    // characters are encountered
192   
193  450 try
194    {
195  450 if (!closeSource)
196    {
197  0 source.mark();
198    }
199  450 boolean aaIndexHeaderRead = false;
200   
201  ? while ((data = source.nextLine()) != null)
202    {
203  716 bytesRead += data.length();
204  716 trimmedLength += data.trim().length();
205  716 if (!lineswereskipped)
206    {
207  63519 for (int i = 0; !isBinary && i < data.length(); i++)
208    {
209  63069 char c = data.charAt(i);
210  63069 isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
211    && c != 5 && c != 27); // nominal binary character filter
212    // excluding CR, LF, tab,DEL and ^E
213    // for certain blast ids
214    }
215    }
216  716 if (isBinary)
217    {
218    // jar files are special - since they contain all sorts of random
219    // characters.
220  42 if (source.inFile != null)
221    {
222  42 String fileStr = source.inFile.getName();
223  42 if (fileStr.contains(".jar") || fileStr.contains(".zip")
224    || fileStr.contains(".jvp"))
225    {
226    // possibly a Jalview archive (but check further)
227  37 reply = FileFormat.Jalview;
228    }
229    }
230  42 if (!lineswereskipped && data.startsWith("PK"))
231    {
232  42 reply = FileFormat.Jalview; // archive
233  42 break;
234    }
235    }
236  674 data = data.toUpperCase(Locale.ROOT);
237   
238  674 if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
239    {
240  1 reply = FileFormat.ScoreMatrix;
241  1 break;
242    }
243  673 if (data.startsWith(XMLHEADER) && !lineswereskipped)
244    {
245  2 isXml = true;
246    }
247  673 if (data.startsWith("HMMER3"))
248    {
249  1 reply = FileFormat.HMMER3;
250  1 break;
251    }
252  672 if (data.startsWith("LOCUS"))
253    {
254  1 reply = FileFormat.GenBank;
255  1 break;
256    }
257  671 if (data.startsWith("ID "))
258    {
259  1 if (data.substring(2).trim().split(";").length == 7)
260    {
261  1 reply = FileFormat.Embl;
262  1 break;
263    }
264    }
265  670 if (data.startsWith("H ") && !aaIndexHeaderRead)
266    {
267  1 aaIndexHeaderRead = true;
268    }
269  670 if (data.startsWith("D ") && aaIndexHeaderRead)
270    {
271  1 reply = FileFormat.ScoreMatrix;
272  1 break;
273    }
274  669 if (data.startsWith("##GFF-VERSION"))
275    {
276    // GFF - possibly embedded in a Jalview features file!
277  6 reply = FileFormat.Features;
278  6 break;
279    }
280  663 if (looksLikeFeatureData(data))
281    {
282  5 reply = FileFormat.Features;
283  5 break;
284    }
285  658 if (data.indexOf("# STOCKHOLM") > -1)
286    {
287  8 reply = FileFormat.Stockholm;
288  8 break;
289    }
290  650 if (data.indexOf("_ENTRY.ID") > -1
291    || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
292    || data.indexOf("_ATOM_SITE.") > -1)
293    {
294  1 reply = FileFormat.MMCif;
295  1 break;
296    }
297    // if (data.indexOf(">") > -1)
298  649 if (data.startsWith(">"))
299    {
300    // FASTA, PIR file or BLC file
301  339 boolean checkPIR = false, starterm = false;
302  339 if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
303    {
304    // watch for PIR file attributes
305  2 checkPIR = true;
306  2 reply = FileFormat.PIR;
307    }
308    // could also be BLC file, read next line to confirm
309  339 data = source.nextLine();
310   
311  339 if (data.indexOf(">") > -1)
312    {
313  2 reply = FileFormat.BLC;
314    }
315    else
316    {
317    // Is this a single line BLC file?
318  337 String data1 = source.nextLine();
319  337 String data2 = source.nextLine();
320  337 int c1;
321  337 if (checkPIR)
322    {
323  2 starterm = (data1 != null && data1.indexOf("*") > -1)
324    || (data2 != null && data2.indexOf("*") > -1);
325    }
326  ? if (data2 != null && (c1 = data.indexOf("*")) > -1)
327    {
328  1 if (c1 == 0 && c1 == data2.indexOf("*"))
329    {
330  0 reply = FileFormat.BLC;
331    }
332    else
333    {
334  1 reply = FileFormat.Fasta; // possibly a bad choice - may be
335    // recognised as PIR
336    }
337    // otherwise can still possibly be a PIR file
338    }
339    else
340    {
341  336 reply = FileFormat.Fasta;
342    // TODO : AMSA File is indicated if there is annotation in the
343    // FASTA file - but FASTA will automatically generate this at the
344    // mo.
345  336 if (!checkPIR)
346    {
347  334 break;
348    }
349    }
350    }
351    // final check for PIR content. require
352    // >P1;title\n<blah>\nterminated sequence to occur at least once.
353   
354    // TODO the PIR/fasta ambiguity may be the use case that is needed to
355    // have
356    // a 'Parse as type XXX' parameter for the applet/application.
357  5 if (checkPIR)
358    {
359  2 String dta = null;
360  2 if (!starterm)
361    {
362  2 do
363    {
364  2 try
365    {
366  2 dta = source.nextLine();
367    } catch (IOException ex)
368    {
369    }
370  2 if (dta != null && dta.indexOf("*") > -1)
371    {
372  2 starterm = true;
373    }
374  2 } while (dta != null && !starterm);
375    }
376  2 if (starterm)
377    {
378  2 reply = FileFormat.PIR;
379  2 break;
380    }
381    else
382    {
383  0 reply = FileFormat.Fasta; // probably a bad choice!
384    }
385    }
386    // read as a FASTA (probably)
387  3 break;
388    }
389  310 if (data.indexOf("{\"") > -1)
390    {
391  1 reply = FileFormat.Json;
392  1 break;
393    }
394  309 int lessThan = data.indexOf("<");
395  309 if ((lessThan > -1)) // possible Markup Language data i.e HTML, RNAML,
396    // XML
397    {
398  22 String upper = data.toUpperCase(Locale.ROOT);
399  22 if (upper.substring(lessThan).startsWith("<HTML"))
400    {
401  6 reply = FileFormat.Html;
402  6 break;
403    }
404  16 if (upper.substring(lessThan).startsWith("<RNAML"))
405    {
406  2 reply = FileFormat.Rnaml;
407  2 break;
408    }
409  14 if (isXml && data.contains(
410    "<NS2:JALVIEWUSERCOLOURS SCHEMENAME=\"SEQUENCE FEATURES\" XMLNS:NS2=\"WWW.JALVIEW.ORG/COLOURS\">"))
411    {
412  2 reply = FileFormat.FeatureSettings;
413  2 break;
414    }
415  12 if (upper.substring(lessThan).startsWith("<BSML"))
416    {
417  0 reply = FileFormat.BSML;
418  0 break;
419    }
420    }
421   
422  299 if ((data.length() < 1) || (data.indexOf("#") == 0))
423    {
424  52 lineswereskipped = true;
425  52 continue;
426    }
427   
428  247 if (data.indexOf("PILEUP") > -1)
429    {
430  2 reply = FileFormat.Pileup;
431   
432  2 break;
433    }
434   
435  245 if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
436    .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
437    {
438  2 reply = FileFormat.MSF;
439   
440  2 break;
441    }
442  243 else if (data.indexOf("CLUSTAL") > -1)
443    {
444  2 reply = FileFormat.Clustal;
445   
446  2 break;
447    }
448   
449  241 else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
450    {
451  13 reply = FileFormat.PDB;
452  13 break;
453    }
454  228 else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
455    {
456  1 reply = FileFormat.Phylip;
457  1 break;
458    }
459  227 else if (checkForAnnotations && !lineswereskipped
460    && data.equals(AnnotationFile.JALVIEW_ANNOTATION))
461    {
462  0 reply = FileFormat.JalviewAnnotation;
463    }
464    else
465    {
466  227 if (!lineswereskipped && looksLikeJnetData(data))
467    {
468  0 reply = FileFormat.Jnet;
469  0 break;
470    }
471    }
472   
473  227 lineswereskipped = true; // this means there was some junk before any
474    // key file signature
475    }
476  450 if (closeSource)
477    {
478  450 source.close();
479    }
480    else
481    {
482  0 source.reset(bytesRead); // so the file can be parsed from the mark
483    }
484    } catch (Exception ex)
485    {
486  0 Console.error("File Identification failed!\n" + ex);
487  0 throw new FileFormatException(source.errormessage);
488    }
489  450 if (trimmedLength == 0)
490    {
491  0 Console.error("File Identification failed! - Empty file was read.");
492  0 throw new FileFormatException("EMPTY DATA FILE");
493    }
494  450 Console.debug("File format identified as " + reply.toString());
495  450 return reply;
496    }
497   
498    /**
499    * Returns true if the data appears to be Jnet concise annotation format
500    *
501    * @param data
502    * @return
503    */
 
504  26 toggle protected boolean looksLikeJnetData(String data)
505    {
506  26 char firstChar = data.charAt(0);
507  26 int colonPos = data.indexOf(":");
508  26 int commaPos = data.indexOf(",");
509  26 boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
510    && commaPos > -1 && colonPos < commaPos;
511    // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
512  26 return isJnet;
513    }
514   
515    /**
516    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
517    * and 5 are integer (start/end)
518    *
519    * @param data
520    * @return
521    */
 
522  670 toggle protected boolean looksLikeFeatureData(String data)
523    {
524  670 if (data == null)
525    {
526  1 return false;
527    }
528  669 String[] columns = data.split("\t");
529  669 if (columns.length < 6)
530    {
531  660 return false;
532    }
533  24 for (int col = 3; col < 5; col++)
534    {
535  17 try
536    {
537  17 Integer.parseInt(columns[col]);
538    } catch (NumberFormatException e)
539    {
540  2 return false;
541    }
542    }
543  7 return true;
544    }
545   
546    /**
547    *
548    * @param args
549    * @j2sIgnore
550    */
 
551  0 toggle public static void main(String[] args)
552    {
553  0 for (int i = 0; args != null && i < args.length; i++)
554    {
555  0 IdentifyFile ider = new IdentifyFile();
556  0 FileFormatI type = null;
557  0 try
558    {
559  0 type = ider.identify(args[i], DataSourceType.FILE);
560    } catch (FileNotFoundException e)
561    {
562  0 Console.error(String.format("Error '%s' fetching file %s", args[i],
563    e.getMessage()));
564    } catch (FileFormatException e)
565    {
566  0 Console.error(
567    String.format("Error '%s' identifying file type for %s",
568    args[i], e.getMessage()));
569    }
570  0 Console.info("Type of " + args[i] + " is " + type);
571    }
572  0 if (args == null || args.length == 0)
573    {
574  0 Console.error("Usage: <Filename> [<Filename> ...]");
575    }
576    }
577   
578    }