Clover icon

Coverage Report

  1. Project Clover database Mon Dec 1 2025 15:35:32 GMT
  2. Package jalview.io

File IdentifyFile.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
21% of files have more coverage

Code metrics

118
207
12
1
568
442
107
0.52
17.25
12
8.92

Classes

Class Line # Actions
IdentifyFile 36 207 107
0.7804154 (78%)
 

Contributing tests

This file is covered by 207 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.File;
24    import java.io.FileNotFoundException;
25    import java.io.IOException;
26    import java.util.Locale;
27   
28    import jalview.bin.Console;
29   
30    /**
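31    * Guesses the file format of a data source from the first lines of its content (and, for a few formats, from the file name).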
32    *
33    * @author $author$
34    * @version $Revision$
35    */
 
36    public class IdentifyFile
37    {
38   
/**
 * XML declaration in upper case. Input lines are converted to upper case
 * before being compared with this constant, hence the upper-case spelling.
 */
39    private static final String XMLHEADER = "<?XML VERSION=\"1.0\" ENCODING=\"UTF-8\" STANDALONE=\"YES\"?>";
40   
 
41  2 toggle public FileFormatI identify(Object file, DataSourceType protocol)
42    throws FileFormatException, FileNotFoundException
43    {
44  2 return identify(file, protocol, false);
45    }
46   
 
47  13 toggle public FileFormatI identify(Object file, DataSourceType protocol,
48    boolean checkForAnnotations)
49    throws FileFormatException, FileNotFoundException
50    {
51    // BH 2018
52  13 return (file instanceof File
53    ? identify((File) file, protocol, checkForAnnotations)
54    : identify((String) file, protocol, checkForAnnotations));
55   
56    }
57   
 
58  6 toggle public FileFormatI identify(File file, DataSourceType sourceType)
59    throws FileFormatException
60    {
61  6 return identify(file, sourceType, false);
62    }
63   
 
64  6 toggle public FileFormatI identify(File file, DataSourceType sourceType,
65    boolean checkForAnnotations) throws FileFormatException
66    {
67    // BH 2018
68  6 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
69  6 FileParse parser = null;
70  6 try
71    {
72   
73  6 parser = new FileParse(file, sourceType);
74  6 if (parser.isValid())
75    {
76  6 FileFormatI format = identify(parser, true, checkForAnnotations);
77  6 if (file.getName()!=null && file.getName().toLowerCase(Locale.ROOT).endsWith(".a3m"))
78    {
79  0 Console.debug("Identified an A3m file! ("+file.getName()+")");
80  0 format = FileFormat.A3M;
81    }
82  6 return format;
83    }
84    } catch (Exception e)
85    {
86  0 Console.error("Error whilst identifying " + file, e);
87  0 emessage = e.getMessage();
88    }
89  0 if (parser != null)
90    {
91  0 throw new FileFormatException(parser.errormessage);
92    }
93  0 throw new FileFormatException(emessage);
94    }
95   
 
96  431 toggle public FileFormatI identify(String file, DataSourceType sourceType)
97    throws FileFormatException, FileNotFoundException
98    {
99  431 return identify(file, sourceType, false);
100    }
101   
102    /**
103    * Identify a datasource's file content.
104    *
105    * @note Do not use this method for stream sources - create a FileParse object
106    * instead.
107    *
108    * @param file
109    * @param sourceType
110    * @return
111    * @throws FileFormatException
112    */
 
113  444 toggle public FileFormatI identify(String file, DataSourceType sourceType,
114    boolean checkForAnnotations)
115    throws FileFormatException, FileNotFoundException
116    {
117  444 String emessage = "UNIDENTIFIED FILE PARSING ERROR";
118  444 FileParse parser = null;
119  444 FileNotFoundException fnf = null;
120  444 try
121    {
122  444 parser = new FileParse(file, sourceType);
123  444 if (parser.isValid())
124    {
125  444 FileFormatI format = identify(parser);
126  444 if (DataSourceType.FILE.equals(sourceType) && file!=null && file.toLowerCase(Locale.ROOT).endsWith(".a3m"))
127    {
128  1 Console.debug("Identified an A3m file! ("+file+")");
129  1 format = FileFormat.A3M;
130    }
131  444 return format;
132   
133    }
134    } catch (FileNotFoundException e)
135    {
136  0 fnf = e;
137  0 emessage = "Could not find '" + file + "'";
138  0 Console.error("Could not find '" + file + "'", e);
139    } catch (IOException e)
140    {
141  0 Console.error("Error whilst trying to read " + file, e);
142    } catch (Exception e)
143    {
144  0 Console.error("Error whilst identifying " + file, e);
145  0 emessage = e.getMessage();
146    }
147  0 if (parser != null)
148    {
149  0 throw new FileFormatException(parser.errormessage);
150    }
151  0 if (fnf != null)
152    {
153  0 throw (fnf);
154    }
155  0 throw new FileFormatException(emessage);
156    }
157   
 
158  450 toggle public FileFormatI identify(FileParse source) throws FileFormatException
159    {
160  450 return identify(source, true, false);
161    // preserves original behaviour prior to version 2.3
162    }
163   
 
164  0 toggle public FileFormatI identify(AlignmentFileReaderI file,
165    boolean closeSource) throws IOException
166    {
167  0 FileParse fp = new FileParse(file.getInFile(),
168    file.getDataSourceType());
169  0 return identify(fp, closeSource, false);
170    }
171   
172    /**
173    * Identify contents of source, closing it or resetting source to start
174    * afterwards.
175    *
176    * @param source
177    * @param closeSource
178    * @return (best guess at) file format
179    * @throws FileFormatException
180    */
 
181  456 toggle public FileFormatI identify(FileParse source, boolean closeSource,
182    boolean checkForAnnotations) throws FileFormatException
183    {
184  456 FileFormatI reply = FileFormat.Pfam;
185  456 String data;
186  456 int bytesRead = 0;
187  456 int trimmedLength = 0;
188  456 boolean isXml = false; // set true if first line is XMLHEADER
189  456 boolean lineswereskipped = false;
190  456 boolean isBinary = false; // true if length is non-zero and non-printable
191    // characters are encountered
192   
193  456 try
194    {
195  456 if (!closeSource)
196    {
197  0 source.mark();
198    }
199  456 boolean aaIndexHeaderRead = false;
200   
201  ? while ((data = source.nextLine()) != null)
202    {
203  707 bytesRead += data.length();
204  707 trimmedLength += data.trim().length();
205  707 if (!lineswereskipped)
206    {
207  63800 for (int i = 0; !isBinary && i < data.length(); i++)
208    {
209  63344 char c = data.charAt(i);
210  63344 isBinary = (c < 32 && c != '\t' && c != '\n' && c != '\r'
211    && c != 5 && c != 27); // nominal binary character filter
212    // excluding CR, LF, tab,DEL and ^E
213    // for certain blast ids
214    }
215    }
216  707 if (isBinary)
217    {
218    // jar files are special - since they contain all sorts of random
219    // characters.
220  35 if (source.inFile != null)
221    {
222  35 String fileStr = source.inFile.getName();
223  35 if (fileStr.contains(".jar") || fileStr.contains(".zip")
224    || fileStr.contains(".jvp"))
225    {
226    // possibly a Jalview archive (but check further)
227  30 reply = FileFormat.Jalview;
228    }
229    }
230  35 if (!lineswereskipped && data.startsWith("PK"))
231    {
232  35 reply = FileFormat.Jalview; // archive
233  35 break;
234    }
235    }
236  672 data = data.toUpperCase(Locale.ROOT);
237   
238  672 if (data.startsWith(ScoreMatrixFile.SCOREMATRIX))
239    {
240  1 reply = FileFormat.ScoreMatrix;
241  1 break;
242    }
243  671 if (data.startsWith(XMLHEADER) && !lineswereskipped)
244    {
245  2 isXml = true;
246    }
247  671 if (data.startsWith("LOCUS"))
248    {
249  1 reply = FileFormat.GenBank;
250  1 break;
251    }
252  670 if (data.startsWith("ID "))
253    {
254  1 if (data.substring(2).trim().split(";").length == 7)
255    {
256  1 reply = FileFormat.Embl;
257  1 break;
258    }
259    }
260  669 if (data.startsWith("H ") && !aaIndexHeaderRead)
261    {
262  1 aaIndexHeaderRead = true;
263    }
264  669 if (data.startsWith("D ") && aaIndexHeaderRead)
265    {
266  1 reply = FileFormat.ScoreMatrix;
267  1 break;
268    }
269  668 if (data.startsWith("##GFF-VERSION"))
270    {
271    // GFF - possibly embedded in a Jalview features file!
272  6 reply = FileFormat.Features;
273  6 break;
274    }
275  662 if (looksLikeFeatureData(data))
276    {
277  5 reply = FileFormat.Features;
278  5 break;
279    }
280  657 if (data.indexOf("# STOCKHOLM") > -1)
281    {
282  8 reply = FileFormat.Stockholm;
283  8 break;
284    }
285  649 if (data.indexOf("_ENTRY.ID") > -1
286    || data.indexOf("_AUDIT_AUTHOR.NAME") > -1
287    || data.indexOf("_ATOM_SITE.") > -1)
288    {
289  1 reply = FileFormat.MMCif;
290  1 break;
291    }
292    // if (data.indexOf(">") > -1)
293  648 if (data.startsWith(">"))
294    {
295    // FASTA, PIR file or BLC file
296  351 boolean checkPIR = false, starterm = false;
297  351 if ((data.indexOf(">P1;") > -1) || (data.indexOf(">DL;") > -1))
298    {
299    // watch for PIR file attributes
300  1 checkPIR = true;
301  1 reply = FileFormat.PIR;
302    }
303    // could also be BLC file, read next line to confirm
304  351 data = source.nextLine();
305   
306  351 if (data.indexOf(">") > -1)
307    {
308  1 reply = FileFormat.BLC;
309    }
310    else
311    {
312    // Is this a single line BLC file?
313  350 String data1 = source.nextLine();
314  350 String data2 = source.nextLine();
315  350 int c1;
316  350 if (checkPIR)
317    {
318  1 starterm = (data1 != null && data1.indexOf("*") > -1)
319    || (data2 != null && data2.indexOf("*") > -1);
320    }
321  ? if (data2 != null && (c1 = data.indexOf("*")) > -1)
322    {
323  1 if (c1 == 0 && c1 == data2.indexOf("*"))
324    {
325  0 reply = FileFormat.BLC;
326    }
327    else
328    {
329  1 reply = FileFormat.Fasta; // possibly a bad choice - may be
330    // recognised as PIR
331    }
332    // otherwise can still possibly be a PIR file
333    }
334    else
335    {
336  349 reply = FileFormat.Fasta;
337    // TODO : AMSA File is indicated if there is annotation in the
338    // FASTA file - but FASTA will automatically generate this at the
339    // mo.
340  349 if (!checkPIR)
341    {
342  348 break;
343    }
344    }
345    }
346    // final check for PIR content. require
347    // >P1;title\n<blah>\nterminated sequence to occur at least once.
348   
349    // TODO the PIR/fasta ambiguity may be the use case that is needed to
350    // have
351    // a 'Parse as type XXX' parameter for the applet/application.
352  3 if (checkPIR)
353    {
354  1 String dta = null;
355  1 if (!starterm)
356    {
357  1 do
358    {
359  1 try
360    {
361  1 dta = source.nextLine();
362    } catch (IOException ex)
363    {
364    }
365  1 if (dta != null && dta.indexOf("*") > -1)
366    {
367  1 starterm = true;
368    }
369  1 } while (dta != null && !starterm);
370    }
371  1 if (starterm)
372    {
373  1 reply = FileFormat.PIR;
374  1 break;
375    }
376    else
377    {
378  0 reply = FileFormat.Fasta; // probably a bad choice!
379    }
380    }
381    // read as a FASTA (probably)
382  2 break;
383    }
384  297 if (data.indexOf("{\"") > -1)
385    {
386  1 reply = FileFormat.Json;
387  1 break;
388    }
389  296 int lessThan = data.indexOf("<");
390  296 if ((lessThan > -1)) // possible Markup Language data i.e HTML, RNAML,
391    // XML
392    {
393  22 String upper = data.toUpperCase(Locale.ROOT);
394  22 if (upper.substring(lessThan).startsWith("<HTML"))
395    {
396  6 reply = FileFormat.Html;
397  6 break;
398    }
399  16 if (upper.substring(lessThan).startsWith("<RNAML"))
400    {
401  2 reply = FileFormat.Rnaml;
402  2 break;
403    }
404  14 if (isXml && data.contains(
405    "<NS2:JALVIEWUSERCOLOURS SCHEMENAME=\"SEQUENCE FEATURES\" XMLNS:NS2=\"WWW.JALVIEW.ORG/COLOURS\">"))
406    {
407  2 reply = FileFormat.FeatureSettings;
408  2 break;
409    }
410    }
411   
412  286 if ((data.length() < 1) || (data.indexOf("#") == 0))
413    {
414  51 lineswereskipped = true;
415  51 continue;
416    }
417   
418  235 if (data.indexOf("PILEUP") > -1)
419    {
420  1 reply = FileFormat.Pileup;
421   
422  1 break;
423    }
424   
425  234 if ((data.indexOf("//") == 0) || ((data.indexOf("!!") > -1) && (data
426    .indexOf("!!") < data.indexOf("_MULTIPLE_ALIGNMENT "))))
427    {
428  1 reply = FileFormat.MSF;
429   
430  1 break;
431    }
432  233 else if (data.indexOf("CLUSTAL") > -1)
433    {
434  1 reply = FileFormat.Clustal;
435   
436  1 break;
437    }
438   
439  232 else if (data.indexOf("HEADER") == 0 || data.indexOf("ATOM") == 0)
440    {
441  19 reply = FileFormat.PDB;
442  19 break;
443    }
444  213 else if (data.matches("\\s*\\d+\\s+\\d+\\s*"))
445    {
446  1 reply = FileFormat.Phylip;
447  1 break;
448    }
449  212 else if (checkForAnnotations && !lineswereskipped
450    && data.equals(AnnotationFile.JALVIEW_ANNOTATION))
451    {
452  0 reply = FileFormat.JalviewAnnotation;
453    }
454    else
455    {
456  212 if (!lineswereskipped && looksLikeJnetData(data))
457    {
458  0 reply = FileFormat.Jnet;
459  0 break;
460    }
461    }
462   
463  212 lineswereskipped = true; // this means there was some junk before any
464    // key file signature
465    }
466  456 if (closeSource)
467    {
468  456 source.close();
469    }
470    else
471    {
472  0 source.reset(bytesRead); // so the file can be parsed from the mark
473    }
474    } catch (Exception ex)
475    {
476  0 Console.error("File Identification failed!\n" + ex);
477  0 throw new FileFormatException(source.errormessage);
478    }
479  456 if (trimmedLength == 0)
480    {
481  0 Console.error("File Identification failed! - Empty file was read.");
482  0 throw new FileFormatException("EMPTY DATA FILE");
483    }
484  456 Console.debug("File format identified as " + reply.toString());
485  456 return reply;
486    }
487   
488    /**
489    * Returns true if the data appears to be Jnet concise annotation format
490    *
491    * @param data
492    * @return
493    */
 
494  25 toggle protected boolean looksLikeJnetData(String data)
495    {
496  25 char firstChar = data.charAt(0);
497  25 int colonPos = data.indexOf(":");
498  25 int commaPos = data.indexOf(",");
499  25 boolean isJnet = firstChar != '*' && firstChar != ' ' && colonPos > -1
500    && commaPos > -1 && colonPos < commaPos;
501    // && data.indexOf(",")<data.indexOf(",", data.indexOf(","))) / ??
502  25 return isJnet;
503    }
504   
505    /**
506    * Returns true if the data has at least 6 tab-delimited fields _and_ fields 4
507    * and 5 are integer (start/end)
508    *
509    * @param data
510    * @return
511    */
 
512  669 toggle protected boolean looksLikeFeatureData(String data)
513    {
514  669 if (data == null)
515    {
516  1 return false;
517    }
518  668 String[] columns = data.split("\t");
519  668 if (columns.length < 6)
520    {
521  659 return false;
522    }
523  24 for (int col = 3; col < 5; col++)
524    {
525  17 try
526    {
527  17 Integer.parseInt(columns[col]);
528    } catch (NumberFormatException e)
529    {
530  2 return false;
531    }
532    }
533  7 return true;
534    }
535   
536    /**
537    *
538    * @param args
539    * @j2sIgnore
540    */
 
541  0 toggle public static void main(String[] args)
542    {
543  0 for (int i = 0; args != null && i < args.length; i++)
544    {
545  0 IdentifyFile ider = new IdentifyFile();
546  0 FileFormatI type = null;
547  0 try
548    {
549  0 type = ider.identify(args[i], DataSourceType.FILE);
550    } catch (FileNotFoundException e)
551    {
552  0 Console.error(String.format("Error '%s' fetching file %s", args[i],
553    e.getMessage()));
554    } catch (FileFormatException e)
555    {
556  0 Console.error(
557    String.format("Error '%s' identifying file type for %s",
558    args[i], e.getMessage()));
559    }
560  0 Console.info("Type of " + args[i] + " is " + type);
561    }
562  0 if (args == null || args.length == 0)
563    {
564  0 Console.error("Usage: <Filename> [<Filename> ...]");
565    }
566    }
567   
568    }