Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.io

File ScoreMatrixFile.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

60
129
9
1
452
264
52
0.4
14.33
9
5.78

Classes

Class Line # Actions
ScoreMatrixFile 53 129 52 8
0.95959696%
 

Contributing tests

This file is covered by 21 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import jalview.analysis.scoremodels.ScoreMatrix;
24    import jalview.analysis.scoremodels.ScoreModels;
25    import jalview.datamodel.SequenceI;
26   
27    import java.io.IOException;
28    import java.util.StringTokenizer;
29   
30    /**
31    * A class that can parse a file containing a substitution matrix and register
32    * it for use in Jalview
33    * <p>
34    * Accepts 'NCBI' format (e.g.
35    * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
36    * addition of a header line to provide a matrix name, e.g.
37    *
38    * <pre>
39    * ScoreMatrix BLOSUM62
40    * </pre>
41    *
42    * Also accepts 'AAindex' format (as described at
43    * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
44    * required being
45    *
46    * <pre>
47    * H accession number (used as score matrix identifier in Jalview)
48    * D description (used for tooltip in Jalview)
49    * M rows = symbolList
50    * and the substitution scores
51    * </pre>
52    */
 
53    public class ScoreMatrixFile extends AlignFile
54    implements AlignmentFileReaderI
55    {
56    // first non-comment line identifier - also checked in IdentifyFile
57    public static final String SCOREMATRIX = "SCOREMATRIX";
58   
59    private static final String DELIMITERS = " ,\t";
60   
61    private static final String COMMENT_CHAR = "#";
62   
63    private String matrixName;
64   
65    /*
66    * aaindex format has scores for diagonal and below only
67    */
68    boolean isLowerDiagonalOnly;
69   
70    /*
71    * ncbi format has symbols as first column on score rows
72    */
73    boolean hasGuideColumn;
74   
75    /**
76    * Constructor
77    *
78    * @param source
79    * @throws IOException
80    */
 
81  27 toggle public ScoreMatrixFile(FileParse source) throws IOException
82    {
83  27 super(false, source);
84    }
85   
 
86  0 toggle @Override
87    public String print(SequenceI[] sqs, boolean jvsuffix)
88    {
89  0 return null;
90    }
91   
92    /**
93    * Parses the score matrix file, and if successful registers the matrix so it
94    * will be shown in Jalview menus. This method is not thread-safe (a separate
95    * instance of this class should be used by each thread).
96    */
 
97  1 toggle @Override
98    public void parse() throws IOException
99    {
100  1 ScoreMatrix sm = parseMatrix();
101   
102  1 ScoreModels.getInstance().registerScoreModel(sm);
103    }
104   
105    /**
106    * Parses the score matrix file and constructs a ScoreMatrix object. If an
107    * error is found in parsing, it is thrown as FileFormatException. Any
108    * warnings are written to syserr.
109    *
110    * @return
111    * @throws IOException
112    */
 
113  27 toggle public ScoreMatrix parseMatrix() throws IOException
114    {
115  27 ScoreMatrix sm = null;
116  27 int lineNo = 0;
117  27 String name = null;
118  27 char[] alphabet = null;
119  27 float[][] scores = null;
120  27 int size = 0;
121  27 int row = 0;
122  27 String err = null;
123  27 String data;
124  27 isLowerDiagonalOnly = false;
125   
126  ? while ((data = nextLine()) != null)
127    {
128  193 lineNo++;
129  193 data = data.trim();
130  193 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
131    {
132  33 continue;
133    }
134  160 if (data.toUpperCase().startsWith(SCOREMATRIX))
135    {
136    /*
137    * Parse name from ScoreMatrix <name>
138    * we allow any delimiter after ScoreMatrix then take the rest of the line
139    */
140  20 if (name != null)
141    {
142  1 throw new FileFormatException(
143    "Error: 'ScoreMatrix' repeated in file at line "
144    + lineNo);
145    }
146  19 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
147  19 if (nameLine.countTokens() < 2)
148    {
149  1 err = "Format error: expected 'ScoreMatrix <name>', found '"
150    + data + "' at line " + lineNo;
151  1 throw new FileFormatException(err);
152    }
153  18 nameLine.nextToken(); // 'ScoreMatrix'
154  18 name = nameLine.nextToken(); // next field
155  18 name = data.substring(1).substring(data.substring(1).indexOf(name));
156  18 continue;
157    }
158  140 else if (data.startsWith("H ") && name == null)
159    {
160    /*
161    * AAindex identifier
162    */
163  7 return parseAAIndexFormat(lineNo, data);
164    }
165  133 else if (name == null)
166    {
167  1 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
168  1 throw new FileFormatException(err);
169    }
170   
171    /*
172    * next non-comment line after ScoreMatrix should be the
173    * column header line with the alphabet of scored symbols
174    */
175  132 if (alphabet == null)
176    {
177  17 StringTokenizer columnHeadings = new StringTokenizer(data,
178    DELIMITERS);
179  17 size = columnHeadings.countTokens();
180  17 alphabet = new char[size];
181  17 int col = 0;
182  138 while (columnHeadings.hasMoreTokens())
183    {
184  121 alphabet[col++] = columnHeadings.nextToken().charAt(0);
185    }
186  17 scores = new float[size][];
187  17 continue;
188    }
189   
190    /*
191    * too much information
192    */
193  115 if (row >= size)
194    {
195  1 err = "Unexpected extra input line in score model file: '" + data
196    + "'";
197  1 throw new FileFormatException(err);
198    }
199   
200  114 parseValues(data, lineNo, scores, row, alphabet);
201  106 row++;
202    }
203   
204    /*
205    * out of data - check we found enough
206    */
207  8 if (row < size)
208    {
209  1 err = String.format(
210    "Expected %d rows of score data in score matrix but only found %d",
211    size, row);
212  1 throw new FileFormatException(err);
213    }
214   
215    /*
216    * If we get here, then name, alphabet and scores have been parsed successfully
217    */
218  7 sm = new ScoreMatrix(name, alphabet, scores);
219  7 matrixName = name;
220   
221  7 return sm;
222    }
223   
224    /**
225    * Parse input as AAIndex format, starting from the header line with the
226    * accession id
227    *
228    * @param lineNo
229    * @param data
230    * @return
231    * @throws IOException
232    */
 
233  7 toggle protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
234    throws IOException
235    {
236  7 String name = data.substring(2).trim();
237  7 String description = null;
238   
239  7 float[][] scores = null;
240  7 char[] alphabet = null;
241  7 int row = 0;
242  7 int size = 0;
243   
244  ? while ((data = nextLine()) != null)
245    {
246  54 lineNo++;
247  54 data = data.trim();
248  54 if (skipAAindexLine(data))
249    {
250  11 continue;
251    }
252  43 if (data.startsWith("D "))
253    {
254  2 description = data.substring(2).trim();
255    }
256  41 else if (data.startsWith("M "))
257    {
258  6 alphabet = parseAAindexRowsColumns(lineNo, data);
259  5 size = alphabet.length;
260  5 scores = new float[size][size];
261    }
262  35 else if (scores == null)
263    {
264  1 throw new FileFormatException(
265    "No alphabet specified in matrix file");
266    }
267  34 else if (row >= size)
268    {
269  2 throw new FileFormatException("Too many data rows in matrix file");
270    }
271    else
272    {
273  32 parseValues(data, lineNo, scores, row, alphabet);
274  31 row++;
275    }
276    }
277   
278  2 ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores);
279  2 matrixName = name;
280   
281  2 return sm;
282    }
283   
284    /**
285    * Parse one row of score values, delimited by whitespace or commas. The line
286    * may optionally include the symbol from which the scores are defined. Values
287    * may be present for all columns, or only up to the diagonal (in which case
288    * upper diagonal values are set symmetrically).
289    *
290    * @param data
291    * the line to be parsed
292    * @param lineNo
293    * @param scores
294    * the score matrix to add data to
295    * @param row
296    * the row number / alphabet index position
297    * @param alphabet
298    * @return
299    * @throws exception
300    * if invalid, or too few, or too many values
301    */
 
302  146 toggle protected void parseValues(String data, int lineNo, float[][] scores,
303    int row, char[] alphabet) throws FileFormatException
304    {
305  146 String err;
306  146 int size = alphabet.length;
307  146 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
308   
309  146 int tokenCount = scoreLine.countTokens();
310   
311    /*
312    * inspect first row to see if it includes the symbol in the first column,
313    * and to see if it is lower diagonal values only (i.e. just one score)
314    */
315  146 if (row == 0)
316    {
317  22 if (data.startsWith(String.valueOf(alphabet[0])))
318    {
319  14 hasGuideColumn = true;
320    }
321  22 if (tokenCount == (hasGuideColumn ? 2 : 1))
322    {
323  6 isLowerDiagonalOnly = true;
324    }
325    }
326   
327  146 if (hasGuideColumn)
328    {
329    /*
330    * check 'guide' symbol is the row'th letter of the alphabet
331    */
332  112 String symbol = scoreLine.nextToken();
333  112 if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
334    {
335  2 err = String.format(
336    "Error parsing score matrix at line %d, expected '%s' but found '%s'",
337    lineNo, alphabet[row], symbol);
338  2 throw new FileFormatException(err);
339    }
340  110 tokenCount = scoreLine.countTokens(); // excluding guide symbol
341    }
342   
343    /*
344    * check the right number of values (lower diagonal or full format)
345    */
346  144 if (isLowerDiagonalOnly && tokenCount != row + 1)
347    {
348  1 err = String.format(
349    "Expected %d scores at line %d: '%s' but found %d", row + 1,
350    lineNo, data, tokenCount);
351  1 throw new FileFormatException(err);
352    }
353   
354  143 if (!isLowerDiagonalOnly && tokenCount != size)
355    {
356  4 err = String.format(
357    "Expected %d scores at line %d: '%s' but found %d", size,
358    lineNo, data, scoreLine.countTokens());
359  4 throw new FileFormatException(err);
360    }
361   
362    /*
363    * parse and set the values, setting the symmetrical value
364    * as well if lower diagonal format data
365    */
366  139 scores[row] = new float[size];
367  139 int col = 0;
368  139 String value = null;
369  2304 while (scoreLine.hasMoreTokens())
370    {
371  2167 try
372    {
373  2167 value = scoreLine.nextToken();
374  2167 scores[row][col] = Float.valueOf(value);
375  2165 if (isLowerDiagonalOnly)
376    {
377  231 scores[col][row] = scores[row][col];
378    }
379  2165 col++;
380    } catch (NumberFormatException e)
381    {
382  2 err = String.format("Invalid score value '%s' at line %d column %d",
383    value, lineNo, col);
384  2 throw new FileFormatException(err);
385    }
386    }
387    }
388   
389    /**
390    * Parse the line in an aaindex file that looks like
391    *
392    * <pre>
393    * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
394    * </pre>
395    *
396    * rejecting it if rows and cols do not match. Returns the string of
397    * characters in the row/cols alphabet.
398    *
399    * @param lineNo
400    * @param data
401    * @return
402    * @throws FileFormatException
403    */
 
404  6 toggle protected char[] parseAAindexRowsColumns(int lineNo, String data)
405    throws FileFormatException
406    {
407  6 String err = "Unexpected aaIndex score matrix data at line " + lineNo
408    + ": " + data;
409   
410  6 try
411    {
412  6 String[] toks = data.split(",");
413  6 String rowsAlphabet = toks[0].split("=")[1].trim();
414  6 String colsAlphabet = toks[1].split("=")[1].trim();
415  6 if (!rowsAlphabet.equals(colsAlphabet))
416    {
417  1 throw new FileFormatException("rows != cols");
418    }
419  5 return rowsAlphabet.toCharArray();
420    } catch (Throwable t)
421    {
422  1 throw new FileFormatException(err + " " + t.getMessage());
423    }
424    }
425   
426    /**
427    * Answers true if line is one we are not interested in from AAindex format
428    * file
429    *
430    * @param data
431    * @return
432    */
 
433  54 toggle protected boolean skipAAindexLine(String data)
434    {
435  54 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
436    {
437  0 return true;
438    }
439  54 if (data.startsWith("*") || data.startsWith("R ")
440    || data.startsWith("A ") || data.startsWith("T ")
441    || data.startsWith("J ") || data.startsWith("//"))
442    {
443  11 return true;
444    }
445  43 return false;
446    }
447   
 
448  2 toggle public String getMatrixName()
449    {
450  2 return matrixName;
451    }
452    }