1. Project Clover database Wed Nov 13 2024 18:27:33 GMT
  2. Package jalview.io

File ScoreMatrixFile.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

60
129
9
1
453
264
52
0.4
14.33
9
5.78

Classes

Class
Line #
Actions
ScoreMatrixFile 53 129 52
0.95959696%
 

Contributing tests

This file is covered by 21 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import java.io.IOException;
24    import java.util.StringTokenizer;
25   
26    import jalview.analysis.scoremodels.ScoreMatrix;
27    import jalview.analysis.scoremodels.ScoreModels;
28    import jalview.datamodel.SequenceI;
29   
30    /**
31    * A class that can parse a file containing a substitution matrix and register
32    * it for use in Jalview
33    * <p>
34    * Accepts 'NCBI' format (e.g.
35    * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
36    * addition of a header line to provide a matrix name, e.g.
37    *
38    * <pre>
39    * ScoreMatrix BLOSUM62
40    * </pre>
41    *
42    * Also accepts 'AAindex' format (as described at
43    * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
44    * required being
45    *
46    * <pre>
47    * H accession number (used as score matrix identifier in Jalview)
48    * D description (used for tooltip in Jalview)
49    * M rows = symbolList
50    * and the substitution scores
51    * </pre>
52    */
 
53    public class ScoreMatrixFile extends AlignFile
54    implements AlignmentFileReaderI
55    {
56    // first non-comment line identifier - also checked in IdentifyFile
57    public static final String SCOREMATRIX = "SCOREMATRIX";
58   
59    private static final String DELIMITERS = " ,\t";
60   
61    private static final String COMMENT_CHAR = "#";
62   
63    private String matrixName;
64   
65    /*
66    * aaindex format has scores for diagonal and below only
67    */
68    boolean isLowerDiagonalOnly;
69   
70    /*
71    * ncbi format has symbols as first column on score rows
72    */
73    boolean hasGuideColumn;
74   
75    /**
76    * Constructor
77    *
78    * @param source
79    * @throws IOException
80    */
 
81  352 toggle public ScoreMatrixFile(FileParse source) throws IOException
82    {
83  352 super(false, source);
84    }
85   
 
86  0 toggle @Override
87    public String print(SequenceI[] sqs, boolean jvsuffix)
88    {
89  0 return null;
90    }
91   
92    /**
93    * Parses the score matrix file, and if successful registers the matrix so it
94    * will be shown in Jalview menus. This method is not thread-safe (a separate
95    * instance of this class should be used by each thread).
96    */
 
97  1 toggle @Override
98    public void parse() throws IOException
99    {
100  1 ScoreMatrix sm = parseMatrix();
101   
102  1 ScoreModels.getInstance().registerScoreModel(sm);
103    }
104   
105    /**
106    * Parses the score matrix file and constructs a ScoreMatrix object. If an
107    * error is found in parsing, it is thrown as FileFormatException. Any
108    * warnings are written to syserr.
109    *
110    * @return
111    * @throws IOException
112    */
 
113  352 toggle public ScoreMatrix parseMatrix() throws IOException
114    {
115  352 ScoreMatrix sm = null;
116  352 int lineNo = 0;
117  352 String name = null;
118  352 char[] alphabet = null;
119  352 float[][] scores = null;
120  352 int size = 0;
121  352 int row = 0;
122  352 String err = null;
123  352 String data;
124  352 isLowerDiagonalOnly = false;
125   
126  ? while ((data = nextLine()) != null)
127    {
128  8872 lineNo++;
129  8874 data = data.trim();
130  8874 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
131    {
132  2957 continue;
133    }
134    // equivalent to data.startsWithIgnoreCase(SCOREMATRIX)
135  5917 if (data.regionMatches(true, 0, SCOREMATRIX, 0, SCOREMATRIX.length()))
136    {
137    /*
138    * Parse name from ScoreMatrix <name>
139    * we allow any delimiter after ScoreMatrix then take the rest of the line
140    */
141  345 if (name != null)
142    {
143  1 throw new FileFormatException(
144    "Error: 'ScoreMatrix' repeated in file at line "
145    + lineNo);
146    }
147  344 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
148  344 if (nameLine.countTokens() < 2)
149    {
150  1 err = "Format error: expected 'ScoreMatrix <name>', found '"
151    + data + "' at line " + lineNo;
152  1 throw new FileFormatException(err);
153    }
154  343 nameLine.nextToken(); // 'ScoreMatrix'
155  343 name = nameLine.nextToken(); // next field
156  343 name = data.substring(1).substring(data.substring(1).indexOf(name));
157  343 continue;
158    }
159  5572 else if (data.startsWith("H ") && name == null)
160    {
161    /*
162    * AAindex identifier
163    */
164  7 return parseAAIndexFormat(lineNo, data);
165    }
166  5565 else if (name == null)
167    {
168  1 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
169  1 throw new FileFormatException(err);
170    }
171   
172    /*
173    * next non-comment line after ScoreMatrix should be the
174    * column header line with the alphabet of scored symbols
175    */
176  5564 if (alphabet == null)
177    {
178  342 StringTokenizer columnHeadings = new StringTokenizer(data,
179    DELIMITERS);
180  342 size = columnHeadings.countTokens();
181  342 alphabet = new char[size];
182  342 int col = 0;
183  5570 while (columnHeadings.hasMoreTokens())
184    {
185  5228 alphabet[col++] = columnHeadings.nextToken().charAt(0);
186    }
187  342 scores = new float[size][];
188  342 continue;
189    }
190   
191    /*
192    * too much information
193    */
194  5222 if (row >= size)
195    {
196  1 err = "Unexpected extra input line in score model file: '" + data
197    + "'";
198  1 throw new FileFormatException(err);
199    }
200   
201  5221 parseValues(data, lineNo, scores, row, alphabet);
202  5213 row++;
203    }
204   
205    /*
206    * out of data - check we found enough
207    */
208  333 if (row < size)
209    {
210  1 err = String.format(
211    "Expected %d rows of score data in score matrix but only found %d",
212    size, row);
213  1 throw new FileFormatException(err);
214    }
215   
216    /*
217    * If we get here, then name, alphabet and scores have been parsed successfully
218    */
219  332 sm = new ScoreMatrix(name, alphabet, scores);
220  332 matrixName = name;
221   
222  332 return sm;
223    }
224   
225    /**
226    * Parse input as AAIndex format, starting from the header line with the
227    * accession id
228    *
229    * @param lineNo
230    * @param data
231    * @return
232    * @throws IOException
233    */
 
234  7 toggle protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
235    throws IOException
236    {
237  7 String name = data.substring(2).trim();
238  7 String description = null;
239   
240  7 float[][] scores = null;
241  7 char[] alphabet = null;
242  7 int row = 0;
243  7 int size = 0;
244   
245  ? while ((data = nextLine()) != null)
246    {
247  54 lineNo++;
248  54 data = data.trim();
249  54 if (skipAAindexLine(data))
250    {
251  11 continue;
252    }
253  43 if (data.startsWith("D "))
254    {
255  2 description = data.substring(2).trim();
256    }
257  41 else if (data.startsWith("M "))
258    {
259  6 alphabet = parseAAindexRowsColumns(lineNo, data);
260  5 size = alphabet.length;
261  5 scores = new float[size][size];
262    }
263  35 else if (scores == null)
264    {
265  1 throw new FileFormatException(
266    "No alphabet specified in matrix file");
267    }
268  34 else if (row >= size)
269    {
270  2 throw new FileFormatException("Too many data rows in matrix file");
271    }
272    else
273    {
274  32 parseValues(data, lineNo, scores, row, alphabet);
275  31 row++;
276    }
277    }
278   
279  2 ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores);
280  2 matrixName = name;
281   
282  2 return sm;
283    }
284   
285    /**
286    * Parse one row of score values, delimited by whitespace or commas. The line
287    * may optionally include the symbol from which the scores are defined. Values
288    * may be present for all columns, or only up to the diagonal (in which case
289    * upper diagonal values are set symmetrically).
290    *
291    * @param data
292    * the line to be parsed
293    * @param lineNo
294    * @param scores
295    * the score matrix to add data to
296    * @param row
297    * the row number / alphabet index position
298    * @param alphabet
299    * @return
300    * @throws exception
301    * if invalid, or too few, or too many values
302    */
 
303  5253 toggle protected void parseValues(String data, int lineNo, float[][] scores,
304    int row, char[] alphabet) throws FileFormatException
305    {
306  5253 String err;
307  5253 int size = alphabet.length;
308  5253 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
309   
310  5253 int tokenCount = scoreLine.countTokens();
311   
312    /*
313    * inspect first row to see if it includes the symbol in the first column,
314    * and to see if it is lower diagonal values only (i.e. just one score)
315    */
316  5253 if (row == 0)
317    {
318  347 if (data.startsWith(String.valueOf(alphabet[0])))
319    {
320  339 hasGuideColumn = true;
321    }
322  347 if (tokenCount == (hasGuideColumn ? 2 : 1))
323    {
324  6 isLowerDiagonalOnly = true;
325    }
326    }
327   
328  5253 if (hasGuideColumn)
329    {
330    /*
331    * check 'guide' symbol is the row'th letter of the alphabet
332    */
333  5219 String symbol = scoreLine.nextToken();
334  5219 if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
335    {
336  2 err = String.format(
337    "Error parsing score matrix at line %d, expected '%s' but found '%s'",
338    lineNo, alphabet[row], symbol);
339  2 throw new FileFormatException(err);
340    }
341  5217 tokenCount = scoreLine.countTokens(); // excluding guide symbol
342    }
343   
344    /*
345    * check the right number of values (lower diagonal or full format)
346    */
347  5251 if (isLowerDiagonalOnly && tokenCount != row + 1)
348    {
349  1 err = String.format(
350    "Expected %d scores at line %d: '%s' but found %d", row + 1,
351    lineNo, data, tokenCount);
352  1 throw new FileFormatException(err);
353    }
354   
355  5250 if (!isLowerDiagonalOnly && tokenCount != size)
356    {
357  4 err = String.format(
358    "Expected %d scores at line %d: '%s' but found %d", size,
359    lineNo, data, scoreLine.countTokens());
360  4 throw new FileFormatException(err);
361    }
362   
363    /*
364    * parse and set the values, setting the symmetrical value
365    * as well if lower diagonal format data
366    */
367  5246 scores[row] = new float[size];
368  5246 int col = 0;
369  5246 String value = null;
370  111749 while (scoreLine.hasMoreTokens())
371    {
372  106536 try
373    {
374  106540 value = scoreLine.nextToken();
375  106550 scores[row][col] = Float.valueOf(value);
376  106377 if (isLowerDiagonalOnly)
377    {
378  231 scores[col][row] = scores[row][col];
379    }
380  106554 col++;
381    } catch (NumberFormatException e)
382    {
383  2 err = String.format("Invalid score value '%s' at line %d column %d",
384    value, lineNo, col);
385  2 throw new FileFormatException(err);
386    }
387    }
388    }
389   
390    /**
391    * Parse the line in an aaindex file that looks like
392    *
393    * <pre>
394    * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
395    * </pre>
396    *
397    * rejecting it if rows and cols do not match. Returns the string of
398    * characters in the row/cols alphabet.
399    *
400    * @param lineNo
401    * @param data
402    * @return
403    * @throws FileFormatException
404    */
 
405  6 toggle protected char[] parseAAindexRowsColumns(int lineNo, String data)
406    throws FileFormatException
407    {
408  6 String err = "Unexpected aaIndex score matrix data at line " + lineNo
409    + ": " + data;
410   
411  6 try
412    {
413  6 String[] toks = data.split(",");
414  6 String rowsAlphabet = toks[0].split("=")[1].trim();
415  6 String colsAlphabet = toks[1].split("=")[1].trim();
416  6 if (!rowsAlphabet.equals(colsAlphabet))
417    {
418  1 throw new FileFormatException("rows != cols");
419    }
420  5 return rowsAlphabet.toCharArray();
421    } catch (Throwable t)
422    {
423  1 throw new FileFormatException(err + " " + t.getMessage());
424    }
425    }
426   
427    /**
428    * Answers true if line is one we are not interested in from AAindex format
429    * file
430    *
431    * @param data
432    * @return
433    */
 
434  54 toggle protected boolean skipAAindexLine(String data)
435    {
436  54 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
437    {
438  0 return true;
439    }
440  54 if (data.startsWith("*") || data.startsWith("R ")
441    || data.startsWith("A ") || data.startsWith("T ")
442    || data.startsWith("J ") || data.startsWith("//"))
443    {
444  11 return true;
445    }
446  43 return false;
447    }
448   
 
449  2 toggle public String getMatrixName()
450    {
451  2 return matrixName;
452    }
453    }