Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
ScoreMatrixFile | 53 | 129 | 52 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import java.io.IOException; | |
24 | import java.util.StringTokenizer; | |
25 | ||
26 | import jalview.analysis.scoremodels.ScoreMatrix; | |
27 | import jalview.analysis.scoremodels.ScoreModels; | |
28 | import jalview.datamodel.SequenceI; | |
29 | ||
30 | /** | |
31 | * A class that can parse a file containing a substitution matrix and register | |
32 | * it for use in Jalview | |
33 | * <p> | |
34 | * Accepts 'NCBI' format (e.g. | |
35 | * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the | |
36 | * addition of a header line to provide a matrix name, e.g. | |
37 | * | |
38 | * <pre> | |
39 | * ScoreMatrix BLOSUM62 | |
40 | * </pre> | |
41 | * | |
42 | * Also accepts 'AAindex' format (as described at | |
43 | * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data | |
44 | * required being | |
45 | * | |
46 | * <pre> | |
47 | * H accession number (used as score matrix identifier in Jalview) | |
48 | * D description (used for tooltip in Jalview) | |
49 | * M rows = symbolList | |
50 | * and the substitution scores | |
51 | * </pre> | |
52 | */ | |
53 | public class ScoreMatrixFile extends AlignFile | |
54 | implements AlignmentFileReaderI | |
55 | { | |
56 | // first non-comment line identifier - also checked in IdentifyFile | |
57 | public static final String SCOREMATRIX = "SCOREMATRIX"; | |
58 | ||
59 | private static final String DELIMITERS = " ,\t"; | |
60 | ||
61 | private static final String COMMENT_CHAR = "#"; | |
62 | ||
63 | private String matrixName; | |
64 | ||
65 | /* | |
66 | * aaindex format has scores for diagonal and below only | |
67 | */ | |
68 | boolean isLowerDiagonalOnly; | |
69 | ||
70 | /* | |
71 | * ncbi format has symbols as first column on score rows | |
72 | */ | |
73 | boolean hasGuideColumn; | |
74 | ||
75 | /** | |
76 | * Constructor | |
77 | * | |
78 | * @param source | |
79 | * @throws IOException | |
80 | */ | |
81 | 352 | public ScoreMatrixFile(FileParse source) throws IOException |
82 | { | |
83 | 352 | super(false, source); |
84 | } | |
85 | ||
86 | 0 | @Override |
87 | public String print(SequenceI[] sqs, boolean jvsuffix) | |
88 | { | |
89 | 0 | return null; |
90 | } | |
91 | ||
92 | /** | |
93 | * Parses the score matrix file, and if successful registers the matrix so it | |
94 | * will be shown in Jalview menus. This method is not thread-safe (a separate | |
95 | * instance of this class should be used by each thread). | |
96 | */ | |
97 | 1 | @Override |
98 | public void parse() throws IOException | |
99 | { | |
100 | 1 | ScoreMatrix sm = parseMatrix(); |
101 | ||
102 | 1 | ScoreModels.getInstance().registerScoreModel(sm); |
103 | } | |
104 | ||
105 | /** | |
106 | * Parses the score matrix file and constructs a ScoreMatrix object. If an | |
107 | * error is found in parsing, it is thrown as FileFormatException. Any | |
108 | * warnings are written to syserr. | |
109 | * | |
110 | * @return | |
111 | * @throws IOException | |
112 | */ | |
113 | 352 | public ScoreMatrix parseMatrix() throws IOException |
114 | { | |
115 | 352 | ScoreMatrix sm = null; |
116 | 352 | int lineNo = 0; |
117 | 352 | String name = null; |
118 | 352 | char[] alphabet = null; |
119 | 352 | float[][] scores = null; |
120 | 352 | int size = 0; |
121 | 352 | int row = 0; |
122 | 352 | String err = null; |
123 | 352 | String data; |
124 | 352 | isLowerDiagonalOnly = false; |
125 | ||
126 | ? | while ((data = nextLine()) != null) |
127 | { | |
128 | 8872 | lineNo++; |
129 | 8874 | data = data.trim(); |
130 | 8874 | if (data.startsWith(COMMENT_CHAR) || data.length() == 0) |
131 | { | |
132 | 2957 | continue; |
133 | } | |
134 | // equivalent to data.startsWithIgnoreCase(SCOREMATRIX) | |
135 | 5917 | if (data.regionMatches(true, 0, SCOREMATRIX, 0, SCOREMATRIX.length())) |
136 | { | |
137 | /* | |
138 | * Parse name from ScoreMatrix <name> | |
139 | * we allow any delimiter after ScoreMatrix then take the rest of the line | |
140 | */ | |
141 | 345 | if (name != null) |
142 | { | |
143 | 1 | throw new FileFormatException( |
144 | "Error: 'ScoreMatrix' repeated in file at line " | |
145 | + lineNo); | |
146 | } | |
147 | 344 | StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS); |
148 | 344 | if (nameLine.countTokens() < 2) |
149 | { | |
150 | 1 | err = "Format error: expected 'ScoreMatrix <name>', found '" |
151 | + data + "' at line " + lineNo; | |
152 | 1 | throw new FileFormatException(err); |
153 | } | |
154 | 343 | nameLine.nextToken(); // 'ScoreMatrix' |
155 | 343 | name = nameLine.nextToken(); // next field |
156 | 343 | name = data.substring(1).substring(data.substring(1).indexOf(name)); |
157 | 343 | continue; |
158 | } | |
159 | 5572 | else if (data.startsWith("H ") && name == null) |
160 | { | |
161 | /* | |
162 | * AAindex identifier | |
163 | */ | |
164 | 7 | return parseAAIndexFormat(lineNo, data); |
165 | } | |
166 | 5565 | else if (name == null) |
167 | { | |
168 | 1 | err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line"; |
169 | 1 | throw new FileFormatException(err); |
170 | } | |
171 | ||
172 | /* | |
173 | * next non-comment line after ScoreMatrix should be the | |
174 | * column header line with the alphabet of scored symbols | |
175 | */ | |
176 | 5564 | if (alphabet == null) |
177 | { | |
178 | 342 | StringTokenizer columnHeadings = new StringTokenizer(data, |
179 | DELIMITERS); | |
180 | 342 | size = columnHeadings.countTokens(); |
181 | 342 | alphabet = new char[size]; |
182 | 342 | int col = 0; |
183 | 5570 | while (columnHeadings.hasMoreTokens()) |
184 | { | |
185 | 5228 | alphabet[col++] = columnHeadings.nextToken().charAt(0); |
186 | } | |
187 | 342 | scores = new float[size][]; |
188 | 342 | continue; |
189 | } | |
190 | ||
191 | /* | |
192 | * too much information | |
193 | */ | |
194 | 5222 | if (row >= size) |
195 | { | |
196 | 1 | err = "Unexpected extra input line in score model file: '" + data |
197 | + "'"; | |
198 | 1 | throw new FileFormatException(err); |
199 | } | |
200 | ||
201 | 5221 | parseValues(data, lineNo, scores, row, alphabet); |
202 | 5213 | row++; |
203 | } | |
204 | ||
205 | /* | |
206 | * out of data - check we found enough | |
207 | */ | |
208 | 333 | if (row < size) |
209 | { | |
210 | 1 | err = String.format( |
211 | "Expected %d rows of score data in score matrix but only found %d", | |
212 | size, row); | |
213 | 1 | throw new FileFormatException(err); |
214 | } | |
215 | ||
216 | /* | |
217 | * If we get here, then name, alphabet and scores have been parsed successfully | |
218 | */ | |
219 | 332 | sm = new ScoreMatrix(name, alphabet, scores); |
220 | 332 | matrixName = name; |
221 | ||
222 | 332 | return sm; |
223 | } | |
224 | ||
225 | /** | |
226 | * Parse input as AAIndex format, starting from the header line with the | |
227 | * accession id | |
228 | * | |
229 | * @param lineNo | |
230 | * @param data | |
231 | * @return | |
232 | * @throws IOException | |
233 | */ | |
234 | 7 | protected ScoreMatrix parseAAIndexFormat(int lineNo, String data) |
235 | throws IOException | |
236 | { | |
237 | 7 | String name = data.substring(2).trim(); |
238 | 7 | String description = null; |
239 | ||
240 | 7 | float[][] scores = null; |
241 | 7 | char[] alphabet = null; |
242 | 7 | int row = 0; |
243 | 7 | int size = 0; |
244 | ||
245 | ? | while ((data = nextLine()) != null) |
246 | { | |
247 | 54 | lineNo++; |
248 | 54 | data = data.trim(); |
249 | 54 | if (skipAAindexLine(data)) |
250 | { | |
251 | 11 | continue; |
252 | } | |
253 | 43 | if (data.startsWith("D ")) |
254 | { | |
255 | 2 | description = data.substring(2).trim(); |
256 | } | |
257 | 41 | else if (data.startsWith("M ")) |
258 | { | |
259 | 6 | alphabet = parseAAindexRowsColumns(lineNo, data); |
260 | 5 | size = alphabet.length; |
261 | 5 | scores = new float[size][size]; |
262 | } | |
263 | 35 | else if (scores == null) |
264 | { | |
265 | 1 | throw new FileFormatException( |
266 | "No alphabet specified in matrix file"); | |
267 | } | |
268 | 34 | else if (row >= size) |
269 | { | |
270 | 2 | throw new FileFormatException("Too many data rows in matrix file"); |
271 | } | |
272 | else | |
273 | { | |
274 | 32 | parseValues(data, lineNo, scores, row, alphabet); |
275 | 31 | row++; |
276 | } | |
277 | } | |
278 | ||
279 | 2 | ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores); |
280 | 2 | matrixName = name; |
281 | ||
282 | 2 | return sm; |
283 | } | |
284 | ||
285 | /** | |
286 | * Parse one row of score values, delimited by whitespace or commas. The line | |
287 | * may optionally include the symbol from which the scores are defined. Values | |
288 | * may be present for all columns, or only up to the diagonal (in which case | |
289 | * upper diagonal values are set symmetrically). | |
290 | * | |
291 | * @param data | |
292 | * the line to be parsed | |
293 | * @param lineNo | |
294 | * @param scores | |
295 | * the score matrix to add data to | |
296 | * @param row | |
297 | * the row number / alphabet index position | |
298 | * @param alphabet | |
299 | * @return | |
300 | * @throws exception | |
301 | * if invalid, or too few, or too many values | |
302 | */ | |
303 | 5253 | protected void parseValues(String data, int lineNo, float[][] scores, |
304 | int row, char[] alphabet) throws FileFormatException | |
305 | { | |
306 | 5253 | String err; |
307 | 5253 | int size = alphabet.length; |
308 | 5253 | StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); |
309 | ||
310 | 5253 | int tokenCount = scoreLine.countTokens(); |
311 | ||
312 | /* | |
313 | * inspect first row to see if it includes the symbol in the first column, | |
314 | * and to see if it is lower diagonal values only (i.e. just one score) | |
315 | */ | |
316 | 5253 | if (row == 0) |
317 | { | |
318 | 347 | if (data.startsWith(String.valueOf(alphabet[0]))) |
319 | { | |
320 | 339 | hasGuideColumn = true; |
321 | } | |
322 | 347 | if (tokenCount == (hasGuideColumn ? 2 : 1)) |
323 | { | |
324 | 6 | isLowerDiagonalOnly = true; |
325 | } | |
326 | } | |
327 | ||
328 | 5253 | if (hasGuideColumn) |
329 | { | |
330 | /* | |
331 | * check 'guide' symbol is the row'th letter of the alphabet | |
332 | */ | |
333 | 5219 | String symbol = scoreLine.nextToken(); |
334 | 5219 | if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row]) |
335 | { | |
336 | 2 | err = String.format( |
337 | "Error parsing score matrix at line %d, expected '%s' but found '%s'", | |
338 | lineNo, alphabet[row], symbol); | |
339 | 2 | throw new FileFormatException(err); |
340 | } | |
341 | 5217 | tokenCount = scoreLine.countTokens(); // excluding guide symbol |
342 | } | |
343 | ||
344 | /* | |
345 | * check the right number of values (lower diagonal or full format) | |
346 | */ | |
347 | 5251 | if (isLowerDiagonalOnly && tokenCount != row + 1) |
348 | { | |
349 | 1 | err = String.format( |
350 | "Expected %d scores at line %d: '%s' but found %d", row + 1, | |
351 | lineNo, data, tokenCount); | |
352 | 1 | throw new FileFormatException(err); |
353 | } | |
354 | ||
355 | 5250 | if (!isLowerDiagonalOnly && tokenCount != size) |
356 | { | |
357 | 4 | err = String.format( |
358 | "Expected %d scores at line %d: '%s' but found %d", size, | |
359 | lineNo, data, scoreLine.countTokens()); | |
360 | 4 | throw new FileFormatException(err); |
361 | } | |
362 | ||
363 | /* | |
364 | * parse and set the values, setting the symmetrical value | |
365 | * as well if lower diagonal format data | |
366 | */ | |
367 | 5246 | scores[row] = new float[size]; |
368 | 5246 | int col = 0; |
369 | 5246 | String value = null; |
370 | 111749 | while (scoreLine.hasMoreTokens()) |
371 | { | |
372 | 106536 | try |
373 | { | |
374 | 106540 | value = scoreLine.nextToken(); |
375 | 106550 | scores[row][col] = Float.valueOf(value); |
376 | 106377 | if (isLowerDiagonalOnly) |
377 | { | |
378 | 231 | scores[col][row] = scores[row][col]; |
379 | } | |
380 | 106554 | col++; |
381 | } catch (NumberFormatException e) | |
382 | { | |
383 | 2 | err = String.format("Invalid score value '%s' at line %d column %d", |
384 | value, lineNo, col); | |
385 | 2 | throw new FileFormatException(err); |
386 | } | |
387 | } | |
388 | } | |
389 | ||
390 | /** | |
391 | * Parse the line in an aaindex file that looks like | |
392 | * | |
393 | * <pre> | |
394 | * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV | |
395 | * </pre> | |
396 | * | |
397 | * rejecting it if rows and cols do not match. Returns the string of | |
398 | * characters in the row/cols alphabet. | |
399 | * | |
400 | * @param lineNo | |
401 | * @param data | |
402 | * @return | |
403 | * @throws FileFormatException | |
404 | */ | |
405 | 6 | protected char[] parseAAindexRowsColumns(int lineNo, String data) |
406 | throws FileFormatException | |
407 | { | |
408 | 6 | String err = "Unexpected aaIndex score matrix data at line " + lineNo |
409 | + ": " + data; | |
410 | ||
411 | 6 | try |
412 | { | |
413 | 6 | String[] toks = data.split(","); |
414 | 6 | String rowsAlphabet = toks[0].split("=")[1].trim(); |
415 | 6 | String colsAlphabet = toks[1].split("=")[1].trim(); |
416 | 6 | if (!rowsAlphabet.equals(colsAlphabet)) |
417 | { | |
418 | 1 | throw new FileFormatException("rows != cols"); |
419 | } | |
420 | 5 | return rowsAlphabet.toCharArray(); |
421 | } catch (Throwable t) | |
422 | { | |
423 | 1 | throw new FileFormatException(err + " " + t.getMessage()); |
424 | } | |
425 | } | |
426 | ||
427 | /** | |
428 | * Answers true if line is one we are not interested in from AAindex format | |
429 | * file | |
430 | * | |
431 | * @param data | |
432 | * @return | |
433 | */ | |
434 | 54 | protected boolean skipAAindexLine(String data) |
435 | { | |
436 | 54 | if (data.startsWith(COMMENT_CHAR) || data.length() == 0) |
437 | { | |
438 | 0 | return true; |
439 | } | |
440 | 54 | if (data.startsWith("*") || data.startsWith("R ") |
441 | || data.startsWith("A ") || data.startsWith("T ") | |
442 | || data.startsWith("J ") || data.startsWith("//")) | |
443 | { | |
444 | 11 | return true; |
445 | } | |
446 | 43 | return false; |
447 | } | |
448 | ||
449 | 2 | public String getMatrixName() |
450 | { | |
451 | 2 | return matrixName; |
452 | } | |
453 | } |