Clover icon

Coverage Report

  1. Project Clover database Thu Nov 7 2024 17:01:39 GMT
  2. Package jalview.analysis.scoremodels

File ScoreMatrix.java

 

Coverage histogram

../../../img/srcFileCovDistChart0.png
0% of files have more coverage

Code metrics

86
132
25
1
628
343
74
0.56
5.28
25
2.96

Classes

Class Line # Actions
ScoreMatrix 39 132 74
0.00%
 

Contributing tests

No tests hitting this source file were found.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis.scoremodels;
22   
23    import jalview.api.AlignmentViewPanel;
24    import jalview.api.analysis.PairwiseScoreModelI;
25    import jalview.api.analysis.ScoreModelI;
26    import jalview.api.analysis.SimilarityParamsI;
27    import jalview.datamodel.AlignmentView;
28    import jalview.math.Matrix;
29    import jalview.math.MatrixI;
30    import jalview.util.Comparison;
31   
32    import java.util.Arrays;
33   
34    /**
35    * A class that models a substitution score matrix for any given alphabet of
36    * symbols. Instances of this class are immutable and thread-safe, so the same
37    * object is returned from calls to getInstance().
38    */
 
39    public class ScoreMatrix extends SimilarityScoreModel
40    implements PairwiseScoreModelI
41    {
42    private static final char GAP_CHARACTER = Comparison.GAP_DASH;
43   
44    /*
45    * an arbitrary score to assign for identity of an unknown symbol
46    * (this is the value on the diagonal in the * column of the NCBI matrix)
47    * (though a case could be made for using the minimum diagonal value)
48    */
49    private static final int UNKNOWN_IDENTITY_SCORE = 1;
50   
51    /*
52    * Jalview 2.10.1 treated gaps as X (peptide) or N (nucleotide)
53    * for pairwise scoring; 2.10.2 uses gap score (last column) in
54    * score matrix (JAL-2397)
55    * Set this flag to true (via Groovy) for 2.10.1 behaviour
56    */
57    private static boolean scoreGapAsAny = false;
58   
59    public static final short UNMAPPED = (short) -1;
60   
61    private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore";
62   
63    private static final int MAX_ASCII = 127;
64   
65    /*
66    * the name of the model as shown in menus
67    * each score model in use should have a unique name
68    */
69    private String name;
70   
71    /*
72    * a description for the model as shown in tooltips
73    */
74    private String description;
75   
76    /*
77    * the characters that the model provides scores for
78    */
79    private char[] symbols;
80   
81    /*
82    * the score matrix; both dimensions must equal the number of symbols
83    * matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j]
84    */
85    private float[][] matrix;
86   
87    /*
88    * quick lookup to convert from an ascii character value to the index
89    * of the corresponding symbol in the score matrix
90    */
91    private short[] symbolIndex;
92   
93    /*
94    * true for Protein Score matrix, false for dna score matrix
95    */
96    private boolean peptide;
97   
98    private float minValue;
99   
100    private float maxValue;
101   
102    private boolean symmetric;
103   
/**
 * Convenience constructor for a score matrix that has no description. This
 * delegates to the four-argument constructor, passing <code>null</code> as
 * the description. The matrix should be square, with as many rows and
 * columns as there are symbols in the alphabet (for example 20x20 for a 20
 * symbol alphabet).
 *
 * @param theName
 *          Unique, human readable name for the matrix
 * @param alphabet
 *          the symbols to which scores apply
 * @param values
 *          Pairwise scores indexed according to the symbol alphabet
 */
public ScoreMatrix(String theName, char[] alphabet, float[][] values)
{
  this(theName, (String) null, alphabet, values);
}
120   
121    /**
122    * Constructor given a name, description, symbol alphabet, and matrix of
123    * scores for pairs of symbols. The matrix should be square and of the same
124    * size as the alphabet, for example 20x20 for a 20 symbol alphabet.
125    *
126    * @param theName
127    * Unique, human readable name for the matrix
128    * @param theDescription
129    * descriptive display name suitable for use in menus
130    * @param alphabet
131    * the symbols to which scores apply
132    * @param values
133    * Pairwise scores indexed according to the symbol alphabet
134    */
 
135  0 toggle public ScoreMatrix(String theName, String theDescription, char[] alphabet,
136    float[][] values)
137    {
138  0 if (alphabet.length != values.length)
139    {
140  0 throw new IllegalArgumentException(
141    "score matrix size must match alphabet size");
142    }
143  0 for (float[] row : values)
144    {
145  0 if (row.length != alphabet.length)
146    {
147  0 throw new IllegalArgumentException(
148    "score matrix size must be square");
149    }
150    }
151   
152  0 this.matrix = values;
153  0 this.name = theName;
154  0 this.description = theDescription;
155  0 this.symbols = alphabet;
156   
157  0 symbolIndex = buildSymbolIndex(alphabet);
158   
159  0 findMinMax();
160   
161  0 symmetric = checkSymmetry();
162   
163    /*
164    * crude heuristic for now...
165    */
166  0 peptide = alphabet.length >= 20;
167    }
168   
169    /**
170    * Answers true if the matrix is symmetric, else false. Usually, substitution
171    * matrices are symmetric, which allows calculations to be short cut.
172    *
173    * @return
174    */
 
175  0 toggle private boolean checkSymmetry()
176    {
177  0 for (int i = 0; i < matrix.length; i++)
178    {
179  0 for (int j = i; j < matrix.length; j++)
180    {
181  0 if (matrix[i][j] != matrix[j][i])
182    {
183  0 return false;
184    }
185    }
186    }
187  0 return true;
188    }
189   
190    /**
191    * Record the minimum and maximum score values
192    */
 
193  0 toggle protected void findMinMax()
194    {
195  0 float min = Float.MAX_VALUE;
196  0 float max = -Float.MAX_VALUE;
197  0 if (matrix != null)
198    {
199  0 for (float[] row : matrix)
200    {
201  0 if (row != null)
202    {
203  0 for (float f : row)
204    {
205  0 min = Math.min(min, f);
206  0 max = Math.max(max, f);
207    }
208    }
209    }
210    }
211  0 minValue = min;
212  0 maxValue = max;
213    }
214   
215    /**
216    * Returns an array A where A[i] is the position in the alphabet array of the
217    * character whose value is i. For example if the alphabet is { 'A', 'D', 'X'
218    * } then A['D'] = A[68] = 1.
219    * <p>
220    * Unmapped characters (not in the alphabet) get an index of -1.
221    * <p>
222    * Mappings are added automatically for lower case symbols (for non case
223    * sensitive scoring), unless they are explicitly present in the alphabet (are
224    * scored separately in the score matrix).
225    * <p>
226    * the gap character (space, dash or dot) included in the alphabet (if any) is
227    * recorded in a field
228    *
229    * @param alphabet
230    * @return
231    */
 
232  0 toggle short[] buildSymbolIndex(char[] alphabet)
233    {
234  0 short[] index = new short[MAX_ASCII + 1];
235  0 Arrays.fill(index, UNMAPPED);
236  0 short pos = 0;
237  0 for (char c : alphabet)
238    {
239  0 if (c <= MAX_ASCII)
240    {
241  0 index[c] = pos;
242    }
243   
244    /*
245    * also map lower-case character (unless separately mapped)
246    */
247  0 if (c >= 'A' && c <= 'Z')
248    {
249  0 short lowerCase = (short) (c + ('a' - 'A'));
250  0 if (index[lowerCase] == UNMAPPED)
251    {
252  0 index[lowerCase] = pos;
253    }
254    }
255  0 pos++;
256    }
257  0 return index;
258    }
259   
 
260  0 toggle @Override
261    public String getName()
262    {
263  0 return name;
264    }
265   
 
266  0 toggle @Override
267    public String getDescription()
268    {
269  0 return description;
270    }
271   
 
272  0 toggle @Override
273    public boolean isDNA()
274    {
275  0 return !peptide;
276    }
277   
 
278  0 toggle @Override
279    public boolean isProtein()
280    {
281  0 return peptide;
282    }
283   
284    /**
285    * Returns a copy of the score matrix as used in getPairwiseScore. If using
286    * this matrix directly, callers <em>must</em> also call
287    * <code>getMatrixIndex</code> in order to get the matrix index for each
288    * character (symbol).
289    *
290    * @return
291    * @see #getMatrixIndex(char)
292    */
 
293  0 toggle public float[][] getMatrix()
294    {
295  0 float[][] v = new float[matrix.length][matrix.length];
296  0 for (int i = 0; i < matrix.length; i++)
297    {
298  0 v[i] = Arrays.copyOf(matrix[i], matrix[i].length);
299    }
300  0 return v;
301    }
302   
303    /**
304    * Answers the matrix index for a given character, or -1 if unmapped in the
305    * matrix. Use this method only if using <code>getMatrix</code> in order to
306    * compute scores directly (without symbol lookup) for efficiency.
307    *
308    * @param c
309    * @return
310    * @see #getMatrix()
311    */
 
312  0 toggle public int getMatrixIndex(char c)
313    {
314  0 if (c < symbolIndex.length)
315    {
316  0 return symbolIndex[c];
317    }
318    else
319    {
320  0 return UNMAPPED;
321    }
322    }
323   
324    /**
325    * Returns the pairwise score for substituting c with d. If either c or d is
326    * an unexpected character, returns 1 for identity (c == d), else the minimum
327    * score value in the matrix.
328    */
 
329  0 toggle @Override
330    public float getPairwiseScore(char c, char d)
331    {
332  0 if (c >= symbolIndex.length)
333    {
334  0 jalview.bin.Console.errPrintln(String.format(BAD_ASCII_ERROR, c));
335  0 return 0;
336    }
337  0 if (d >= symbolIndex.length)
338    {
339  0 jalview.bin.Console.errPrintln(String.format(BAD_ASCII_ERROR, d));
340  0 return 0;
341    }
342   
343  0 int cIndex = symbolIndex[c];
344  0 int dIndex = symbolIndex[d];
345  0 if (cIndex != UNMAPPED && dIndex != UNMAPPED)
346    {
347  0 return matrix[cIndex][dIndex];
348    }
349   
350    /*
351    * one or both symbols not found in the matrix
352    * currently scoring as 1 (for identity) or the minimum
353    * matrix score value (otherwise)
354    * (a case could be made for using minimum row/column value instead)
355    */
356  0 return c == d ? UNKNOWN_IDENTITY_SCORE : getMinimumScore();
357    }
358   
359    /**
360    * pretty print the matrix
361    */
 
362  0 toggle @Override
363    public String toString()
364    {
365  0 return outputMatrix(false);
366    }
367   
368    /**
369    * Print the score matrix, optionally formatted as html, with the alphabet
370    * symbols as column headings and at the start of each row.
371    * <p>
372    * The non-html format should give an output which can be parsed as a score
373    * matrix file
374    *
375    * @param html
376    * @return
377    */
 
378  0 toggle public String outputMatrix(boolean html)
379    {
380  0 StringBuilder sb = new StringBuilder(512);
381   
382    /*
383    * heading row with alphabet
384    */
385  0 if (html)
386    {
387  0 sb.append("<table border=\"1\">");
388  0 sb.append(html ? "<tr><th></th>" : "");
389    }
390    else
391    {
392  0 sb.append("ScoreMatrix ").append(getName()).append("\n");
393    }
394  0 for (char sym : symbols)
395    {
396  0 if (html)
397    {
398  0 sb.append("<th>&nbsp;").append(sym).append("&nbsp;</th>");
399    }
400    else
401    {
402  0 sb.append("\t").append(sym);
403    }
404    }
405  0 sb.append(html ? "</tr>\n" : "\n");
406   
407    /*
408    * table of scores
409    */
410  0 for (char c1 : symbols)
411    {
412  0 if (html)
413    {
414  0 sb.append("<tr><td>");
415    }
416  0 sb.append(c1).append(html ? "</td>" : "");
417  0 for (char c2 : symbols)
418    {
419  0 sb.append(html ? "<td>" : "\t")
420    .append(matrix[symbolIndex[c1]][symbolIndex[c2]])
421  0 .append(html ? "</td>" : "");
422    }
423  0 sb.append(html ? "</tr>\n" : "\n");
424    }
425  0 if (html)
426    {
427  0 sb.append("</table>");
428    }
429  0 return sb.toString();
430    }
431   
432    /**
433    * Answers the number of symbols coded for (also equal to the number of rows
434    * and columns of the score matrix)
435    *
436    * @return
437    */
 
438  0 toggle public int getSize()
439    {
440  0 return symbols.length;
441    }
442   
443    /**
444    * Computes an NxN matrix where N is the number of sequences, and entry [i, j]
445    * is sequence[i] pairwise multiplied with sequence[j], as a sum of scores
446    * computed using the current score matrix. For example
447    * <ul>
448    * <li>Sequences:</li>
449    * <li>FKL</li>
450    * <li>R-D</li>
451    * <li>QIA</li>
452    * <li>GWC</li>
453    * <li>Score matrix is BLOSUM62</li>
454    * <li>Gaps treated same as X (unknown)</li>
455    * <li>product [0, 0] = F.F + K.K + L.L = 6 + 5 + 4 = 15</li>
456    * <li>product [1, 1] = R.R + -.- + D.D = 5 + -1 + 6 = 10</li>
457    * <li>product [2, 2] = Q.Q + I.I + A.A = 5 + 4 + 4 = 13</li>
458    * <li>product [3, 3] = G.G + W.W + C.C = 6 + 11 + 9 = 26</li>
459    * <li>product[0, 1] = F.R + K.- + L.D = -3 + -1 + -3 = -8
460    * <li>and so on</li>
461    * </ul>
462    * This method is thread-safe.
463    */
 
464  0 toggle @Override
465    public MatrixI findSimilarities(AlignmentView seqstrings,
466    SimilarityParamsI options)
467    {
468  0 char gapChar = scoreGapAsAny ? (seqstrings.isNa() ? 'N' : 'X')
469    : GAP_CHARACTER;
470  0 String[] seqs = seqstrings.getSequenceStrings(gapChar);
471  0 return findSimilarities(seqs, options);
472    }
473   
474    /**
475    * Computes pairwise similarities of a set of sequences using the given
476    * parameters
477    *
478    * @param seqs
479    * @param params
480    * @return
481    */
 
482  0 toggle protected MatrixI findSimilarities(String[] seqs,
483    SimilarityParamsI params)
484    {
485  0 double[][] values = new double[seqs.length][seqs.length];
486  0 for (int row = 0; row < seqs.length; row++)
487    {
488  0 for (int col = symmetric ? row : 0; col < seqs.length; col++)
489    {
490  0 double total = computeSimilarity(seqs[row], seqs[col], params);
491  0 values[row][col] = total;
492  0 if (symmetric)
493    {
494  0 values[col][row] = total;
495    }
496    }
497    }
498  0 return new Matrix(values);
499    }
500   
501    /**
502    * Calculates the pairwise similarity of two strings using the given
503    * calculation parameters
504    *
505    * @param seq1
506    * @param seq2
507    * @param params
508    * @return
509    */
 
510  0 toggle protected double computeSimilarity(String seq1, String seq2,
511    SimilarityParamsI params)
512    {
513  0 int len1 = seq1.length();
514  0 int len2 = seq2.length();
515  0 double total = 0;
516   
517  0 int width = Math.max(len1, len2);
518  0 for (int i = 0; i < width; i++)
519    {
520  0 if (i >= len1 || i >= len2)
521    {
522    /*
523    * off the end of one sequence; stop if we are only matching
524    * on the shorter sequence length, else treat as trailing gap
525    */
526  0 if (params.denominateByShortestLength())
527    {
528  0 break;
529    }
530    }
531   
532  0 char c1 = i >= len1 ? GAP_CHARACTER : seq1.charAt(i);
533  0 char c2 = i >= len2 ? GAP_CHARACTER : seq2.charAt(i);
534  0 boolean gap1 = Comparison.isGap(c1);
535  0 boolean gap2 = Comparison.isGap(c2);
536   
537  0 if (gap1 && gap2)
538    {
539    /*
540    * gap-gap: include if options say so, else ignore
541    */
542  0 if (!params.includeGappedColumns())
543    {
544  0 continue;
545    }
546    }
547  0 else if (gap1 || gap2)
548    {
549    /*
550    * gap-residue: score if options say so
551    */
552  0 if (!params.includeGaps())
553    {
554  0 continue;
555    }
556    }
557  0 float score = getPairwiseScore(c1, c2);
558  0 total += score;
559    }
560  0 return total;
561    }
562   
563    /**
564    * Answers a hashcode computed from the symbol alphabet and the matrix score
565    * values
566    */
 
567  0 toggle @Override
568    public int hashCode()
569    {
570  0 int hs = Arrays.hashCode(symbols);
571  0 for (float[] row : matrix)
572    {
573  0 hs = hs * 31 + Arrays.hashCode(row);
574    }
575  0 return hs;
576    }
577   
578    /**
579    * Answers true if the argument is a ScoreMatrix with the same symbol alphabet
580    * and score values, else false
581    */
 
582  0 toggle @Override
583    public boolean equals(Object obj)
584    {
585  0 if (!(obj instanceof ScoreMatrix))
586    {
587  0 return false;
588    }
589  0 ScoreMatrix sm = (ScoreMatrix) obj;
590  0 if (Arrays.equals(symbols, sm.symbols)
591    && Arrays.deepEquals(matrix, sm.matrix))
592    {
593  0 return true;
594    }
595  0 return false;
596    }
597   
598    /**
599    * Returns the alphabet the matrix scores for, as a string of characters
600    *
601    * @return
602    */
 
603  0 toggle String getSymbols()
604    {
605  0 return new String(symbols);
606    }
607   
 
608  0 toggle public float getMinimumScore()
609    {
610  0 return minValue;
611    }
612   
 
613  0 toggle public float getMaximumScore()
614    {
615  0 return maxValue;
616    }
617   
 
618  0 toggle @Override
619    public ScoreModelI getInstance(AlignmentViewPanel avp)
620    {
621  0 return this;
622    }
623   
 
624  0 toggle public boolean isSymmetric()
625    {
626  0 return symmetric;
627    }
628    }