Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.datamodel

File ResidueCount.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

104
152
24
2
643
374
79
0.52
6.33
12
3.29

Classes

Class Line # Actions
ResidueCount 35 150 78 0
1.0100%
ResidueCount.SymbolCounts 40 2 1 0
1.0100%
 

Contributing tests

This file is covered by 107 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.datamodel;
22   
23    import jalview.util.Comparison;
24    import jalview.util.Format;
25    import jalview.util.QuickSort;
26    import jalview.util.SparseCount;
27   
28    /**
29    * A class to count occurrences of residues in a profile, optimised for speed
30    * and memory footprint.
31    *
32    * @author gmcarstairs
33    *
34    */
 
35    public class ResidueCount
36    {
37    /**
38    * A data bean to hold the results of counting symbols
39    */
 
40    public class SymbolCounts
41    {
42    /**
43    * the symbols seen (as char values), in no particular order
44    */
45    public final char[] symbols;
46   
47    /**
48    * the counts for each symbol, in the same order as the symbols
49    */
50    public final int[] values;
51   
 
52  165771 toggle SymbolCounts(char[] s, int[] v)
53    {
54  165771 symbols = s;
55  165773 values = v;
56    }
57    }
58   
59    private static final int TOUPPERCASE = 'A' - 'a';
60   
61    /*
62    * nucleotide symbols to count (including N unknown)
63    */
64    private static final String NUCS = "ACGNTU";
65   
66    /*
67    * amino acid symbols to count (including X unknown)
68    * NB we also include U so as to support counting of RNA bases
69    * in the "don't know" case of nucleotide / peptide
70    */
71    private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
72   
73    private static final int GAP_COUNT = 0;
74   
75    /*
76    * fast lookup tables holding the index into our count
77    * arrays of each symbol; index 0 is reserved for gap counting
78    */
79    private static int[] NUC_INDEX = new int[26];
80   
81    private static int[] AA_INDEX = new int[26];
 
82  1 toggle static
83    {
84  7 for (int i = 0; i < NUCS.length(); i++)
85    {
86  6 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
87    }
88  23 for (int i = 0; i < AAS.length(); i++)
89    {
90  22 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
91    }
92    }
93   
94    /*
95    * counts array, just big enough for the nucleotide or peptide
96    * character set (plus gap counts in position 0)
97    */
98    private short[] counts;
99   
100    /*
101    * alternative array of int counts for use if any count
102    * exceeds the maximum value of short (32767)
103    */
104    private int[] intCounts;
105   
106    /*
107    * flag set if we switch from short to int counts
108    */
109    private boolean useIntCounts;
110   
111    /*
112    * general-purpose counter, only for use for characters
113    * that are not in the expected alphabet
114    */
115    private SparseCount otherData;
116   
117    /*
118    * keeps track of the maximum count value recorded
119    * (if this class ever allows decrements, would need to
120    * calculate this on request instead)
121    */
122    int maxCount;
123   
124    /*
125    * if we think we are counting nucleotide, can get by with smaller
126    * array to hold counts
127    */
128    private boolean isNucleotide;
129   
130    /**
131    * Default constructor allocates arrays able to count either nucleotide or
132    * peptide bases. Use this constructor if not sure which the data is.
133    */
 
134  8 toggle public ResidueCount()
135    {
136  8 this(false);
137    }
138   
139    /**
140    * Constructor that allocates an array just big enough for the anticipated
141    * characters, plus one position to count gaps
142    */
 
143  185873 toggle public ResidueCount(boolean nucleotide)
144    {
145  185877 isNucleotide = nucleotide;
146  185885 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
147  185881 counts = new short[charsToCount + 1];
148    }
149   
150    /**
151    * Increments the count for the given character. The supplied character may be
152    * upper or lower case but counts are for the upper case only. Gap characters
153    * (space, ., -) are all counted together.
154    *
155    * @param c
156    * @return the new value of the count for the character
157    */
 
158  1739812 toggle public int add(final char c)
159    {
160  1740058 char u = toUpperCase(c);
161  1739885 int newValue = 0;
162  1740019 int offset = getOffset(u);
163   
164    /*
165    * offset 0 is reserved for gap counting, so 0 here means either
166    * an unexpected character, or a gap character passed in error
167    */
168  1743223 if (offset == 0)
169    {
170  127952 if (Comparison.isGap(u))
171    {
172  127873 newValue = addGap();
173    }
174    else
175    {
176  79 newValue = addOtherCharacter(u);
177    }
178    }
179    else
180    {
181  1615440 newValue = increment(offset);
182    }
183  1743809 return newValue;
184    }
185   
186    /**
187    * Increment the count at the specified offset. If this would result in short
188    * overflow, promote to counting int values instead.
189    *
190    * @param offset
191    * @return the new value of the count at this offset
192    */
 
193  1615680 toggle int increment(int offset)
194    {
195  1615891 int newValue = 0;
196  1616658 if (useIntCounts)
197    {
198  1 newValue = intCounts[offset];
199  1 intCounts[offset] = ++newValue;
200    }
201    else
202    {
203  1617866 if (counts[offset] == Short.MAX_VALUE)
204    {
205  7 handleOverflow();
206  7 newValue = intCounts[offset];
207  7 intCounts[offset] = ++newValue;
208    }
209    else
210    {
211  1617591 newValue = counts[offset];
212  1617741 counts[offset] = (short) ++newValue;
213    }
214    }
215  1617868 maxCount = Math.max(maxCount, newValue);
216  1618039 return newValue;
217    }
218   
219    /**
220    * Switch from counting in short to counting in int
221    */
 
222  9 toggle synchronized void handleOverflow()
223    {
224  9 intCounts = new int[counts.length];
225  136 for (int i = 0; i < counts.length; i++)
226    {
227  127 intCounts[i] = counts[i];
228    }
229  9 counts = null;
230  9 useIntCounts = true;
231    }
232   
233    /**
234    * Returns this character's offset in the count array
235    *
236    * @param c
237    * @return
238    */
 
239  1740318 toggle int getOffset(char c)
240    {
241  1740395 int offset = 0;
242  1742444 if ('A' <= c && c <= 'Z')
243    {
244  1615511 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
245    }
246  1742420 return offset;
247    }
248   
249    /**
250    * @param c
251    * @return
252    */
 
253  1739948 toggle protected char toUpperCase(final char c)
254    {
255  1739970 char u = c;
256  1739993 if ('a' <= c && c <= 'z')
257    {
258  297 u = (char) (c + TOUPPERCASE);
259    }
260  1740169 return u;
261    }
262   
263    /**
264    * Increment count for some unanticipated character. The first time this
265    * called, a SparseCount is instantiated to hold these 'extra' counts.
266    *
267    * @param c
268    * @return the new value of the count for the character
269    */
 
270  79 toggle int addOtherCharacter(char c)
271    {
272  79 if (otherData == null)
273    {
274  60 otherData = new SparseCount();
275    }
276  79 int newValue = otherData.add(c, 1);
277  79 maxCount = Math.max(maxCount, newValue);
278  79 return newValue;
279    }
280   
281    /**
282    * Set count for some unanticipated character. The first time this called, a
283    * SparseCount is instantiated to hold these 'extra' counts.
284    *
285    * @param c
286    * @param value
287    */
 
288  5 toggle void setOtherCharacter(char c, int value)
289    {
290  5 if (otherData == null)
291    {
292  2 otherData = new SparseCount();
293    }
294  5 otherData.put(c, value);
295    }
296   
297    /**
298    * Increment count of gap characters
299    *
300    * @return the new count of gaps
301    */
 
302  264612 toggle public int addGap()
303    {
304  264619 int newValue;
305  264627 if (useIntCounts)
306    {
307  1 newValue = ++intCounts[GAP_COUNT];
308    }
309    else
310    {
311  264633 newValue = ++counts[GAP_COUNT];
312    }
313  264642 return newValue;
314    }
315   
316    /**
317    * Answers true if we are counting ints (only after overflow of short counts)
318    *
319    * @return
320    */
 
321  10 toggle boolean isCountingInts()
322    {
323  10 return useIntCounts;
324    }
325   
326    /**
327    * Sets the count for the given character. The supplied character may be upper
328    * or lower case but counts are for the upper case only.
329    *
330    * @param c
331    * @param count
332    */
 
333  52 toggle public void put(char c, int count)
334    {
335  52 char u = toUpperCase(c);
336  52 int offset = getOffset(u);
337   
338    /*
339    * offset 0 is reserved for gap counting, so 0 here means either
340    * an unexpected character, or a gap character passed in error
341    */
342  52 if (offset == 0)
343    {
344  8 if (Comparison.isGap(u))
345    {
346  3 set(0, count);
347    }
348    else
349    {
350  5 setOtherCharacter(u, count);
351  5 maxCount = Math.max(maxCount, count);
352    }
353    }
354    else
355    {
356  44 set(offset, count);
357  44 maxCount = Math.max(maxCount, count);
358    }
359    }
360   
361    /**
362    * Sets the count at the specified offset. If this would result in short
363    * overflow, promote to counting int values instead.
364    *
365    * @param offset
366    * @param value
367    */
 
368  47 toggle void set(int offset, int value)
369    {
370  47 if (useIntCounts)
371    {
372  1 intCounts[offset] = value;
373    }
374    else
375    {
376  46 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
377    {
378  2 handleOverflow();
379  2 intCounts[offset] = value;
380    }
381    else
382    {
383  44 counts[offset] = (short) value;
384    }
385    }
386    }
387   
388    /**
389    * Returns the count for the given character, or zero if no count held
390    *
391    * @param c
392    * @return
393    */
 
394  62 toggle public int getCount(char c)
395    {
396  62 char u = toUpperCase(c);
397  62 int offset = getOffset(u);
398  62 if (offset == 0)
399    {
400  11 if (!Comparison.isGap(u))
401    {
402    // should have called getGapCount()
403  6 return otherData == null ? 0 : otherData.get(u);
404    }
405    }
406  56 return useIntCounts ? intCounts[offset] : counts[offset];
407    }
408   
 
409  185863 toggle public int getGapCount()
410    {
411  185865 return useIntCounts ? intCounts[0] : counts[0];
412    }
413   
414    /**
415    * Answers true if this object wraps a counter for unexpected characters
416    *
417    * @return
418    */
 
419  7 toggle boolean isUsingOtherData()
420    {
421  7 return otherData != null;
422    }
423   
424    /**
425    * Returns the character (or concatenated characters) for the symbol(s) with
426    * the given count in the profile. Can be used to get the modal residue by
427    * supplying the modal count value. Returns an empty string if no symbol has
428    * the given count. The symbols are in alphabetic order of standard peptide or
429    * nucleotide characters, followed by 'other' symbols if any.
430    *
431    * @return
432    */
 
433  117499 toggle public String getResiduesForCount(int count)
434    {
435  117499 if (count == 0)
436    {
437  2020 return "";
438    }
439   
440    /*
441    * find counts for the given value and append the
442    * corresponding symbol
443    */
444  115481 StringBuilder modal = new StringBuilder();
445  115488 if (useIntCounts)
446    {
447  60 for (int i = 1; i < intCounts.length; i++)
448    {
449  56 if (intCounts[i] == count)
450    {
451  4 modal.append(
452  4 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
453    }
454    }
455    }
456    else
457    {
458  2650210 for (int i = 1; i < counts.length; i++)
459    {
460  2534860 if (counts[i] == count)
461    {
462  124829 modal.append(
463  124829 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
464    }
465    }
466    }
467  115497 if (otherData != null)
468    {
469  80 for (int i = 0; i < otherData.size(); i++)
470    {
471  40 if (otherData.valueAt(i) == count)
472    {
473  38 modal.append((char) otherData.keyAt(i));
474    }
475    }
476    }
477  115495 return modal.toString();
478    }
479   
480    /**
481    * Returns the highest count for any symbol(s) in the profile (excluding gap)
482    *
483    * @return
484    */
 
485  117490 toggle public int getModalCount()
486    {
487  117490 return maxCount;
488    }
489   
490    /**
491    * Returns the number of distinct symbols with a non-zero count (excluding the
492    * gap symbol)
493    *
494    * @return
495    */
 
496  165748 toggle public int size()
497    {
498  165748 int size = 0;
499  165748 if (useIntCounts)
500    {
501  53 for (int i = 1; i < intCounts.length; i++)
502    {
503  50 if (intCounts[i] > 0)
504    {
505  10 size++;
506    }
507    }
508    }
509    else
510    {
511  3807280 for (int i = 1; i < counts.length; i++)
512    {
513  3642306 if (counts[i] > 0)
514    {
515  321215 size++;
516    }
517    }
518    }
519   
520    /*
521    * include 'other' characters recorded (even if count is zero
522    * though that would be a strange use case)
523    */
524  165759 if (otherData != null)
525    {
526  23 size += otherData.size();
527    }
528   
529  165760 return size;
530    }
531   
532    /**
533    * Returns a data bean holding those symbols that have a non-zero count
534    * (excluding the gap symbol), with their counts.
535    *
536    * @return
537    */
 
538  165750 toggle public SymbolCounts getSymbolCounts()
539    {
540  165751 int size = size();
541  165763 char[] symbols = new char[size];
542  165780 int[] values = new int[size];
543  165776 int j = 0;
544   
545  165781 if (useIntCounts)
546    {
547  53 for (int i = 1; i < intCounts.length; i++)
548    {
549  50 if (intCounts[i] > 0)
550    {
551  10 char symbol = isNucleotide ? NUCS.charAt(i - 1)
552    : AAS.charAt(i - 1);
553  10 symbols[j] = symbol;
554  10 values[j] = intCounts[i];
555  10 j++;
556    }
557    }
558    }
559    else
560    {
561  3808188 for (int i = 1; i < counts.length; i++)
562    {
563  3643215 if (counts[i] > 0)
564    {
565  321212 char symbol = isNucleotide ? NUCS.charAt(i - 1)
566    : AAS.charAt(i - 1);
567  321217 symbols[j] = symbol;
568  321223 values[j] = counts[i];
569  321234 j++;
570    }
571    }
572    }
573  165762 if (otherData != null)
574    {
575  48 for (int i = 0; i < otherData.size(); i++)
576    {
577  25 symbols[j] = (char) otherData.keyAt(i);
578  25 values[j] = otherData.valueAt(i);
579  25 j++;
580    }
581    }
582   
583  165770 return new SymbolCounts(symbols, values);
584    }
585   
586    /**
587    * Returns a tooltip string showing residues in descending order of their
588    * percentage frequency in the profile
589    *
590    * @param normaliseBy
591    * the divisor for residue counts (may or may not include gapped
592    * sequence count)
593    * @param percentageDecPl
594    * the number of decimal places to show in percentages
595    * @return
596    */
 
597  48516 toggle public String getTooltip(int normaliseBy, int percentageDecPl)
598    {
599  48516 SymbolCounts symbolCounts = getSymbolCounts();
600  48519 char[] ca = symbolCounts.symbols;
601  48518 int[] vl = symbolCounts.values;
602   
603    /*
604    * sort characters into ascending order of their counts
605    */
606  48518 QuickSort.sort(vl, ca);
607   
608    /*
609    * traverse in reverse order (highest count first) to build tooltip
610    */
611  48518 boolean first = true;
612  48518 StringBuilder sb = new StringBuilder(64);
613  136932 for (int c = ca.length - 1; c >= 0; c--)
614    {
615  88413 final char residue = ca[c];
616    // TODO combine residues which share a percentage
617    // (see AAFrequency.completeCdnaConsensus)
618  88411 float tval = (vl[c] * 100f) / normaliseBy;
619  88421 sb.append(first ? "" : "; ").append(residue).append(" ");
620  88413 Format.appendPercentage(sb, tval, percentageDecPl);
621  88412 sb.append("%");
622  88414 first = false;
623    }
624  48518 return sb.toString();
625    }
626   
627    /**
628    * Returns a string representation of the symbol counts, for debug purposes.
629    */
 
630  3 toggle @Override
631    public String toString()
632    {
633  3 StringBuilder sb = new StringBuilder();
634  3 sb.append("[ ");
635  3 SymbolCounts sc = getSymbolCounts();
636  12 for (int i = 0; i < sc.symbols.length; i++)
637    {
638  9 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
639    }
640  3 sb.append("]");
641  3 return sb.toString();
642    }
643    }