Clover icon

Coverage Report

  1. Project Clover database Thu Aug 13 2020 12:04:21 BST
  2. Package jalview.datamodel

File ResidueCount.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

104
152
24
2
643
374
79
0.52
6.33
12
3.29

Classes

Class Line # Actions
ResidueCount 35 150 78
1.0100%
ResidueCount.SymbolCounts 40 2 1
1.0100%
 

Contributing tests

This file is covered by 123 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.datamodel;
22   
23    import jalview.util.Comparison;
24    import jalview.util.Format;
25    import jalview.util.QuickSort;
26    import jalview.util.SparseCount;
27   
28    /**
29    * A class to count occurrences of residues in a profile, optimised for speed
30    * and memory footprint.
31    *
32    * @author gmcarstairs
33    *
34    */
 
35    public class ResidueCount
36    {
/**
 * Simple result holder pairing each counted symbol with its count. The
 * arrays passed to the constructor are stored as given (no copy is taken).
 */
public class SymbolCounts
{
  /** the symbols seen (as char values), in no particular order */
  public final char[] symbols;

  /** the count for each symbol, at the same index as in {@link #symbols} */
  public final int[] values;

  /**
   * @param symbolChars
   *          the symbols that were counted
   * @param symbolValues
   *          the count for each symbol, in matching order
   */
  SymbolCounts(char[] symbolChars, int[] symbolValues)
  {
    this.symbols = symbolChars;
    this.values = symbolValues;
  }
}
58   
59    private static final int TOUPPERCASE = 'A' - 'a';
60   
61    /*
62    * nucleotide symbols to count (including N unknown)
63    */
64    private static final String NUCS = "ACGNTU";
65   
66    /*
67    * amino acid symbols to count (including X unknown)
68    * NB we also include U so as to support counting of RNA bases
69    * in the "don't know" case of nucleotide / peptide
70    */
71    private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
72   
73    private static final int GAP_COUNT = 0;
74   
75    /*
76    * fast lookup tables holding the index into our count
77    * arrays of each symbol; index 0 is reserved for gap counting
78    */
79    private static int[] NUC_INDEX = new int[26];
80   
81    private static int[] AA_INDEX = new int[26];
 
/*
 * populate the lookup tables: the n'th symbol (counting from 1) of each
 * alphabet is stored at count array position n
 */
static
{
  int slot = 1;
  for (char base : NUCS.toCharArray())
  {
    NUC_INDEX[base - 'A'] = slot++;
  }

  slot = 1;
  for (char residue : AAS.toCharArray())
  {
    AA_INDEX[residue - 'A'] = slot++;
  }
}
93   
94    /*
95    * counts array, just big enough for the nucleotide or peptide
96    * character set (plus gap counts in position 0)
97    */
98    private short[] counts;
99   
100    /*
101    * alternative array of int counts for use if any count
102    * exceeds the maximum value of short (32767)
103    */
104    private int[] intCounts;
105   
106    /*
107    * flag set if we switch from short to int counts
108    */
109    private boolean useIntCounts;
110   
111    /*
112    * general-purpose counter, only for use for characters
113    * that are not in the expected alphabet
114    */
115    private SparseCount otherData;
116   
117    /*
118    * keeps track of the maximum count value recorded
119    * (if this class ever allows decrements, would need to
120    * calculate this on request instead)
121    */
122    int maxCount;
123   
124    /*
125    * if we think we are counting nucleotide, can get by with smaller
126    * array to hold counts
127    */
128    private boolean isNucleotide;
129   
/**
 * No-argument constructor for use when it is not known whether the data is
 * nucleotide or peptide. Delegates to {@link #ResidueCount(boolean)} with
 * {@code false}, so the larger (peptide) alphabet is used for counting.
 */
public ResidueCount()
{
  this(false);
}
138   
139    /**
140    * Constructor that allocates an array just big enough for the anticipated
141    * characters, plus one position to count gaps
142    */
 
143  591051 toggle public ResidueCount(boolean nucleotide)
144    {
145  591081 isNucleotide = nucleotide;
146  591038 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
147  591041 counts = new short[charsToCount + 1];
148    }
149   
150    /**
151    * Increments the count for the given character. The supplied character may be
152    * upper or lower case but counts are for the upper case only. Gap characters
153    * (space, ., -) are all counted together.
154    *
155    * @param c
156    * @return the new value of the count for the character
157    */
 
158  9954585 toggle public int add(final char c)
159    {
160  9957535 char u = toUpperCase(c);
161  10048355 int newValue = 0;
162  10055597 int offset = getOffset(u);
163   
164    /*
165    * offset 0 is reserved for gap counting, so 0 here means either
166    * an unexpected character, or a gap character passed in error
167    */
168  10165719 if (offset == 0)
169    {
170  7964435 if (Comparison.isGap(u))
171    {
172  8110497 newValue = addGap();
173    }
174    else
175    {
176  86 newValue = addOtherCharacter(u);
177    }
178    }
179    else
180    {
181  2080587 newValue = increment(offset);
182    }
183  10115454 return newValue;
184    }
185   
186    /**
187    * Increment the count at the specified offset. If this would result in short
188    * overflow, promote to counting int values instead.
189    *
190    * @param offset
191    * @return the new value of the count at this offset
192    */
 
193  2080475 toggle int increment(int offset)
194    {
195  2080618 int newValue = 0;
196  2081152 if (useIntCounts)
197    {
198  1 newValue = intCounts[offset];
199  1 intCounts[offset] = ++newValue;
200    }
201    else
202    {
203  2084023 if (counts[offset] == Short.MAX_VALUE)
204    {
205  7 handleOverflow();
206  7 newValue = intCounts[offset];
207  7 intCounts[offset] = ++newValue;
208    }
209    else
210    {
211  2085878 newValue = counts[offset];
212  2085770 counts[offset] = (short) ++newValue;
213    }
214    }
215  2089257 maxCount = Math.max(maxCount, newValue);
216  2090170 return newValue;
217    }
218   
219    /**
220    * Switch from counting in short to counting in int
221    */
 
222  9 toggle synchronized void handleOverflow()
223    {
224  9 intCounts = new int[counts.length];
225  136 for (int i = 0; i < counts.length; i++)
226    {
227  127 intCounts[i] = counts[i];
228    }
229  9 counts = null;
230  9 useIntCounts = true;
231    }
232   
233    /**
234    * Returns this character's offset in the count array
235    *
236    * @param c
237    * @return
238    */
 
239  10058505 toggle int getOffset(char c)
240    {
241  10064775 int offset = 0;
242  10179473 if ('A' <= c && c <= 'Z')
243    {
244  2082477 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
245    }
246  10085467 return offset;
247    }
248   
249    /**
250    * @param c
251    * @return
252    */
 
253  9975140 toggle protected char toUpperCase(final char c)
254    {
255  9993853 char u = c;
256  10019880 if ('a' <= c && c <= 'z')
257    {
258  315 u = (char) (c + TOUPPERCASE);
259    }
260  10025007 return u;
261    }
262   
263    /**
264    * Increment count for some unanticipated character. The first time this
265    * called, a SparseCount is instantiated to hold these 'extra' counts.
266    *
267    * @param c
268    * @return the new value of the count for the character
269    */
 
270  86 toggle int addOtherCharacter(char c)
271    {
272  86 if (otherData == null)
273    {
274  67 otherData = new SparseCount();
275    }
276  86 int newValue = otherData.add(c, 1);
277  86 maxCount = Math.max(maxCount, newValue);
278  86 return newValue;
279    }
280   
281    /**
282    * Set count for some unanticipated character. The first time this called, a
283    * SparseCount is instantiated to hold these 'extra' counts.
284    *
285    * @param c
286    * @param value
287    */
 
288  5 toggle void setOtherCharacter(char c, int value)
289    {
290  5 if (otherData == null)
291    {
292  2 otherData = new SparseCount();
293    }
294  5 otherData.put(c, value);
295    }
296   
297    /**
298    * Increment count of gap characters
299    *
300    * @return the new count of gaps
301    */
 
302  8248046 toggle public int addGap()
303    {
304  8266541 int newValue;
305  8246969 if (useIntCounts)
306    {
307  1 newValue = ++intCounts[GAP_COUNT];
308    }
309    else
310    {
311  8288140 newValue = ++counts[GAP_COUNT];
312    }
313  8287959 return newValue;
314    }
315   
316    /**
317    * Answers true if we are counting ints (only after overflow of short counts)
318    *
319    * @return
320    */
 
321  10 toggle boolean isCountingInts()
322    {
323  10 return useIntCounts;
324    }
325   
326    /**
327    * Sets the count for the given character. The supplied character may be upper
328    * or lower case but counts are for the upper case only.
329    *
330    * @param c
331    * @param count
332    */
 
333  66 toggle public void put(char c, int count)
334    {
335  66 char u = toUpperCase(c);
336  66 int offset = getOffset(u);
337   
338    /*
339    * offset 0 is reserved for gap counting, so 0 here means either
340    * an unexpected character, or a gap character passed in error
341    */
342  66 if (offset == 0)
343    {
344  8 if (Comparison.isGap(u))
345    {
346  3 set(0, count);
347    }
348    else
349    {
350  5 setOtherCharacter(u, count);
351  5 maxCount = Math.max(maxCount, count);
352    }
353    }
354    else
355    {
356  58 set(offset, count);
357  58 maxCount = Math.max(maxCount, count);
358    }
359    }
360   
361    /**
362    * Sets the count at the specified offset. If this would result in short
363    * overflow, promote to counting int values instead.
364    *
365    * @param offset
366    * @param value
367    */
 
368  61 toggle void set(int offset, int value)
369    {
370  61 if (useIntCounts)
371    {
372  1 intCounts[offset] = value;
373    }
374    else
375    {
376  60 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
377    {
378  2 handleOverflow();
379  2 intCounts[offset] = value;
380    }
381    else
382    {
383  58 counts[offset] = (short) value;
384    }
385    }
386    }
387   
388    /**
389    * Returns the count for the given character, or zero if no count held
390    *
391    * @param c
392    * @return
393    */
 
394  62 toggle public int getCount(char c)
395    {
396  62 char u = toUpperCase(c);
397  62 int offset = getOffset(u);
398  62 if (offset == 0)
399    {
400  11 if (!Comparison.isGap(u))
401    {
402    // should have called getGapCount()
403  6 return otherData == null ? 0 : otherData.get(u);
404    }
405    }
406  56 return useIntCounts ? intCounts[offset] : counts[offset];
407    }
408   
 
409  590980 toggle public int getGapCount()
410    {
411  590940 return useIntCounts ? intCounts[0] : counts[0];
412    }
413   
414    /**
415    * Answers true if this object wraps a counter for unexpected characters
416    *
417    * @return
418    */
 
419  7 toggle boolean isUsingOtherData()
420    {
421  7 return otherData != null;
422    }
423   
424    /**
425    * Returns the character (or concatenated characters) for the symbol(s) with
426    * the given count in the profile. Can be used to get the modal residue by
427    * supplying the modal count value. Returns an empty string if no symbol has
428    * the given count. The symbols are in alphabetic order of standard peptide or
429    * nucleotide characters, followed by 'other' symbols if any.
430    *
431    * @return
432    */
 
433  514234 toggle public String getResiduesForCount(int count)
434    {
435  514231 if (count == 0)
436    {
437  1569 return "";
438    }
439   
440    /*
441    * find counts for the given value and append the
442    * corresponding symbol
443    */
444  512669 StringBuilder modal = new StringBuilder();
445  515925 if (useIntCounts)
446    {
447  60 for (int i = 1; i < intCounts.length; i++)
448    {
449  56 if (intCounts[i] == count)
450    {
451  4 modal.append(
452  4 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
453    }
454    }
455    }
456    else
457    {
458  5208242 for (int i = 1; i < counts.length; i++)
459    {
460  4697548 if (counts[i] == count)
461    {
462  523932 modal.append(
463  523968 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
464    }
465    }
466    }
467  516848 if (otherData != null)
468    {
469  94 for (int i = 0; i < otherData.size(); i++)
470    {
471  47 if (otherData.valueAt(i) == count)
472    {
473  45 modal.append((char) otherData.keyAt(i));
474    }
475    }
476    }
477  516797 return modal.toString();
478    }
479   
480    /**
481    * Returns the highest count for any symbol(s) in the profile (excluding gap)
482    *
483    * @return
484    */
 
485  514252 toggle public int getModalCount()
486    {
487  514252 return maxCount;
488    }
489   
490    /**
491    * Returns the number of distinct symbols with a non-zero count (excluding the
492    * gap symbol)
493    *
494    * @return
495    */
 
496  139332 toggle public int size()
497    {
498  139334 int size = 0;
499  139335 if (useIntCounts)
500    {
501  53 for (int i = 1; i < intCounts.length; i++)
502    {
503  50 if (intCounts[i] > 0)
504    {
505  10 size++;
506    }
507    }
508    }
509    else
510    {
511  3198975 for (int i = 1; i < counts.length; i++)
512    {
513  3060605 if (counts[i] > 0)
514    {
515  270844 size++;
516    }
517    }
518    }
519   
520    /*
521    * include 'other' characters recorded (even if count is zero
522    * though that would be a strange use case)
523    */
524  139347 if (otherData != null)
525    {
526  23 size += otherData.size();
527    }
528   
529  139334 return size;
530    }
531   
532    /**
533    * Returns a data bean holding those symbols that have a non-zero count
534    * (excluding the gap symbol), with their counts.
535    *
536    * @return
537    */
 
538  139337 toggle public SymbolCounts getSymbolCounts()
539    {
540  139335 int size = size();
541  139337 char[] symbols = new char[size];
542  139330 int[] values = new int[size];
543  139340 int j = 0;
544   
545  139352 if (useIntCounts)
546    {
547  53 for (int i = 1; i < intCounts.length; i++)
548    {
549  50 if (intCounts[i] > 0)
550    {
551  10 char symbol = isNucleotide ? NUCS.charAt(i - 1)
552    : AAS.charAt(i - 1);
553  10 symbols[j] = symbol;
554  10 values[j] = intCounts[i];
555  10 j++;
556    }
557    }
558    }
559    else
560    {
561  3199026 for (int i = 1; i < counts.length; i++)
562    {
563  3060672 if (counts[i] > 0)
564    {
565  270836 char symbol = isNucleotide ? NUCS.charAt(i - 1)
566    : AAS.charAt(i - 1);
567  270835 symbols[j] = symbol;
568  270833 values[j] = counts[i];
569  270782 j++;
570    }
571    }
572    }
573  139340 if (otherData != null)
574    {
575  48 for (int i = 0; i < otherData.size(); i++)
576    {
577  25 symbols[j] = (char) otherData.keyAt(i);
578  25 values[j] = otherData.valueAt(i);
579  25 j++;
580    }
581    }
582   
583  139344 return new SymbolCounts(symbols, values);
584    }
585   
586    /**
587    * Returns a tooltip string showing residues in descending order of their
588    * percentage frequency in the profile
589    *
590    * @param normaliseBy
591    * the divisor for residue counts (may or may not include gapped
592    * sequence count)
593    * @param percentageDecPl
594    * the number of decimal places to show in percentages
595    * @return
596    */
 
597  33763 toggle public String getTooltip(int normaliseBy, int percentageDecPl)
598    {
599  33763 SymbolCounts symbolCounts = getSymbolCounts();
600  33762 char[] ca = symbolCounts.symbols;
601  33762 int[] vl = symbolCounts.values;
602   
603    /*
604    * sort characters into ascending order of their counts
605    */
606  33762 QuickSort.sort(vl, ca);
607   
608    /*
609    * traverse in reverse order (highest count first) to build tooltip
610    */
611  33759 boolean first = true;
612  33759 StringBuilder sb = new StringBuilder(64);
613  96171 for (int c = ca.length - 1; c >= 0; c--)
614    {
615  62409 final char residue = ca[c];
616    // TODO combine residues which share a percentage
617    // (see AAFrequency.completeCdnaConsensus)
618  62408 float tval = (vl[c] * 100f) / normaliseBy;
619  62416 sb.append(first ? "" : "; ").append(residue).append(" ");
620  62409 Format.appendPercentage(sb, tval, percentageDecPl);
621  62418 sb.append("%");
622  62418 first = false;
623    }
624  33762 return sb.toString();
625    }
626   
627    /**
628    * Returns a string representation of the symbol counts, for debug purposes.
629    */
 
630  3 toggle @Override
631    public String toString()
632    {
633  3 StringBuilder sb = new StringBuilder();
634  3 sb.append("[ ");
635  3 SymbolCounts sc = getSymbolCounts();
636  12 for (int i = 0; i < sc.symbols.length; i++)
637    {
638  9 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
639    }
640  3 sb.append("]");
641  3 return sb.toString();
642    }
643    }