1. Project Clover database Fri Dec 6 2024 13:47:14 GMT
  2. Package jalview.datamodel

File ResidueCount.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

104
150
24
2
640
369
79
0.53
6.25
12
3.29

Classes

Class
Line #
Actions
ResidueCount 35 148 78
1.0100%
ResidueCount.SymbolCounts 40 2 1
1.0100%
 

Contributing tests

This file is covered by 277 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.datamodel;
22   
23    import jalview.util.Comparison;
24    import jalview.util.Format;
25    import jalview.util.QuickSort;
26    import jalview.util.SparseCount;
27   
28    /**
29    * A class to count occurrences of residues in a profile, optimised for speed
30    * and memory footprint.
31    *
32    * @author gmcarstairs
33    *
34    */
 
35    public class ResidueCount
36    {
37    /**
38    * A data bean to hold the results of counting symbols
39    */
 
40    public class SymbolCounts
41    {
42    /**
43    * the symbols seen (as char values), in no particular order
44    */
45    public final char[] symbols;
46   
47    /**
48    * the counts for each symbol, in the same order as the symbols
49    */
50    public final int[] values;
51   
 
52  311770 toggle SymbolCounts(char[] s, int[] v)
53    {
54  311773 symbols = s;
55  311782 values = v;
56    }
57    }
58   
59    private static final int TOUPPERCASE = 'A' - 'a';
60   
61    /*
62    * nucleotide symbols to count (including N unknown)
63    */
64    private static final String NUCS = "ACGNTU";
65   
66    /*
67    * amino acid symbols to count (including X unknown)
68    * NB we also include U so as to support counting of RNA bases
69    * in the "don't know" case of nucleotide / peptide
70    */
71    private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
72   
73    static final int GAP_COUNT = 0;
74   
75    /*
76    * fast lookup tables holding the index into our count
77    * arrays of each symbol; index 0 is reserved for gap counting
78    */
79    private static int[] NUC_INDEX = new int[26];
80   
81    private static int[] AA_INDEX = new int[26];
 
82  50 toggle static
83    {
84  350 for (int i = 0; i < NUCS.length(); i++)
85    {
86  300 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
87    }
88  1150 for (int i = 0; i < AAS.length(); i++)
89    {
90  1100 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
91    }
92    }
93   
94    /*
95    * counts array, just big enough for the nucleotide or peptide
96    * character set (plus gap counts in position 0)
97    */
98    private short[] counts;
99   
100    /*
101    * alternative array of int counts for use if any count
102    * exceeds the maximum value of short (32767)
103    */
104    private int[] intCounts;
105   
106    /*
107    * flag set if we switch from short to int counts
108    */
109    private boolean useIntCounts;
110   
111    /*
112    * general-purpose counter, only for use for characters
113    * that are not in the expected alphabet
114    */
115    private SparseCount otherData;
116   
117    /*
118    * keeps track of the maximum count value recorded
119    * (if this class ever allows decrements, would need to
120    * calculate this on request instead)
121    */
122    int maxCount;
123   
124    /*
125    * if we think we are counting nucleotide, can get by with smaller
126    * array to hold counts
127    */
128    private boolean isNucleotide;
129   
130    /**
131    * Default constructor allocates arrays able to count either nucleotide or
132    * peptide bases. Use this constructor if not sure which the data is.
133    */
 
134  10 toggle public ResidueCount()
135    {
136  10 this(false);
137    }
138   
139    /**
140    * Constructor that allocates an array just big enough for the anticipated
141    * characters, plus one position to count gaps
142    */
 
143  753135 toggle public ResidueCount(boolean nucleotide)
144    {
145  753185 isNucleotide = nucleotide;
146  753226 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
147  753214 counts = new short[charsToCount + 1];
148    }
149   
150    /**
151    * Increments the count for the given character. The supplied character may be
152    * upper or lower case but counts are for the upper case only. Gap characters
153    * (space, ., -) are all counted together.
154    *
155    * @param c
156    * @return the new value of the count for the character
157    */
 
158  12107078 toggle public int add(final char c)
159    {
160  12115096 char u = toUpperCase(c);
161  12129083 int newValue = 0;
162  12139842 int offset = getOffset(u);
163   
164    /*
165    * offset 0 is reserved for gap counting, so 0 here means either
166    * an unexpected character, or a gap character passed in error
167    */
168  12148206 if (offset == 0)
169    {
170  8845309 if (Comparison.isGap(u))
171    {
172  8849170 newValue = addGap();
173    }
174    else
175    {
176  104 newValue = addOtherCharacter(u);
177    }
178    }
179    else
180    {
181  3295336 newValue = increment(offset);
182    }
183  12141360 return newValue;
184    }
185   
186    /**
187    * Increment the count at the specified offset. If this would result in short
188    * overflow, promote to counting int values instead.
189    *
190    * @param offset
191    * @return the new value of the count at this offset
192    */
 
193  12582494 toggle int increment(int offset)
194    {
195  12591460 int newValue = 0;
196  12595386 if (useIntCounts)
197    {
198  2 newValue = intCounts[offset];
199  2 intCounts[offset] = ++newValue;
200    }
201    else
202    {
203  12599177 if (counts[offset] == Short.MAX_VALUE)
204    {
205  8 handleOverflow();
206  8 newValue = intCounts[offset];
207  8 intCounts[offset] = ++newValue;
208    }
209    else
210    {
211  12608408 newValue = counts[offset];
212  12613766 counts[offset] = (short) ++newValue;
213    }
214    }
215   
216  12620310 if (offset != GAP_COUNT)
217    {
218    // update modal residue count
219  3300016 maxCount = Math.max(maxCount, newValue);
220    }
221  12615182 return newValue;
222    }
223   
224    /**
225    * Switch from counting in short to counting in int
226    */
 
227  10 toggle synchronized void handleOverflow()
228    {
229  10 intCounts = new int[counts.length];
230  144 for (int i = 0; i < counts.length; i++)
231    {
232  134 intCounts[i] = counts[i];
233    }
234  10 counts = null;
235  10 useIntCounts = true;
236    }
237   
238    /**
239    * Returns this character's offset in the count array
240    *
241    * @param c
242    * @return
243    */
 
244  12118765 toggle int getOffset(char c)
245    {
246  12132583 int offset = 0;
247  12146730 if ('A' <= c && c <= 'Z')
248    {
249  3295947 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
250    }
251  12141588 return offset;
252    }
253   
254    /**
255    * @param c
256    * @return
257    */
 
258  12115093 toggle protected char toUpperCase(final char c)
259    {
260  12125005 char u = c;
261  12133286 if ('a' <= c && c <= 'z')
262    {
263  5796 u = (char) (c + TOUPPERCASE);
264    }
265  12140564 return u;
266    }
267   
268    /**
269    * Increment count for some unanticipated character. The first time this
270    * called, a SparseCount is instantiated to hold these 'extra' counts.
271    *
272    * @param c
273    * @return the new value of the count for the character
274    */
 
275  104 toggle int addOtherCharacter(char c)
276    {
277  104 if (otherData == null)
278    {
279  85 otherData = new SparseCount();
280    }
281  104 int newValue = otherData.add(c, 1);
282  104 maxCount = Math.max(maxCount, newValue);
283  104 return newValue;
284    }
285   
286    /**
287    * Set count for some unanticipated character. The first time this called, a
288    * SparseCount is instantiated to hold these 'extra' counts.
289    *
290    * @param c
291    * @param value
292    */
 
293  5 toggle void setOtherCharacter(char c, int value)
294    {
295  5 if (otherData == null)
296    {
297  2 otherData = new SparseCount();
298    }
299  5 otherData.put(c, value);
300    }
301   
302    /**
303    * Increment count of gap characters
304    *
305    * @return the new count of gaps
306    */
 
307  9309939 toggle public int addGap()
308    {
309  9313114 int newValue = increment(GAP_COUNT);
310  9314886 return newValue;
311    }
312   
313    /**
314    * Answers true if we are counting ints (only after overflow of short counts)
315    *
316    * @return
317    */
 
318  13 toggle boolean isCountingInts()
319    {
320  13 return useIntCounts;
321    }
322   
323    /**
324    * Sets the count for the given character. The supplied character may be upper
325    * or lower case but counts are for the upper case only.
326    *
327    * @param c
328    * @param count
329    */
 
330  66 toggle public void put(char c, int count)
331    {
332  66 char u = toUpperCase(c);
333  66 int offset = getOffset(u);
334   
335    /*
336    * offset 0 is reserved for gap counting, so 0 here means either
337    * an unexpected character, or a gap character passed in error
338    */
339  66 if (offset == 0)
340    {
341  8 if (Comparison.isGap(u))
342    {
343  3 set(0, count);
344    }
345    else
346    {
347  5 setOtherCharacter(u, count);
348  5 maxCount = Math.max(maxCount, count);
349    }
350    }
351    else
352    {
353  58 set(offset, count);
354  58 maxCount = Math.max(maxCount, count);
355    }
356    }
357   
358    /**
359    * Sets the count at the specified offset. If this would result in short
360    * overflow, promote to counting int values instead.
361    *
362    * @param offset
363    * @param value
364    */
 
365  62 toggle void set(int offset, int value)
366    {
367  62 if (useIntCounts)
368    {
369  1 intCounts[offset] = value;
370    }
371    else
372    {
373  61 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
374    {
375  2 handleOverflow();
376  2 intCounts[offset] = value;
377    }
378    else
379    {
380  59 counts[offset] = (short) value;
381    }
382    }
383    }
384   
385    /**
386    * Returns the count for the given character, or zero if no count held
387    *
388    * @param c
389    * @return
390    */
 
391  62 toggle public int getCount(char c)
392    {
393  62 char u = toUpperCase(c);
394  62 int offset = getOffset(u);
395  62 if (offset == 0)
396    {
397  11 if (!Comparison.isGap(u))
398    {
399    // should have called getGapCount()
400  6 return otherData == null ? 0 : otherData.get(u);
401    }
402    }
403  56 return useIntCounts ? intCounts[offset] : counts[offset];
404    }
405   
 
406  753020 toggle public int getGapCount()
407    {
408  753005 return useIntCounts ? intCounts[0] : counts[0];
409    }
410   
411    /**
412    * Answers true if this object wraps a counter for unexpected characters
413    *
414    * @return
415    */
 
416  7 toggle boolean isUsingOtherData()
417    {
418  7 return otherData != null;
419    }
420   
421    /**
422    * Returns the character (or concatenated characters) for the symbol(s) with
423    * the given count in the profile. Can be used to get the modal residue by
424    * supplying the modal count value. Returns an empty string if no symbol has
425    * the given count. The symbols are in alphabetic order of standard peptide or
426    * nucleotide characters, followed by 'other' symbols if any.
427    *
428    * @return
429    */
 
430  608312 toggle public String getResiduesForCount(int count)
431    {
432  608361 if (count == 0)
433    {
434  6994 return "";
435    }
436   
437    /*
438    * find counts for the given value and append the
439    * corresponding symbol
440    */
441  601383 StringBuilder modal = new StringBuilder();
442  601416 if (useIntCounts)
443    {
444  60 for (int i = 1; i < intCounts.length; i++)
445    {
446  56 if (intCounts[i] == count)
447    {
448  4 modal.append(
449  4 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
450    }
451    }
452    }
453    else
454    {
455  6345777 for (int i = 1; i < counts.length; i++)
456    {
457  5745673 if (counts[i] == count)
458    {
459  611787 modal.append(
460  611792 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
461    }
462    }
463    }
464  601376 if (otherData != null)
465    {
466  124 for (int i = 0; i < otherData.size(); i++)
467    {
468  62 if (otherData.valueAt(i) == count)
469    {
470  60 modal.append((char) otherData.keyAt(i));
471    }
472    }
473    }
474  601406 return modal.toString();
475    }
476   
477    /**
478    * Returns the highest count for any symbol(s) in the profile (excluding gap)
479    *
480    * @return
481    */
 
482  608296 toggle public int getModalCount()
483    {
484  608299 return maxCount;
485    }
486   
487    /**
488    * Returns the number of distinct symbols with a non-zero count (excluding the
489    * gap symbol)
490    *
491    * @return
492    */
 
493  311750 toggle public int size()
494    {
495  311755 int size = 0;
496  311777 if (useIntCounts)
497    {
498  53 for (int i = 1; i < intCounts.length; i++)
499    {
500  50 if (intCounts[i] > 0)
501    {
502  10 size++;
503    }
504    }
505    }
506    else
507    {
508  7154311 for (int i = 1; i < counts.length; i++)
509    {
510  6846110 if (counts[i] > 0)
511    {
512  639676 size++;
513    }
514    }
515    }
516   
517    /*
518    * include 'other' characters recorded (even if count is zero
519    * though that would be a strange use case)
520    */
521  311755 if (otherData != null)
522    {
523  43 size += otherData.size();
524    }
525   
526  311773 return size;
527    }
528   
529    /**
530    * Returns a data bean holding those symbols that have a non-zero count
531    * (excluding the gap symbol), with their counts.
532    *
533    * @return
534    */
 
535  311752 toggle public SymbolCounts getSymbolCounts()
536    {
537  311759 int size = size();
538  311769 char[] symbols = new char[size];
539  311783 int[] values = new int[size];
540  311787 int j = 0;
541   
542  311795 if (useIntCounts)
543    {
544  53 for (int i = 1; i < intCounts.length; i++)
545    {
546  50 if (intCounts[i] > 0)
547    {
548  10 char symbol = isNucleotide ? NUCS.charAt(i - 1)
549    : AAS.charAt(i - 1);
550  10 symbols[j] = symbol;
551  10 values[j] = intCounts[i];
552  10 j++;
553    }
554    }
555    }
556    else
557    {
558  7153087 for (int i = 1; i < counts.length; i++)
559    {
560  6846665 if (counts[i] > 0)
561    {
562  639629 char symbol = isNucleotide ? NUCS.charAt(i - 1)
563    : AAS.charAt(i - 1);
564  639655 symbols[j] = symbol;
565  639697 values[j] = counts[i];
566  639698 j++;
567    }
568    }
569    }
570  311768 if (otherData != null)
571    {
572  88 for (int i = 0; i < otherData.size(); i++)
573    {
574  45 symbols[j] = (char) otherData.keyAt(i);
575  45 values[j] = otherData.valueAt(i);
576  45 j++;
577    }
578    }
579   
580  311770 return new SymbolCounts(symbols, values);
581    }
582   
583    /**
584    * Returns a tooltip string showing residues in descending order of their
585    * percentage frequency in the profile
586    *
587    * @param normaliseBy
588    * the divisor for residue counts (may or may not include gapped
589    * sequence count)
590    * @param percentageDecPl
591    * the number of decimal places to show in percentages
592    * @return
593    */
 
594  62514 toggle public String getTooltip(int normaliseBy, int percentageDecPl)
595    {
596  62514 SymbolCounts symbolCounts = getSymbolCounts();
597  62514 char[] ca = symbolCounts.symbols;
598  62514 int[] vl = symbolCounts.values;
599   
600    /*
601    * sort characters into ascending order of their counts
602    */
603  62514 QuickSort.sort(vl, ca);
604   
605    /*
606    * traverse in reverse order (highest count first) to build tooltip
607    */
608  62514 boolean first = true;
609  62514 StringBuilder sb = new StringBuilder(64);
610  188438 for (int c = ca.length - 1; c >= 0; c--)
611    {
612  125924 final char residue = ca[c];
613    // TODO combine residues which share a percentage
614    // (see AAFrequency.completeCdnaConsensus)
615  125924 float tval = (vl[c] * 100f) / normaliseBy;
616  125924 sb.append(first ? "" : "; ").append(residue).append(" ");
617  125924 Format.appendPercentage(sb, tval, percentageDecPl);
618  125924 sb.append("%");
619  125924 first = false;
620    }
621  62514 return sb.toString();
622    }
623   
624    /**
625    * Returns a string representation of the symbol counts, for debug purposes.
626    */
 
627  3 toggle @Override
628    public String toString()
629    {
630  3 StringBuilder sb = new StringBuilder();
631  3 sb.append("[ ");
632  3 SymbolCounts sc = getSymbolCounts();
633  12 for (int i = 0; i < sc.symbols.length; i++)
634    {
635  9 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
636    }
637  3 sb.append("]");
638  3 return sb.toString();
639    }
640    }