Clover icon

Coverage Report

  1. Project Clover database Thu Dec 4 2025 14:43:25 GMT
  2. Package jalview.datamodel

File ResidueCount.java

 

Coverage histogram

../../img/srcFileCovDistChart10.png
0% of files have more coverage

Code metrics

106
158
26
2
675
390
82
0.52
6.08
13
3.15

Classes

Class Line # Actions
ResidueCount 37 156 81
1.0100%
ResidueCount.SymbolCounts 42 2 1
1.0100%
 

Contributing tests

This file is covered by 224 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.datamodel;
22   
23    import jalview.util.Comparison;
24    import jalview.util.Format;
25    import jalview.util.QuickSort;
26    import jalview.util.SparseCount;
27   
28    import java.util.List;
29   
30    /**
31    * A class to count occurrences of residues in a profile, optimised for speed
32    * and memory footprint.
33    *
34    * @author gmcarstairs
35    *
36    */
 
37    public class ResidueCount
38    {
39    /**
40    * A data bean to hold the results of counting symbols
41    */
 
42    public class SymbolCounts
43    {
44    /**
45    * the symbols seen (as char values), in no particular order
46    */
47    public final char[] symbols;
48   
49    /**
50    * the counts for each symbol, in the same order as the symbols
51    */
52    public final int[] values;
53   
 
54  195188 toggle SymbolCounts(char[] s, int[] v)
55    {
56  195197 symbols = s;
57  195203 values = v;
58    }
59    }
60   
61    private static final int TOUPPERCASE = 'A' - 'a';
62   
63    /*
64    * nucleotide symbols to count (including N unknown)
65    */
66    private static final String NUCS = "ACGNTU";
67   
68    /*
69    * amino acid symbols to count (including X unknown)
70    * NB we also include U so as to support counting of RNA bases
71    * in the "don't know" case of nucleotide / peptide
72    */
73    private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
74   
75    static final int GAP_COUNT = 0;
76   
77    /*
78    * fast lookup tables holding the index into our count
79    * arrays of each symbol; index 0 is reserved for gap counting
80    */
81    private static int[] NUC_INDEX = new int[26];
82   
83    private static int[] AA_INDEX = new int[26];
 
84  53 toggle static
85    {
86  371 for (int i = 0; i < NUCS.length(); i++)
87    {
88  318 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
89    }
90  1219 for (int i = 0; i < AAS.length(); i++)
91    {
92  1166 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
93    }
94    }
95   
96    /*
97    * counts array, just big enough for the nucleotide or peptide
98    * character set (plus gap counts in position 0)
99    */
100    private short[] counts;
101   
102    /*
103    * alternative array of int counts for use if any count
104    * exceeds the maximum value of short (32767)
105    */
106    private int[] intCounts;
107   
108    /*
109    * flag set if we switch from short to int counts
110    */
111    private boolean useIntCounts;
112   
113    /*
114    * general-purpose counter, only for use for characters
115    * that are not in the expected alphabet
116    */
117    private SparseCount otherData;
118   
119    /*
120    * keeps track of the maximum count value recorded
121    * (if this class ever allows decrements, would need to
122    * calculate this on request instead)
123    */
124    int maxCount;
125   
126    /*
127    * if we think we are counting nucleotide, can get by with smaller
128    * array to hold counts
129    */
130    private boolean isNucleotide;
131   
132    /**
133    * Default constructor allocates arrays able to count either nucleotide or
134    * peptide bases. Use this constructor if not sure which the data is.
135    */
 
136  482 toggle public ResidueCount()
137    {
138  482 this(false);
139    }
140   
141    /**
142    * Constructor that allocates an array just big enough for the anticipated
143    * characters, plus one position to count gaps
144    */
 
145  769798 toggle public ResidueCount(boolean nucleotide)
146    {
147  769795 isNucleotide = nucleotide;
148  769801 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
149  769809 counts = new short[charsToCount + 1];
150    }
151   
152    /**
153    * A constructor that counts frequency of all symbols (including gaps) in the
154    * sequences (not case-sensitive)
155    *
156    * @param sequences
157    */
 
158  1 toggle public ResidueCount(List<SequenceI> sequences)
159    {
160  1 this();
161  1 for (SequenceI seq : sequences)
162    {
163  24 for (int i = 0; i < seq.getLength(); i++)
164    {
165  22 add(seq.getCharAt(i));
166    }
167    }
168    }
169   
170    /**
171    * Increments the count for the given character. The supplied character may be
172    * upper or lower case but counts are for the upper case only. Gap characters
173    * (space, ., -) are all counted together.
174    *
175    * @param c
176    * @return the new value of the count for the character
177    */
 
178  11588316 toggle public int add(final char c)
179    {
180  11588696 char u = toUpperCase(c);
181  11588073 int newValue = 0;
182  11588173 int offset = getOffset(u);
183   
184    /*
185    * offset 0 is reserved for gap counting, so 0 here means either
186    * an unexpected character, or a gap character passed in error
187    */
188  11589680 if (offset == 0)
189    {
190  8748260 if (Comparison.isGap(u))
191    {
192  8748122 newValue = addGap();
193    }
194    else
195    {
196  143 newValue = addOtherCharacter(u);
197    }
198    }
199    else
200    {
201  2841892 newValue = increment(offset);
202    }
203  11591971 return newValue;
204    }
205   
206    /**
207    * Increment the count at the specified offset. If this would result in short
208    * overflow, promote to counting int values instead.
209    *
210    * @param offset
211    * @return the new value of the count at this offset
212    */
 
213  11915161 toggle int increment(int offset)
214    {
215  11915314 int newValue = 0;
216  11915772 if (useIntCounts)
217    {
218  2 newValue = intCounts[offset];
219  2 intCounts[offset] = ++newValue;
220    }
221    else
222    {
223  11915678 if (counts[offset] == Short.MAX_VALUE)
224    {
225  8 handleOverflow();
226  8 newValue = intCounts[offset];
227  8 intCounts[offset] = ++newValue;
228    }
229    else
230    {
231  11915747 newValue = counts[offset];
232  11915929 counts[offset] = (short) ++newValue;
233    }
234    }
235   
236  11916966 if (offset != GAP_COUNT)
237    {
238    // update modal residue count
239  2843558 maxCount = Math.max(maxCount, newValue);
240    }
241  11916499 return newValue;
242    }
243   
244    /**
245    * Switch from counting in short to counting in int
246    */
 
247  10 toggle synchronized void handleOverflow()
248    {
249  10 intCounts = new int[counts.length];
250  144 for (int i = 0; i < counts.length; i++)
251    {
252  134 intCounts[i] = counts[i];
253    }
254  10 counts = null;
255  10 useIntCounts = true;
256    }
257   
258    /**
259    * Returns this character's offset in the count array
260    *
261    * @param c
262    * @return
263    */
 
264  11597141 toggle int getOffset(char c)
265    {
266  11597232 int offset = 0;
267  11598284 if ('A' <= c && c <= 'Z')
268    {
269  2850804 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
270    }
271  11598705 return offset;
272    }
273   
274    /**
275    * @param c
276    * @return
277    */
 
278  11597489 toggle protected char toUpperCase(final char c)
279    {
280  11597511 char u = c;
281  11597666 if ('a' <= c && c <= 'z')
282    {
283  3447 u = (char) (c + TOUPPERCASE);
284    }
285  11597885 return u;
286    }
287   
288    /**
289    * Increment count for some unanticipated character. The first time this
290    * called, a SparseCount is instantiated to hold these 'extra' counts.
291    *
292    * @param c
293    * @return the new value of the count for the character
294    */
 
295  143 toggle int addOtherCharacter(char c)
296    {
297  143 if (otherData == null)
298    {
299  128 otherData = new SparseCount();
300    }
301  143 int newValue = otherData.add(c, 1);
302  143 maxCount = Math.max(maxCount, newValue);
303  143 return newValue;
304    }
305   
306    /**
307    * Set count for some unanticipated character. The first time this called, a
308    * SparseCount is instantiated to hold these 'extra' counts.
309    *
310    * @param c
311    * @param value
312    */
 
313  5 toggle void setOtherCharacter(char c, int value)
314    {
315  5 if (otherData == null)
316    {
317  2 otherData = new SparseCount();
318    }
319  5 otherData.put(c, value);
320    }
321   
322    /**
323    * Increment count of gap characters
324    *
325    * @return the new count of gaps
326    */
 
327  9073472 toggle public int addGap()
328    {
329  9073475 int newValue = increment(GAP_COUNT);
330  9073487 return newValue;
331    }
332   
333    /**
334    * Answers true if we are counting ints (only after overflow of short counts)
335    *
336    * @return
337    */
 
338  13 toggle boolean isCountingInts()
339    {
340  13 return useIntCounts;
341    }
342   
343    /**
344    * Sets the count for the given character. The supplied character may be upper
345    * or lower case but counts are for the upper case only.
346    *
347    * @param c
348    * @param count
349    */
 
350  9446 toggle public void put(char c, int count)
351    {
352  9446 char u = toUpperCase(c);
353  9446 int offset = getOffset(u);
354   
355    /*
356    * offset 0 is reserved for gap counting, so 0 here means either
357    * an unexpected character, or a gap character passed in error
358    */
359  9446 if (offset == 0)
360    {
361  8 if (Comparison.isGap(u))
362    {
363  3 set(0, count);
364    }
365    else
366    {
367  5 setOtherCharacter(u, count);
368  5 maxCount = Math.max(maxCount, count);
369    }
370    }
371    else
372    {
373  9438 set(offset, count);
374  9438 maxCount = Math.max(maxCount, count);
375    }
376    }
377   
378    /**
379    * Sets the count at the specified offset. If this would result in short
380    * overflow, promote to counting int values instead.
381    *
382    * @param offset
383    * @param value
384    */
 
385  9442 toggle void set(int offset, int value)
386    {
387  9442 if (useIntCounts)
388    {
389  1 intCounts[offset] = value;
390    }
391    else
392    {
393  9441 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
394    {
395  2 handleOverflow();
396  2 intCounts[offset] = value;
397    }
398    else
399    {
400  9439 counts[offset] = (short) value;
401    }
402    }
403    }
404   
405    /**
406    * Returns the count for the given character, or zero if no count held
407    *
408    * @param c
409    * @return
410    */
 
411  118 toggle public int getCount(char c)
412    {
413  118 char u = toUpperCase(c);
414  118 int offset = getOffset(u);
415  118 if (offset == 0)
416    {
417  13 if (!Comparison.isGap(u))
418    {
419    // should have called getGapCount()
420  8 return otherData == null ? 0 : otherData.get(u);
421    }
422    }
423  110 return useIntCounts ? intCounts[offset] : counts[offset];
424    }
425   
 
426  769838 toggle public int getGapCount()
427    {
428  769841 return useIntCounts ? intCounts[0] : counts[0];
429    }
430   
431    /**
432    * Answers true if this object wraps a counter for unexpected characters
433    *
434    * @return
435    */
 
436  7 toggle boolean isUsingOtherData()
437    {
438  7 return otherData != null;
439    }
440   
441    /**
442    * Returns the character (or concatenated characters) for the symbol(s) with
443    * the given count in the profile. Can be used to get the modal residue by
444    * supplying the modal count value. Returns an empty string if no symbol has
445    * the given count. The symbols are in alphabetic order of standard peptide or
446    * nucleotide characters, followed by 'other' symbols if any.
447    *
448    * @return
449    */
 
450  654899 toggle public String getResiduesForCount(int count)
451    {
452  654902 if (count == 0)
453    {
454  4719 return "";
455    }
456   
457    /*
458    * find counts for the given value and append the
459    * corresponding symbol
460    */
461  650182 StringBuilder modal = new StringBuilder();
462  650192 if (useIntCounts)
463    {
464  60 for (int i = 1; i < intCounts.length; i++)
465    {
466  56 if (intCounts[i] == count)
467    {
468  4 modal.append(
469  4 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
470    }
471    }
472    }
473    else
474    {
475  6695852 for (int i = 1; i < counts.length; i++)
476    {
477  6045656 if (counts[i] == count)
478    {
479  659986 modal.append(
480  659985 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
481    }
482    }
483    }
484  650215 if (otherData != null)
485    {
486  166 for (int i = 0; i < otherData.size(); i++)
487    {
488  83 if (otherData.valueAt(i) == count)
489    {
490  81 modal.append((char) otherData.keyAt(i));
491    }
492    }
493    }
494  650215 return modal.toString();
495    }
496   
497    /**
498    * Returns the highest count for any symbol(s) in the profile (excluding gap)
499    *
500    * @return
501    */
 
502  654889 toggle public int getModalCount()
503    {
504  654892 return maxCount;
505    }
506   
507    /**
508    * Returns the number of distinct symbols with a non-zero count (excluding the
509    * gap symbol)
510    *
511    * @return
512    */
 
513  195252 toggle public int size()
514    {
515  195249 int size = 0;
516  195249 if (useIntCounts)
517    {
518  53 for (int i = 1; i < intCounts.length; i++)
519    {
520  50 if (intCounts[i] > 0)
521    {
522  10 size++;
523    }
524    }
525    }
526    else
527    {
528  4486438 for (int i = 1; i < counts.length; i++)
529    {
530  4291758 if (counts[i] > 0)
531    {
532  426691 size++;
533    }
534    }
535    }
536   
537    /*
538    * include 'other' characters recorded (even if count is zero
539    * though that would be a strange use case)
540    */
541  195106 if (otherData != null)
542    {
543  71 size += otherData.size();
544    }
545   
546  195111 return size;
547    }
548   
549    /**
550    * Returns a data bean holding those symbols that have a non-zero count
551    * (excluding the gap symbol), with their counts.
552    *
553    * @return
554    */
 
555  195260 toggle public SymbolCounts getSymbolCounts()
556    {
557  195260 int size = size();
558  195113 char[] symbols = new char[size];
559  195127 int[] values = new int[size];
560  195147 int j = 0;
561   
562  195153 if (useIntCounts)
563    {
564  53 for (int i = 1; i < intCounts.length; i++)
565    {
566  50 if (intCounts[i] > 0)
567    {
568  10 char symbol = isNucleotide ? NUCS.charAt(i - 1)
569    : AAS.charAt(i - 1);
570  10 symbols[j] = symbol;
571  10 values[j] = intCounts[i];
572  10 j++;
573    }
574    }
575    }
576    else
577    {
578  4486722 for (int i = 1; i < counts.length; i++)
579    {
580  4292104 if (counts[i] > 0)
581    {
582  426690 char symbol = isNucleotide ? NUCS.charAt(i - 1)
583    : AAS.charAt(i - 1);
584  426685 symbols[j] = symbol;
585  426698 values[j] = counts[i];
586  426717 j++;
587    }
588    }
589    }
590  195174 if (otherData != null)
591    {
592  144 for (int i = 0; i < otherData.size(); i++)
593    {
594  73 symbols[j] = (char) otherData.keyAt(i);
595  73 values[j] = otherData.valueAt(i);
596  73 j++;
597    }
598    }
599   
600  195170 return new SymbolCounts(symbols, values);
601    }
602   
603    /**
604    * Returns a tooltip string showing residues in descending order of their
605    * percentage frequency in the profile
606    *
607    * @param normaliseBy
608    * the divisor for residue counts (may or may not include gapped
609    * sequence count)
610    * @param percentageDecPl
611    * the number of decimal places to show in percentages
612    * @return
613    */
 
614  53116 toggle public String getTooltip(int normaliseBy, int percentageDecPl)
615    {
616  53115 SymbolCounts symbolCounts = getSymbolCounts();
617  53116 char[] ca = symbolCounts.symbols;
618  53116 int[] vl = symbolCounts.values;
619   
620    /*
621    * sort characters into ascending order of their counts
622    */
623  53116 QuickSort.sort(vl, ca);
624   
625    /*
626    * traverse in reverse order (highest count first) to build tooltip
627    */
628  53117 boolean first = true;
629  53117 StringBuilder sb = new StringBuilder(64);
630  158642 for (int c = ca.length - 1; c >= 0; c--)
631    {
632  105526 final char residue = ca[c];
633    // TODO combine residues which share a percentage
634    // (see AAFrequency.completeCdnaConsensus)
635  105518 float tval = (vl[c] * 100f) / normaliseBy;
636  105534 sb.append(first ? "" : "; ").append(residue).append(" ");
637  105513 Format.appendPercentage(sb, tval, percentageDecPl);
638  105529 sb.append("%");
639  105523 first = false;
640    }
641  53111 return sb.toString();
642    }
643   
644    /**
645    * Returns a string representation of the symbol counts, for debug purposes.
646    */
 
647  3 toggle @Override
648    public String toString()
649    {
650  3 StringBuilder sb = new StringBuilder();
651  3 sb.append("[ ");
652  3 SymbolCounts sc = getSymbolCounts();
653  12 for (int i = 0; i < sc.symbols.length; i++)
654    {
655  9 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
656    }
657  3 sb.append("]");
658  3 return sb.toString();
659    }
660   
661    /**
662    * Answers the total count for all symbols (excluding gaps)
663    *
664    * @return
665    */
 
666  6 toggle public int getTotalResidueCount()
667    {
668  6 int total = 0;
669  6 for (char symbol : this.getSymbolCounts().symbols)
670    {
671  32 total += getCount(symbol);
672    }
673  6 return total;
674    }
675    }