Clover icon

Coverage Report

  1. Project Clover database Wed Sep 18 2024 02:54:09 BST
  2. Package jalview.datamodel



Coverage histogram

0% of files have more coverage

Code metrics



Class Line # Actions
ResidueCount 35 148 78
ResidueCount.SymbolCounts 40 2 1

Contributing tests

This file is covered by 331 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.datamodel;
23    import jalview.util.Comparison;
24    import jalview.util.Format;
25    import jalview.util.QuickSort;
26    import jalview.util.SparseCount;
28    /**
29    * A class to count occurrences of residues in a profile, optimised for speed
30    * and memory footprint.
31    *
32    * @author gmcarstairs
33    *
34    */
35    public class ResidueCount
36    {
37    /**
38    * A data bean to hold the results of counting symbols
39    */
40    public class SymbolCounts
41    {
42    /**
43    * the symbols seen (as char values), in no particular order
44    */
45    public final char[] symbols;
47    /**
48    * the counts for each symbol, in the same order as the symbols
49    */
50    public final int[] values;
52  303746 toggle SymbolCounts(char[] s, int[] v)
53    {
54  303755 symbols = s;
55  303766 values = v;
56    }
57    }
59    private static final int TOUPPERCASE = 'A' - 'a';
61    /*
62    * nucleotide symbols to count (including N unknown)
63    */
64    private static final String NUCS = "ACGNTU";
66    /*
67    * amino acid symbols to count (including X unknown)
68    * NB we also include U so as to support counting of RNA bases
69    * in the "don't know" case of nucleotide / peptide
70    */
71    private static final String AAS = "ACDEFGHIKLMNPQRSTUVWXY";
73    static final int GAP_COUNT = 0;
75    /*
76    * fast lookup tables holding the index into our count
77    * arrays of each symbol; index 0 is reserved for gap counting
78    */
79    private static int[] NUC_INDEX = new int[26];
81    private static int[] AA_INDEX = new int[26];
82  50 toggle static
83    {
84  350 for (int i = 0; i < NUCS.length(); i++)
85    {
86  300 NUC_INDEX[NUCS.charAt(i) - 'A'] = i + 1;
87    }
88  1150 for (int i = 0; i < AAS.length(); i++)
89    {
90  1100 AA_INDEX[AAS.charAt(i) - 'A'] = i + 1;
91    }
92    }
94    /*
95    * counts array, just big enough for the nucleotide or peptide
96    * character set (plus gap counts in position 0)
97    */
98    private short[] counts;
100    /*
101    * alternative array of int counts for use if any count
102    * exceeds the maximum value of short (32767)
103    */
104    private int[] intCounts;
106    /*
107    * flag set if we switch from short to int counts
108    */
109    private boolean useIntCounts;
111    /*
112    * general-purpose counter, only for use for characters
113    * that are not in the expected alphabet
114    */
115    private SparseCount otherData;
117    /*
118    * keeps track of the maximum count value recorded
119    * (if this class ever allows decrements, would need to
120    * calculate this on request instead)
121    */
122    int maxCount;
124    /*
125    * if we think we are counting nucleotide, can get by with smaller
126    * array to hold counts
127    */
128    private boolean isNucleotide;
130    /**
131    * Default constructor allocates arrays able to count either nucleotide or
132    * peptide bases. Use this constructor if not sure which the data is.
133    */
134  10 toggle public ResidueCount()
135    {
136  10 this(false);
137    }
139    /**
140    * Constructor that allocates an array just big enough for the anticipated
141    * characters, plus one position to count gaps
142    */
143  750266 toggle public ResidueCount(boolean nucleotide)
144    {
145  750271 isNucleotide = nucleotide;
146  750318 int charsToCount = nucleotide ? NUCS.length() : AAS.length();
147  750304 counts = new short[charsToCount + 1];
148    }
150    /**
151    * Increments the count for the given character. The supplied character may be
152    * upper or lower case but counts are for the upper case only. Gap characters
153    * (space, ., -) are all counted together.
154    *
155    * @param c
156    * @return the new value of the count for the character
157    */
158  12078968 toggle public int add(final char c)
159    {
160  12083015 char u = toUpperCase(c);
161  12092957 int newValue = 0;
162  12095738 int offset = getOffset(u);
164    /*
165    * offset 0 is reserved for gap counting, so 0 here means either
166    * an unexpected character, or a gap character passed in error
167    */
168  12120000 if (offset == 0)
169    {
170  8837700 if (Comparison.isGap(u))
171    {
172  8848341 newValue = addGap();
173    }
174    else
175    {
176  104 newValue = addOtherCharacter(u);
177    }
178    }
179    else
180    {
181  3272636 newValue = increment(offset);
182    }
183  12113808 return newValue;
184    }
186    /**
187    * Increment the count at the specified offset. If this would result in short
188    * overflow, promote to counting int values instead.
189    *
190    * @param offset
191    * @return the new value of the count at this offset
192    */
193  12559621 toggle int increment(int offset)
194    {
195  12561194 int newValue = 0;
196  12567735 if (useIntCounts)
197    {
198  2 newValue = intCounts[offset];
199  2 intCounts[offset] = ++newValue;
200    }
201    else
202    {
203  12562617 if (counts[offset] == Short.MAX_VALUE)
204    {
205  8 handleOverflow();
206  8 newValue = intCounts[offset];
207  8 intCounts[offset] = ++newValue;
208    }
209    else
210    {
211  12570736 newValue = counts[offset];
212  12572829 counts[offset] = (short) ++newValue;
213    }
214    }
216  12582704 if (offset != GAP_COUNT)
217    {
218    // update modal residue count
219  3273157 maxCount = Math.max(maxCount, newValue);
220    }
221  12577513 return newValue;
222    }
224    /**
225    * Switch from counting in short to counting in int
226    */
227  10 toggle synchronized void handleOverflow()
228    {
229  10 intCounts = new int[counts.length];
230  144 for (int i = 0; i < counts.length; i++)
231    {
232  134 intCounts[i] = counts[i];
233    }
234  10 counts = null;
235  10 useIntCounts = true;
236    }
238    /**
239    * Returns this character's offset in the count array
240    *
241    * @param c
242    * @return
243    */
244  12088478 toggle int getOffset(char c)
245    {
246  12093047 int offset = 0;
247  12115203 if ('A' <= c && c <= 'Z')
248    {
249  3271462 offset = isNucleotide ? NUC_INDEX[c - 'A'] : AA_INDEX[c - 'A'];
250    }
251  12115751 return offset;
252    }
254    /**
255    * @param c
256    * @return
257    */
258  12081652 toggle protected char toUpperCase(final char c)
259    {
260  12083778 char u = c;
261  12085615 if ('a' <= c && c <= 'z')
262    {
263  5607 u = (char) (c + TOUPPERCASE);
264    }
265  12095018 return u;
266    }
268    /**
269    * Increment count for some unanticipated character. The first time this
270    * called, a SparseCount is instantiated to hold these 'extra' counts.
271    *
272    * @param c
273    * @return the new value of the count for the character
274    */
275  104 toggle int addOtherCharacter(char c)
276    {
277  104 if (otherData == null)
278    {
279  85 otherData = new SparseCount();
280    }
281  104 int newValue = otherData.add(c, 1);
282  104 maxCount = Math.max(maxCount, newValue);
283  104 return newValue;
284    }
286    /**
287    * Set count for some unanticipated character. The first time this called, a
288    * SparseCount is instantiated to hold these 'extra' counts.
289    *
290    * @param c
291    * @param value
292    */
293  5 toggle void setOtherCharacter(char c, int value)
294    {
295  5 if (otherData == null)
296    {
297  2 otherData = new SparseCount();
298    }
299  5 otherData.put(c, value);
300    }
302    /**
303    * Increment count of gap characters
304    *
305    * @return the new count of gaps
306    */
307  9301574 toggle public int addGap()
308    {
309  9302372 int newValue = increment(GAP_COUNT);
310  9306336 return newValue;
311    }
313    /**
314    * Answers true if we are counting ints (only after overflow of short counts)
315    *
316    * @return
317    */
318  13 toggle boolean isCountingInts()
319    {
320  13 return useIntCounts;
321    }
323    /**
324    * Sets the count for the given character. The supplied character may be upper
325    * or lower case but counts are for the upper case only.
326    *
327    * @param c
328    * @param count
329    */
330  66 toggle public void put(char c, int count)
331    {
332  66 char u = toUpperCase(c);
333  66 int offset = getOffset(u);
335    /*
336    * offset 0 is reserved for gap counting, so 0 here means either
337    * an unexpected character, or a gap character passed in error
338    */
339  66 if (offset == 0)
340    {
341  8 if (Comparison.isGap(u))
342    {
343  3 set(0, count);
344    }
345    else
346    {
347  5 setOtherCharacter(u, count);
348  5 maxCount = Math.max(maxCount, count);
349    }
350    }
351    else
352    {
353  58 set(offset, count);
354  58 maxCount = Math.max(maxCount, count);
355    }
356    }
358    /**
359    * Sets the count at the specified offset. If this would result in short
360    * overflow, promote to counting int values instead.
361    *
362    * @param offset
363    * @param value
364    */
365  62 toggle void set(int offset, int value)
366    {
367  62 if (useIntCounts)
368    {
369  1 intCounts[offset] = value;
370    }
371    else
372    {
373  61 if (value > Short.MAX_VALUE || value < Short.MIN_VALUE)
374    {
375  2 handleOverflow();
376  2 intCounts[offset] = value;
377    }
378    else
379    {
380  59 counts[offset] = (short) value;
381    }
382    }
383    }
385    /**
386    * Returns the count for the given character, or zero if no count held
387    *
388    * @param c
389    * @return
390    */
391  62 toggle public int getCount(char c)
392    {
393  62 char u = toUpperCase(c);
394  62 int offset = getOffset(u);
395  62 if (offset == 0)
396    {
397  11 if (!Comparison.isGap(u))
398    {
399    // should have called getGapCount()
400  6 return otherData == null ? 0 : otherData.get(u);
401    }
402    }
403  56 return useIntCounts ? intCounts[offset] : counts[offset];
404    }
406  750181 toggle public int getGapCount()
407    {
408  750244 return useIntCounts ? intCounts[0] : counts[0];
409    }
411    /**
412    * Answers true if this object wraps a counter for unexpected characters
413    *
414    * @return
415    */
416  7 toggle boolean isUsingOtherData()
417    {
418  7 return otherData != null;
419    }
421    /**
422    * Returns the character (or concatenated characters) for the symbol(s) with
423    * the given count in the profile. Can be used to get the modal residue by
424    * supplying the modal count value. Returns an empty string if no symbol has
425    * the given count. The symbols are in alphabetic order of standard peptide or
426    * nucleotide characters, followed by 'other' symbols if any.
427    *
428    * @return
429    */
430  609537 toggle public String getResiduesForCount(int count)
431    {
432  609541 if (count == 0)
433    {
434  6889 return "";
435    }
437    /*
438    * find counts for the given value and append the
439    * corresponding symbol
440    */
441  602662 StringBuilder modal = new StringBuilder();
442  602696 if (useIntCounts)
443    {
444  60 for (int i = 1; i < intCounts.length; i++)
445    {
446  56 if (intCounts[i] == count)
447    {
448  4 modal.append(
449  4 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
450    }
451    }
452    }
453    else
454    {
455  6372676 for (int i = 1; i < counts.length; i++)
456    {
457  5771814 if (counts[i] == count)
458    {
459  613026 modal.append(
460  612997 isNucleotide ? NUCS.charAt(i - 1) : AAS.charAt(i - 1));
461    }
462    }
463    }
464  602645 if (otherData != null)
465    {
466  124 for (int i = 0; i < otherData.size(); i++)
467    {
468  62 if (otherData.valueAt(i) == count)
469    {
470  60 modal.append((char) otherData.keyAt(i));
471    }
472    }
473    }
474  602670 return modal.toString();
475    }
477    /**
478    * Returns the highest count for any symbol(s) in the profile (excluding gap)
479    *
480    * @return
481    */
482  609534 toggle public int getModalCount()
483    {
484  609534 return maxCount;
485    }
487    /**
488    * Returns the number of distinct symbols with a non-zero count (excluding the
489    * gap symbol)
490    *
491    * @return
492    */
493  303795 toggle public int size()
494    {
495  303790 int size = 0;
496  303790 if (useIntCounts)
497    {
498  53 for (int i = 1; i < intCounts.length; i++)
499    {
500  50 if (intCounts[i] > 0)
501    {
502  10 size++;
503    }
504    }
505    }
506    else
507    {
508  6969724 for (int i = 1; i < counts.length; i++)
509    {
510  6668357 if (counts[i] > 0)
511    {
512  635655 size++;
513    }
514    }
515    }
517    /*
518    * include 'other' characters recorded (even if count is zero
519    * though that would be a strange use case)
520    */
521  303692 if (otherData != null)
522    {
523  43 size += otherData.size();
524    }
526  303705 return size;
527    }
529    /**
530    * Returns a data bean holding those symbols that have a non-zero count
531    * (excluding the gap symbol), with their counts.
532    *
533    * @return
534    */
535  303798 toggle public SymbolCounts getSymbolCounts()
536    {
537  303805 int size = size();
538  303717 char[] symbols = new char[size];
539  303739 int[] values = new int[size];
540  303760 int j = 0;
542  303776 if (useIntCounts)
543    {
544  53 for (int i = 1; i < intCounts.length; i++)
545    {
546  50 if (intCounts[i] > 0)
547    {
548  10 char symbol = isNucleotide ? NUCS.charAt(i - 1)
549    : AAS.charAt(i - 1);
550  10 symbols[j] = symbol;
551  10 values[j] = intCounts[i];
552  10 j++;
553    }
554    }
555    }
556    else
557    {
558  6971092 for (int i = 1; i < counts.length; i++)
559    {
560  6670396 if (counts[i] > 0)
561    {
562  635684 char symbol = isNucleotide ? NUCS.charAt(i - 1)
563    : AAS.charAt(i - 1);
564  635739 symbols[j] = symbol;
565  635763 values[j] = counts[i];
566  635795 j++;
567    }
568    }
569    }
570  303721 if (otherData != null)
571    {
572  88 for (int i = 0; i < otherData.size(); i++)
573    {
574  45 symbols[j] = (char) otherData.keyAt(i);
575  45 values[j] = otherData.valueAt(i);
576  45 j++;
577    }
578    }
580  303722 return new SymbolCounts(symbols, values);
581    }
583    /**
584    * Returns a tooltip string showing residues in descending order of their
585    * percentage frequency in the profile
586    *
587    * @param normaliseBy
588    * the divisor for residue counts (may or may not include gapped
589    * sequence count)
590    * @param percentageDecPl
591    * the number of decimal places to show in percentages
592    * @return
593    */
594  62116 toggle public String getTooltip(int normaliseBy, int percentageDecPl)
595    {
596  62116 SymbolCounts symbolCounts = getSymbolCounts();
597  62116 char[] ca = symbolCounts.symbols;
598  62116 int[] vl = symbolCounts.values;
600    /*
601    * sort characters into ascending order of their counts
602    */
603  62116 QuickSort.sort(vl, ca);
605    /*
606    * traverse in reverse order (highest count first) to build tooltip
607    */
608  62116 boolean first = true;
609  62116 StringBuilder sb = new StringBuilder(64);
610  187521 for (int c = ca.length - 1; c >= 0; c--)
611    {
612  125406 final char residue = ca[c];
613    // TODO combine residues which share a percentage
614    // (see AAFrequency.completeCdnaConsensus)
615  125407 float tval = (vl[c] * 100f) / normaliseBy;
616  125415 sb.append(first ? "" : "; ").append(residue).append(" ");
617  125411 Format.appendPercentage(sb, tval, percentageDecPl);
618  125413 sb.append("%");
619  125415 first = false;
620    }
621  62115 return sb.toString();
622    }
624    /**
625    * Returns a string representation of the symbol counts, for debug purposes.
626    */
627  3 toggle @Override
628    public String toString()
629    {
630  3 StringBuilder sb = new StringBuilder();
631  3 sb.append("[ ");
632  3 SymbolCounts sc = getSymbolCounts();
633  12 for (int i = 0; i < sc.symbols.length; i++)
634    {
635  9 sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" ");
636    }
637  3 sb.append("]");
638  3 return sb.toString();
639    }
640    }