Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
SecondaryStructureCount | 35 | 144 | 70 | ||
SecondaryStructureCount.SymbolCounts | 40 | 2 | 1 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.datamodel; | |
22 | ||
23 | import jalview.util.Comparison; | |
24 | import jalview.util.Format; | |
25 | import jalview.util.QuickSort; | |
26 | import jalview.util.SparseCount; | |
27 | ||
28 | /** | |
29 | * A class to count occurrences of residues in a profile, optimised for speed | |
30 | * and memory footprint. | |
31 | * | |
32 | * @author gmcarstairs | |
33 | * | |
34 | */ | |
35 | public class SecondaryStructureCount | |
36 | { | |
37 | /** | |
38 | * A data bean to hold the results of counting symbols | |
39 | */ | |
40 | public class SymbolCounts | |
41 | { | |
42 | /** | |
43 | * the symbols seen (as char values), in no particular order | |
44 | */ | |
45 | public final char[] symbols; | |
46 | ||
47 | /** | |
48 | * the counts for each symbol, in the same order as the symbols | |
49 | */ | |
50 | public final int[] values; | |
51 | ||
52 | 47156 | SymbolCounts(char[] s, int[] v) |
53 | { | |
54 | 47156 | symbols = s; |
55 | 47156 | values = v; |
56 | } | |
57 | } | |
58 | ||
59 | private static final int TOUPPERCASE = 'A' - 'a'; | |
60 | ||
61 | /* | |
62 | * nucleotide symbols to count (including N unknown) | |
63 | */ | |
64 | private static final String SS_SYMBOLS = "HEC"; | |
65 | ||
66 | static final int GAP_COUNT = 0; | |
67 | ||
68 | /* | |
69 | * fast lookup tables holding the index into our count | |
70 | * arrays of each symbol; index 0 is reserved for gap counting | |
71 | */ | |
72 | private static int[] SS_INDEX = new int[26]; | |
73 | ||
74 | 50 | static |
75 | { | |
76 | 200 | for (int i = 0; i < SS_SYMBOLS.length(); i++) |
77 | { | |
78 | 150 | SS_INDEX[SS_SYMBOLS.charAt(i) - 'A'] = i + 1; |
79 | } | |
80 | } | |
81 | ||
82 | /* | |
83 | * counts array, just big enough for the nucleotide or peptide | |
84 | * character set (plus gap counts in position 0) | |
85 | */ | |
86 | private short[] counts; | |
87 | ||
88 | /* | |
89 | * alternative array of int counts for use if any count | |
90 | * exceeds the maximum value of short (32767) | |
91 | */ | |
92 | private int[] intCounts; | |
93 | ||
94 | /* | |
95 | * flag set if we switch from short to int counts | |
96 | */ | |
97 | private boolean useIntCounts; | |
98 | ||
99 | /* | |
100 | * general-purpose counter, only for use for characters | |
101 | * that are not in the expected alphabet | |
102 | */ | |
103 | private SparseCount otherData; | |
104 | ||
105 | /* | |
106 | * keeps track of the maximum count value recorded | |
107 | * (if this class ever allows decrements, would need to | |
108 | * calculate this on request instead) | |
109 | */ | |
110 | int maxCount; | |
111 | ||
112 | /** | |
113 | * Constructor that allocates an array just big enough for the anticipated | |
114 | * characters, plus one position to count gaps | |
115 | */ | |
116 | 623196 | public SecondaryStructureCount() |
117 | { | |
118 | // isSS = true; | |
119 | 623203 | int charsToCount = SS_SYMBOLS.length(); |
120 | 623202 | counts = new short[charsToCount + 1]; |
121 | } | |
122 | ||
123 | /** | |
124 | * Increments the count for the given character. The supplied character may be | |
125 | * upper or lower case but counts are for the upper case only. Gap characters | |
126 | * (space, ., -) are all counted together. | |
127 | * | |
128 | * @param c | |
129 | * @return the new value of the count for the character | |
130 | */ | |
131 | 61280 | public int add(final char c) |
132 | { | |
133 | 61280 | char u = toUpperCase(c); |
134 | 61280 | int newValue = 0; |
135 | 61280 | int offset = getOffset(u); |
136 | ||
137 | /* | |
138 | * offset 0 is reserved for gap counting, so 0 here means either | |
139 | * an unexpected character, or a gap character passed in error | |
140 | */ | |
141 | 61280 | if (offset == 0) |
142 | { | |
143 | 9232 | if (Comparison.isGap(u)) |
144 | { | |
145 | 0 | newValue = addGap(); |
146 | } | |
147 | else | |
148 | { | |
149 | 9232 | newValue = addOtherCharacter(u); |
150 | } | |
151 | } | |
152 | else | |
153 | { | |
154 | 52048 | newValue = increment(offset); |
155 | } | |
156 | 61280 | return newValue; |
157 | } | |
158 | ||
159 | /** | |
160 | * Increment the count at the specified offset. If this would result in short | |
161 | * overflow, promote to counting int values instead. | |
162 | * | |
163 | * @param offset | |
164 | * @return the new value of the count at this offset | |
165 | */ | |
166 | 86472 | int increment(int offset) |
167 | { | |
168 | 86472 | int newValue = 0; |
169 | 86472 | if (useIntCounts) |
170 | { | |
171 | 0 | newValue = intCounts[offset]; |
172 | 0 | intCounts[offset] = ++newValue; |
173 | } | |
174 | else | |
175 | { | |
176 | 86472 | if (counts[offset] == Short.MAX_VALUE) |
177 | { | |
178 | 0 | handleOverflow(); |
179 | 0 | newValue = intCounts[offset]; |
180 | 0 | intCounts[offset] = ++newValue; |
181 | } | |
182 | else | |
183 | { | |
184 | 86472 | newValue = counts[offset]; |
185 | 86472 | counts[offset] = (short) ++newValue; |
186 | } | |
187 | } | |
188 | ||
189 | 86472 | if (offset != GAP_COUNT) |
190 | { | |
191 | // update modal residue count | |
192 | 52048 | maxCount = Math.max(maxCount, newValue); |
193 | } | |
194 | 86472 | return newValue; |
195 | } | |
196 | ||
197 | /** | |
198 | * Switch from counting in short to counting in int | |
199 | */ | |
200 | 0 | synchronized void handleOverflow() |
201 | { | |
202 | 0 | intCounts = new int[counts.length]; |
203 | 0 | for (int i = 0; i < counts.length; i++) |
204 | { | |
205 | 0 | intCounts[i] = counts[i]; |
206 | } | |
207 | 0 | counts = null; |
208 | 0 | useIntCounts = true; |
209 | } | |
210 | ||
211 | /** | |
212 | * Returns this character's offset in the count array | |
213 | * | |
214 | * @param c | |
215 | * @return | |
216 | */ | |
217 | 61280 | int getOffset(char c) |
218 | { | |
219 | 61280 | int offset = 0; |
220 | 61280 | if ('A' <= c && c <= 'Z') |
221 | { | |
222 | 52048 | offset = SS_INDEX[c - 'A']; |
223 | } | |
224 | 61280 | return offset; |
225 | } | |
226 | ||
227 | /** | |
228 | * @param c | |
229 | * @return | |
230 | */ | |
231 | 61280 | protected char toUpperCase(final char c) |
232 | { | |
233 | 61280 | char u = c; |
234 | 61280 | if ('a' <= c && c <= 'z') |
235 | { | |
236 | 0 | u = (char) (c + TOUPPERCASE); |
237 | } | |
238 | 61280 | return u; |
239 | } | |
240 | ||
241 | /** | |
242 | * Increment count for some unanticipated character. The first time this | |
243 | * called, a SparseCount is instantiated to hold these 'extra' counts. | |
244 | * | |
245 | * @param c | |
246 | * @return the new value of the count for the character | |
247 | */ | |
248 | 9232 | int addOtherCharacter(char c) |
249 | { | |
250 | 9232 | if (otherData == null) |
251 | { | |
252 | 344 | otherData = new SparseCount(); |
253 | } | |
254 | 9232 | int newValue = otherData.add(c, 1); |
255 | 9232 | maxCount = Math.max(maxCount, newValue); |
256 | 9232 | return newValue; |
257 | } | |
258 | ||
259 | /** | |
260 | * Set count for some unanticipated character. The first time this called, a | |
261 | * SparseCount is instantiated to hold these 'extra' counts. | |
262 | * | |
263 | * @param c | |
264 | * @param value | |
265 | */ | |
266 | 0 | void setOtherCharacter(char c, int value) |
267 | { | |
268 | 0 | if (otherData == null) |
269 | { | |
270 | 0 | otherData = new SparseCount(); |
271 | } | |
272 | 0 | otherData.put(c, value); |
273 | } | |
274 | ||
275 | /** | |
276 | * Increment count of gap characters | |
277 | * | |
278 | * @return the new count of gaps | |
279 | */ | |
280 | 34424 | public int addGap() |
281 | { | |
282 | 34424 | int newValue = increment(GAP_COUNT); |
283 | 34424 | return newValue; |
284 | } | |
285 | ||
286 | /** | |
287 | * Answers true if we are counting ints (only after overflow of short counts) | |
288 | * | |
289 | * @return | |
290 | */ | |
291 | 0 | boolean isCountingInts() |
292 | { | |
293 | 0 | return useIntCounts; |
294 | } | |
295 | ||
296 | /** | |
297 | * Sets the count for the given character. The supplied character may be upper | |
298 | * or lower case but counts are for the upper case only. | |
299 | * | |
300 | * @param c | |
301 | * @param count | |
302 | */ | |
303 | 0 | public void put(char c, int count) |
304 | { | |
305 | 0 | char u = toUpperCase(c); |
306 | 0 | int offset = getOffset(u); |
307 | ||
308 | /* | |
309 | * offset 0 is reserved for gap counting, so 0 here means either | |
310 | * an unexpected character, or a gap character passed in error | |
311 | */ | |
312 | 0 | if (offset == 0) |
313 | { | |
314 | 0 | if (Comparison.isGap(u)) |
315 | { | |
316 | 0 | set(0, count); |
317 | } | |
318 | else | |
319 | { | |
320 | 0 | setOtherCharacter(u, count); |
321 | 0 | maxCount = Math.max(maxCount, count); |
322 | } | |
323 | } | |
324 | else | |
325 | { | |
326 | 0 | set(offset, count); |
327 | 0 | maxCount = Math.max(maxCount, count); |
328 | } | |
329 | } | |
330 | ||
331 | /** | |
332 | * Sets the count at the specified offset. If this would result in short | |
333 | * overflow, promote to counting int values instead. | |
334 | * | |
335 | * @param offset | |
336 | * @param value | |
337 | */ | |
338 | 0 | void set(int offset, int value) |
339 | { | |
340 | 0 | if (useIntCounts) |
341 | { | |
342 | 0 | intCounts[offset] = value; |
343 | } | |
344 | else | |
345 | { | |
346 | 0 | if (value > Short.MAX_VALUE || value < Short.MIN_VALUE) |
347 | { | |
348 | 0 | handleOverflow(); |
349 | 0 | intCounts[offset] = value; |
350 | } | |
351 | else | |
352 | { | |
353 | 0 | counts[offset] = (short) value; |
354 | } | |
355 | } | |
356 | } | |
357 | ||
358 | /** | |
359 | * Returns the count for the given character, or zero if no count held | |
360 | * | |
361 | * @param c | |
362 | * @return | |
363 | */ | |
364 | 0 | public int getCount(char c) |
365 | { | |
366 | 0 | char u = toUpperCase(c); |
367 | 0 | int offset = getOffset(u); |
368 | 0 | if (offset == 0) |
369 | { | |
370 | 0 | if (!Comparison.isGap(u)) |
371 | { | |
372 | // should have called getGapCount() | |
373 | 0 | return otherData == null ? 0 : otherData.get(u); |
374 | } | |
375 | } | |
376 | 0 | return useIntCounts ? intCounts[offset] : counts[offset]; |
377 | } | |
378 | ||
379 | 623126 | public int getGapCount() |
380 | { | |
381 | 623141 | return useIntCounts ? intCounts[0] : counts[0]; |
382 | } | |
383 | ||
384 | /** | |
385 | * Answers true if this object wraps a counter for unexpected characters | |
386 | * | |
387 | * @return | |
388 | */ | |
389 | 0 | boolean isUsingOtherData() |
390 | { | |
391 | 0 | return otherData != null; |
392 | } | |
393 | ||
394 | /** | |
395 | * Returns the character (or concatenated characters) for the symbol(s) with | |
396 | * the given count in the profile. Can be used to get the modal residue by | |
397 | * supplying the modal count value. Returns an empty string if no symbol has | |
398 | * the given count. The symbols are in alphabetic order of standard peptide or | |
399 | * nucleotide characters, followed by 'other' symbols if any. | |
400 | * | |
401 | * @return | |
402 | */ | |
403 | 623071 | public String getSSForCount(int count) |
404 | { | |
405 | 623138 | if (count == 0) |
406 | { | |
407 | 604409 | return ""; |
408 | } | |
409 | ||
410 | /* | |
411 | * find counts for the given value and append the | |
412 | * corresponding symbol | |
413 | */ | |
414 | 18737 | StringBuilder modal = new StringBuilder(); |
415 | 18737 | if (useIntCounts) |
416 | { | |
417 | 0 | for (int i = 1; i < intCounts.length; i++) |
418 | { | |
419 | 0 | if (intCounts[i] == count) |
420 | { | |
421 | 0 | modal.append(SS_SYMBOLS.charAt(i - 1)); |
422 | } | |
423 | } | |
424 | } | |
425 | else | |
426 | { | |
427 | 74948 | for (int i = 1; i < counts.length; i++) |
428 | { | |
429 | 56211 | if (counts[i] == count) |
430 | { | |
431 | 19585 | modal.append(SS_SYMBOLS.charAt(i - 1)); |
432 | } | |
433 | } | |
434 | } | |
435 | 18737 | if (otherData != null) |
436 | { | |
437 | 840 | for (int i = 0; i < otherData.size(); i++) |
438 | { | |
439 | 496 | if (otherData.valueAt(i) == count) |
440 | { | |
441 | 212 | modal.append((char) otherData.keyAt(i)); |
442 | } | |
443 | } | |
444 | } | |
445 | 18737 | return modal.toString(); |
446 | } | |
447 | ||
448 | /** | |
449 | * Returns the highest count for any symbol(s) in the profile (excluding gap) | |
450 | * | |
451 | * @return | |
452 | */ | |
453 | 623068 | public int getModalCount() |
454 | { | |
455 | 623079 | return maxCount; |
456 | } | |
457 | ||
458 | /** | |
459 | * Returns the number of distinct symbols with a non-zero count (excluding the | |
460 | * gap symbol) | |
461 | * | |
462 | * @return | |
463 | */ | |
464 | 47156 | public int size() |
465 | { | |
466 | 47156 | int size = 0; |
467 | 47156 | if (useIntCounts) |
468 | { | |
469 | 0 | for (int i = 1; i < intCounts.length; i++) |
470 | { | |
471 | 0 | if (intCounts[i] > 0) |
472 | { | |
473 | 0 | size++; |
474 | } | |
475 | } | |
476 | } | |
477 | else | |
478 | { | |
479 | 188624 | for (int i = 1; i < counts.length; i++) |
480 | { | |
481 | 141468 | if (counts[i] > 0) |
482 | { | |
483 | 6806 | size++; |
484 | } | |
485 | } | |
486 | } | |
487 | ||
488 | /* | |
489 | * include 'other' characters recorded (even if count is zero | |
490 | * though that would be a strange use case) | |
491 | */ | |
492 | 47156 | if (otherData != null) |
493 | { | |
494 | 0 | size += otherData.size(); |
495 | } | |
496 | ||
497 | 47156 | return size; |
498 | } | |
499 | ||
500 | /** | |
501 | * Returns a data bean holding those symbols that have a non-zero count | |
502 | * (excluding the gap symbol), with their counts. | |
503 | * | |
504 | * @return | |
505 | */ | |
506 | 47156 | public SymbolCounts getSymbolCounts() |
507 | { | |
508 | 47156 | int size = size(); |
509 | 47156 | char[] symbols = new char[size]; |
510 | 47156 | int[] values = new int[size]; |
511 | 47156 | int j = 0; |
512 | ||
513 | 47156 | if (useIntCounts) |
514 | { | |
515 | 0 | for (int i = 1; i < intCounts.length; i++) |
516 | { | |
517 | 0 | if (intCounts[i] > 0) |
518 | { | |
519 | 0 | char symbol = SS_SYMBOLS.charAt(i - 1); |
520 | 0 | symbols[j] = symbol; |
521 | 0 | values[j] = intCounts[i]; |
522 | 0 | j++; |
523 | } | |
524 | } | |
525 | } | |
526 | else | |
527 | { | |
528 | 188624 | for (int i = 1; i < counts.length; i++) |
529 | { | |
530 | 141468 | if (counts[i] > 0) |
531 | { | |
532 | 6806 | char symbol = SS_SYMBOLS.charAt(i - 1); |
533 | 6806 | symbols[j] = symbol; |
534 | 6806 | values[j] = counts[i]; |
535 | 6806 | j++; |
536 | } | |
537 | } | |
538 | } | |
539 | 47156 | if (otherData != null) |
540 | { | |
541 | 0 | for (int i = 0; i < otherData.size(); i++) |
542 | { | |
543 | 0 | symbols[j] = (char) otherData.keyAt(i); |
544 | 0 | values[j] = otherData.valueAt(i); |
545 | 0 | j++; |
546 | } | |
547 | } | |
548 | ||
549 | 47156 | return new SymbolCounts(symbols, values); |
550 | } | |
551 | ||
552 | /** | |
553 | * Returns a tooltip string showing residues in descending order of their | |
554 | * percentage frequency in the profile | |
555 | * | |
556 | * @param normaliseBy | |
557 | * the divisor for residue counts (may or may not include gapped | |
558 | * sequence count) | |
559 | * @param percentageDecPl | |
560 | * the number of decimal places to show in percentages | |
561 | * @return | |
562 | */ | |
563 | 47156 | public String getTooltip(int normaliseBy, int percentageDecPl) |
564 | { | |
565 | 47156 | SymbolCounts symbolCounts = getSymbolCounts(); |
566 | 47156 | char[] ca = symbolCounts.symbols; |
567 | 47156 | int[] vl = symbolCounts.values; |
568 | ||
569 | /* | |
570 | * sort characters into ascending order of their counts | |
571 | */ | |
572 | 47156 | QuickSort.sort(vl, ca); |
573 | ||
574 | /* | |
575 | * traverse in reverse order (highest count first) to build tooltip | |
576 | */ | |
577 | 47156 | boolean first = true; |
578 | 47156 | StringBuilder sb = new StringBuilder(64); |
579 | 53962 | for (int c = ca.length - 1; c >= 0; c--) |
580 | { | |
581 | 6806 | final char residue = ca[c]; |
582 | // TODO combine residues which share a percentage | |
583 | // (see AAFrequency.completeCdnaConsensus) | |
584 | 6806 | float tval = (vl[c] * 100f) / normaliseBy; |
585 | 6806 | sb.append(first ? "" : "; ").append(residue).append(" "); |
586 | 6806 | Format.appendPercentage(sb, tval, percentageDecPl); |
587 | 6806 | sb.append("%"); |
588 | 6806 | first = false; |
589 | } | |
590 | 47156 | return sb.toString(); |
591 | } | |
592 | ||
593 | /** | |
594 | * Returns a string representation of the symbol counts, for debug purposes. | |
595 | */ | |
596 | 0 | @Override |
597 | public String toString() | |
598 | { | |
599 | 0 | StringBuilder sb = new StringBuilder(); |
600 | 0 | sb.append("[ "); |
601 | 0 | SymbolCounts sc = getSymbolCounts(); |
602 | 0 | for (int i = 0; i < sc.symbols.length; i++) |
603 | { | |
604 | 0 | sb.append(sc.symbols[i]).append(":").append(sc.values[i]).append(" "); |
605 | } | |
606 | 0 | sb.append("]"); |
607 | 0 | return sb.toString(); |
608 | } | |
609 | } |