Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
AAFrequency | 57 | 310 | 109 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import jalview.datamodel.AlignedCodonFrame; | |
24 | import jalview.datamodel.AlignmentAnnotation; | |
25 | import jalview.datamodel.AlignmentI; | |
26 | import jalview.datamodel.Annotation; | |
27 | import jalview.datamodel.Profile; | |
28 | import jalview.datamodel.ProfileI; | |
29 | import jalview.datamodel.Profiles; | |
30 | import jalview.datamodel.ProfilesI; | |
31 | import jalview.datamodel.ResidueCount; | |
32 | import jalview.datamodel.ResidueCount.SymbolCounts; | |
33 | import jalview.datamodel.SecondaryStructureCount; | |
34 | import jalview.datamodel.SeqCigar; | |
35 | import jalview.datamodel.SequenceI; | |
36 | import jalview.ext.android.SparseIntArray; | |
37 | import jalview.util.Comparison; | |
38 | import jalview.util.Constants; | |
39 | import jalview.util.Format; | |
40 | import jalview.util.MappingUtils; | |
41 | import jalview.util.QuickSort; | |
42 | ||
43 | import java.awt.Color; | |
44 | import java.util.Arrays; | |
45 | import java.util.Hashtable; | |
46 | import java.util.List; | |
47 | ||
/**
 * Computes per-column symbol profiles for a set of aligned sequences over a
 * column range, returned as {@link ProfilesI} (one {@link ProfileI} per
 * column), and derives the consensus, secondary structure consensus, gap
 * count and cDNA consensus annotation rows from that data. This class is used
 * extensively in calculating alignment colourschemes that depend on the
 * amount of conservation in each alignment column.
 * 
 * @author $author$
 * @version $Revision$
 */
57 | public class AAFrequency | |
58 | { | |
/**
 * Key under which the codon count array is stored in (and read back from)
 * the per-column hashtables used for cDNA consensus.
 */
public static final String PROFILE = "P";

/*
 * Quick look-up of the String value of each char 'A' to 'Z', indexed by
 * (char - 'A'). NOTE(review): not referenced by any other code visible in
 * this class.
 */
private static final String[] CHARS = new String['Z' - 'A' + 1];

static
{
  for (int offset = 0; offset < CHARS.length; offset++)
  {
    CHARS[offset] = String.valueOf((char) ('A' + offset));
  }
}
73 | ||
74 | 3 | public static final ProfilesI calculate(List<SequenceI> list, int start, |
75 | int end) | |
76 | { | |
77 | 3 | return calculate(list, start, end, false); |
78 | } | |
79 | ||
80 | 384 | public static final ProfilesI calculate(List<SequenceI> sequences, |
81 | int start, int end, boolean profile) | |
82 | { | |
83 | 384 | SequenceI[] seqs = new SequenceI[sequences.size()]; |
84 | 384 | int width = 0; |
85 | 384 | synchronized (sequences) |
86 | { | |
87 | 3233 | for (int i = 0; i < sequences.size(); i++) |
88 | { | |
89 | 2849 | seqs[i] = sequences.get(i); |
90 | 2849 | int length = seqs[i].getLength(); |
91 | 2849 | if (length > width) |
92 | { | |
93 | 383 | width = length; |
94 | } | |
95 | } | |
96 | ||
97 | 384 | if (end >= width) |
98 | { | |
99 | 213 | end = width; |
100 | } | |
101 | ||
102 | 384 | ProfilesI reply = calculate(seqs, width, start, end, profile); |
103 | 384 | return reply; |
104 | } | |
105 | } | |
106 | ||
107 | /** | |
108 | * Calculate the consensus symbol(s) for each column in the given range. | |
109 | * | |
110 | * @param sequences | |
111 | * @param width | |
112 | * the full width of the alignment | |
113 | * @param start | |
114 | * start column (inclusive, base zero) | |
115 | * @param end | |
116 | * end column (exclusive) | |
117 | * @param saveFullProfile | |
118 | * if true, store all symbol counts | |
119 | */ | |
120 | 1365 | public static final ProfilesI calculate(final SequenceI[] sequences, |
121 | int width, int start, int end, boolean saveFullProfile) | |
122 | { | |
123 | // long now = System.currentTimeMillis(); | |
124 | 1365 | int seqCount = sequences.length; |
125 | 1365 | boolean nucleotide = false; |
126 | 1365 | int nucleotideCount = 0; |
127 | 1365 | int peptideCount = 0; |
128 | ||
129 | 1365 | ProfileI[] result = new ProfileI[width]; |
130 | ||
131 | 613829 | for (int column = start; column < end; column++) |
132 | { | |
133 | /* | |
134 | * Apply a heuristic to detect nucleotide data (which can | |
135 | * be counted in more compact arrays); here we test for | |
136 | * more than 90% nucleotide; recheck every 10 columns in case | |
137 | * of misleading data e.g. highly conserved Alanine in peptide! | |
138 | * Mistakenly guessing nucleotide has a small performance cost, | |
139 | * as it will result in counting in sparse arrays. | |
140 | * Mistakenly guessing peptide has a small space cost, | |
141 | * as it will use a larger than necessary array to hold counts. | |
142 | */ | |
143 | 612470 | if (nucleotideCount > 100 && column % 10 == 0) |
144 | { | |
145 | 55011 | nucleotide = (9 * peptideCount < nucleotideCount); |
146 | } | |
147 | 612478 | ResidueCount residueCounts = new ResidueCount(nucleotide); |
148 | ||
149 | 11336630 | for (int row = 0; row < seqCount; row++) |
150 | { | |
151 | 10736609 | if (sequences[row] == null) |
152 | { | |
153 | 0 | jalview.bin.Console.errPrintln( |
154 | "WARNING: Consensus skipping null sequence - possible race condition."); | |
155 | 0 | continue; |
156 | } | |
157 | 10708677 | if (sequences[row].getLength() > column) |
158 | { | |
159 | 10679071 | char c = sequences[row].getCharAt(column); |
160 | 10677780 | residueCounts.add(c); |
161 | 10718833 | if (Comparison.isNucleotide(c)) |
162 | { | |
163 | 975183 | nucleotideCount++; |
164 | } | |
165 | 9741168 | else if (!Comparison.isGap(c)) |
166 | { | |
167 | 894502 | peptideCount++; |
168 | } | |
169 | } | |
170 | else | |
171 | { | |
172 | /* | |
173 | * count a gap if the sequence doesn't reach this column | |
174 | */ | |
175 | 30133 | residueCounts.addGap(); |
176 | } | |
177 | } | |
178 | ||
179 | 612397 | int maxCount = residueCounts.getModalCount(); |
180 | 612371 | String maxResidue = residueCounts.getResiduesForCount(maxCount); |
181 | 612459 | int gapCount = residueCounts.getGapCount(); |
182 | 612447 | ProfileI profile = new Profile(seqCount, gapCount, maxCount, |
183 | maxResidue); | |
184 | ||
185 | 612486 | if (saveFullProfile) |
186 | { | |
187 | 594147 | profile.setCounts(residueCounts); |
188 | } | |
189 | ||
190 | 612456 | result[column] = profile; |
191 | } | |
192 | 1365 | return new Profiles(seqCount, result); |
193 | // long elapsed = System.currentTimeMillis() - now; | |
194 | // jalview.bin.Console.outPrintln(elapsed); | |
195 | } | |
196 | ||
197 | 0 | public static final ProfilesI calculateSS(List<SequenceI> list, int start, |
198 | int end) | |
199 | { | |
200 | 0 | return calculateSS(list, start, end, false); |
201 | } | |
202 | ||
203 | 381 | public static final ProfilesI calculateSS(List<SequenceI> sequences, |
204 | int start, int end, boolean profile) | |
205 | { | |
206 | 381 | SequenceI[] seqs = new SequenceI[sequences.size()]; |
207 | 381 | int width = 0; |
208 | 381 | synchronized (sequences) |
209 | { | |
210 | 3227 | for (int i = 0; i < sequences.size(); i++) |
211 | { | |
212 | 2846 | seqs[i] = sequences.get(i); |
213 | 2846 | int length = seqs[i].getLength(); |
214 | 2846 | if (length > width) |
215 | { | |
216 | 380 | width = length; |
217 | } | |
218 | } | |
219 | ||
220 | 381 | if (end >= width) |
221 | { | |
222 | 213 | end = width; |
223 | } | |
224 | ||
225 | 381 | ProfilesI reply = calculateSS(seqs, width, start, end, profile); |
226 | 381 | return reply; |
227 | } | |
228 | } | |
229 | ||
230 | 381 | public static final ProfilesI calculateSS(final SequenceI[] sequences, |
231 | int width, int start, int end, boolean saveFullProfile) | |
232 | { | |
233 | ||
234 | 381 | int seqCount = sequences.length; |
235 | ||
236 | 381 | ProfileI[] result = new ProfileI[width]; |
237 | 381 | int maxSSannotcount=0; |
238 | 40439 | for (int column = start; column < end; column++) |
239 | { | |
240 | ||
241 | 40058 | int ssCount = 0; |
242 | ||
243 | 40058 | SecondaryStructureCount ssCounts = new SecondaryStructureCount(); |
244 | ||
245 | 281450 | for (int row = 0; row < seqCount; row++) |
246 | { | |
247 | 241392 | if (sequences[row] == null) |
248 | { | |
249 | 0 | jalview.bin.Console.errPrintln( |
250 | "WARNING: Consensus skipping null sequence - possible race condition."); | |
251 | 0 | continue; |
252 | } | |
253 | ||
254 | 241392 | char c = sequences[row].getCharAt(column); |
255 | 241392 | AlignmentAnnotation aa = AlignmentUtils |
256 | .getDisplayedAlignmentAnnotation(sequences[row]); | |
257 | 241392 | if (aa != null) |
258 | { | |
259 | 0 | ssCount++; |
260 | } | |
261 | ||
262 | 241392 | if (sequences[row].getLength() > column && !Comparison.isGap(c) |
263 | && aa != null) | |
264 | { | |
265 | ||
266 | 0 | int seqPosition = sequences[row].findPosition(column); |
267 | ||
268 | 0 | char ss = AlignmentUtils.findSSAnnotationForGivenSeqposition(aa, |
269 | seqPosition); | |
270 | 0 | if (ss == '*') |
271 | { | |
272 | 0 | continue; |
273 | } | |
274 | 0 | ssCounts.add(ss); |
275 | } | |
276 | 241392 | else if (Comparison.isGap(c) && aa != null) |
277 | { | |
278 | 0 | ssCounts.addGap(); |
279 | } | |
280 | } | |
281 | ||
282 | 40058 | int maxSSCount = ssCounts.getModalCount(); |
283 | 40058 | String maxSS = ssCounts.getSSForCount(maxSSCount); |
284 | 40058 | int gapCount = ssCounts.getGapCount(); |
285 | 40058 | ProfileI profile = new Profile(maxSS, ssCount, gapCount, maxSSCount); |
286 | ||
287 | 40058 | if (saveFullProfile) |
288 | { | |
289 | 21744 | profile.setSSCounts(ssCounts); |
290 | } | |
291 | ||
292 | 40058 | result[column] = profile; |
293 | 40058 | maxSSannotcount=Math.max(maxSSannotcount, ssCount); |
294 | } | |
295 | 381 | return new Profiles(maxSSannotcount,result); |
296 | } | |
297 | ||
298 | /** | |
299 | * Make an estimate of the profile size we are going to compute i.e. how many | |
300 | * different characters may be present in it. Overestimating has a cost of | |
301 | * using more memory than necessary. Underestimating has a cost of needing to | |
302 | * extend the SparseIntArray holding the profile counts. | |
303 | * | |
304 | * @param profileSizes | |
305 | * counts of sizes of profiles so far encountered | |
306 | * @return | |
307 | */ | |
308 | 0 | static int estimateProfileSize(SparseIntArray profileSizes) |
309 | { | |
310 | 0 | if (profileSizes.size() == 0) |
311 | { | |
312 | 0 | return 4; |
313 | } | |
314 | ||
315 | /* | |
316 | * could do a statistical heuristic here e.g. 75%ile | |
317 | * for now just return the largest value | |
318 | */ | |
319 | 0 | return profileSizes.keyAt(profileSizes.size() - 1); |
320 | } | |
321 | ||
322 | /** | |
323 | * Derive the consensus annotations to be added to the alignment for display. | |
324 | * This does not recompute the raw data, but may be called on a change in | |
325 | * display options, such as 'ignore gaps', which may in turn result in a | |
326 | * change in the derived values. | |
327 | * | |
328 | * @param consensus | |
329 | * the annotation row to add annotations to | |
330 | * @param profiles | |
331 | * the source consensus data | |
332 | * @param startCol | |
333 | * start column (inclusive) | |
334 | * @param endCol | |
335 | * end column (exclusive) | |
336 | * @param ignoreGaps | |
337 | * if true, normalise residue percentages ignoring gaps | |
338 | * @param showSequenceLogo | |
339 | * if true include all consensus symbols, else just show modal | |
340 | * residue | |
341 | * @param nseq | |
342 | * number of sequences | |
343 | */ | |
344 | 1114 | public static void completeConsensus(AlignmentAnnotation consensus, |
345 | ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps, | |
346 | boolean showSequenceLogo, long nseq) | |
347 | { | |
348 | // long now = System.currentTimeMillis(); | |
349 | 1114 | if (consensus == null || consensus.annotations == null |
350 | || consensus.annotations.length < endCol) | |
351 | { | |
352 | /* | |
353 | * called with a bad alignment annotation row | |
354 | * wait for it to be initialised properly | |
355 | */ | |
356 | 0 | return; |
357 | } | |
358 | ||
359 | 183813 | for (int i = startCol; i < endCol; i++) |
360 | { | |
361 | 182699 | ProfileI profile = profiles.get(i); |
362 | 182702 | if (profile == null) |
363 | { | |
364 | /* | |
365 | * happens if sequences calculated over were | |
366 | * shorter than alignment width | |
367 | */ | |
368 | 0 | consensus.annotations[i] = null; |
369 | 0 | return; |
370 | } | |
371 | ||
372 | 182702 | final int dp = getPercentageDp(nseq); |
373 | ||
374 | 182701 | float value = profile.getPercentageIdentity(ignoreGaps); |
375 | ||
376 | 182703 | String description = getTooltip(profile, value, showSequenceLogo, |
377 | ignoreGaps, dp); | |
378 | ||
379 | 182698 | String modalResidue = profile.getModalResidue(); |
380 | 182697 | if ("".equals(modalResidue)) |
381 | { | |
382 | 6203 | modalResidue = "-"; |
383 | } | |
384 | 176498 | else if (modalResidue.length() > 1) |
385 | { | |
386 | 7887 | modalResidue = "+"; |
387 | } | |
388 | 182699 | consensus.annotations[i] = new Annotation(modalResidue, description, |
389 | ' ', value); | |
390 | } | |
391 | // long elapsed = System.currentTimeMillis() - now; | |
392 | // jalview.bin.Console.outPrintln(-elapsed); | |
393 | } | |
394 | ||
395 | 0 | public static void completeSSConsensus(AlignmentAnnotation ssConsensus, |
396 | ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps, | |
397 | boolean showSequenceLogo, long nseq) | |
398 | { | |
399 | // long now = System.currentTimeMillis(); | |
400 | 0 | if (ssConsensus == null || ssConsensus.annotations == null |
401 | || ssConsensus.annotations.length < endCol) | |
402 | { | |
403 | /* | |
404 | * called with a bad alignment annotation row | |
405 | * wait for it to be initialised properly | |
406 | */ | |
407 | 0 | return; |
408 | } | |
409 | ||
410 | 0 | for (int i = startCol; i < endCol; i++) |
411 | { | |
412 | 0 | ProfileI profile = profiles.get(i); |
413 | 0 | if (profile == null) |
414 | { | |
415 | /* | |
416 | * happens if sequences calculated over were | |
417 | * shorter than alignment width | |
418 | */ | |
419 | 0 | ssConsensus.annotations[i] = null; |
420 | 0 | return; |
421 | } | |
422 | ||
423 | 0 | final int dp = getPercentageDp(nseq); |
424 | ||
425 | 0 | float value = profile.getSSPercentageIdentity(ignoreGaps); |
426 | ||
427 | 0 | String description = getSSTooltip(profile, value, showSequenceLogo, |
428 | ignoreGaps, dp); | |
429 | ||
430 | 0 | String modalSS = profile.getModalSS(); |
431 | 0 | if ("".equals(modalSS)) |
432 | { | |
433 | 0 | modalSS = "-"; |
434 | } | |
435 | 0 | else if (modalSS.length() > 1) |
436 | { | |
437 | 0 | modalSS = "+"; |
438 | } | |
439 | 0 | ssConsensus.annotations[i] = new Annotation(modalSS, description, ' ', |
440 | value); | |
441 | } | |
442 | // long elapsed = System.currentTimeMillis() - now; | |
443 | // jalview.bin.Console.outPrintln(-elapsed); | |
444 | } | |
445 | ||
446 | /** | |
447 | * Derive the gap count annotation row. | |
448 | * | |
449 | * @param gaprow | |
450 | * the annotation row to add annotations to | |
451 | * @param profiles | |
452 | * the source consensus data | |
453 | * @param startCol | |
454 | * start column (inclusive) | |
455 | * @param endCol | |
456 | * end column (exclusive) | |
457 | */ | |
458 | 986 | public static void completeGapAnnot(AlignmentAnnotation gaprow, |
459 | ProfilesI profiles, int startCol, int endCol, long nseq) | |
460 | { | |
461 | 986 | if (gaprow == null || gaprow.annotations == null |
462 | || gaprow.annotations.length < endCol) | |
463 | { | |
464 | /* | |
465 | * called with a bad alignment annotation row | |
466 | * wait for it to be initialised properly | |
467 | */ | |
468 | 0 | return; |
469 | } | |
470 | // always set ranges again | |
471 | 986 | gaprow.graphMax = nseq; |
472 | 986 | gaprow.graphMin = 0; |
473 | 986 | double scale = 0.8 / nseq; |
474 | 163898 | for (int i = startCol; i < endCol; i++) |
475 | { | |
476 | 162912 | ProfileI profile = profiles.get(i); |
477 | 162912 | if (profile == null) |
478 | { | |
479 | /* | |
480 | * happens if sequences calculated over were | |
481 | * shorter than alignment width | |
482 | */ | |
483 | 0 | gaprow.annotations[i] = null; |
484 | 0 | return; |
485 | } | |
486 | ||
487 | 162912 | final int gapped = profile.getNonGapped(); |
488 | ||
489 | 162912 | String description = "" + gapped; |
490 | ||
491 | 162912 | gaprow.annotations[i] = new Annotation("", description, '\0', gapped, |
492 | jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY, | |
493 | (float) scale * gapped)); | |
494 | } | |
495 | } | |
496 | ||
497 | /** | |
498 | * Returns a tooltip showing either | |
499 | * <ul> | |
500 | * <li>the full profile (percentages of all residues present), if | |
501 | * showSequenceLogo is true, or</li> | |
502 | * <li>just the modal (most common) residue(s), if showSequenceLogo is | |
503 | * false</li> | |
504 | * </ul> | |
505 | * Percentages are as a fraction of all sequence, or only ungapped sequences | |
506 | * if ignoreGaps is true. | |
507 | * | |
508 | * @param profile | |
509 | * @param pid | |
510 | * @param showSequenceLogo | |
511 | * @param ignoreGaps | |
512 | * @param dp | |
513 | * the number of decimal places to format percentages to | |
514 | * @return | |
515 | */ | |
516 | 182701 | static String getTooltip(ProfileI profile, float pid, |
517 | boolean showSequenceLogo, boolean ignoreGaps, int dp) | |
518 | { | |
519 | 182704 | ResidueCount counts = profile.getCounts(); |
520 | ||
521 | 182704 | String description = null; |
522 | 182704 | if (counts != null && showSequenceLogo) |
523 | { | |
524 | 64157 | int normaliseBy = ignoreGaps ? profile.getNonGapped() |
525 | : profile.getHeight(); | |
526 | 64157 | description = counts.getTooltip(normaliseBy, dp); |
527 | } | |
528 | else | |
529 | { | |
530 | 118547 | StringBuilder sb = new StringBuilder(64); |
531 | 118547 | String maxRes = profile.getModalResidue(); |
532 | 118547 | if (maxRes.length() > 1) |
533 | { | |
534 | 2930 | sb.append("[").append(maxRes).append("]"); |
535 | } | |
536 | else | |
537 | { | |
538 | 115617 | sb.append(maxRes); |
539 | } | |
540 | 118547 | if (maxRes.length() > 0) |
541 | { | |
542 | 115497 | sb.append(" "); |
543 | 115497 | Format.appendPercentage(sb, pid, dp); |
544 | 115497 | sb.append("%"); |
545 | } | |
546 | 118547 | description = sb.toString(); |
547 | } | |
548 | 182698 | return description; |
549 | } | |
550 | ||
551 | 0 | static String getSSTooltip(ProfileI profile, float pid, |
552 | boolean showSequenceLogo, boolean ignoreGaps, int dp) | |
553 | { | |
554 | 0 | SecondaryStructureCount counts = profile.getSSCounts(); |
555 | ||
556 | 0 | String description = null; |
557 | 0 | if (counts != null && showSequenceLogo) |
558 | { | |
559 | 0 | int normaliseBy = ignoreGaps ? profile.getNonGapped() |
560 | : profile.getHeight(); | |
561 | 0 | description = counts.getTooltip(normaliseBy, dp); |
562 | } | |
563 | else | |
564 | { | |
565 | 0 | StringBuilder sb = new StringBuilder(64); |
566 | 0 | String maxSS = profile.getModalSS(); |
567 | 0 | if (maxSS.length() > 1) |
568 | { | |
569 | 0 | sb.append("[").append(maxSS).append("]"); |
570 | } | |
571 | else | |
572 | { | |
573 | 0 | sb.append(maxSS); |
574 | } | |
575 | 0 | if (maxSS.length() > 0) |
576 | { | |
577 | 0 | sb.append(" "); |
578 | 0 | Format.appendPercentage(sb, pid, dp); |
579 | 0 | sb.append("%"); |
580 | } | |
581 | 0 | description = sb.toString(); |
582 | } | |
583 | 0 | return description; |
584 | } | |
585 | ||
586 | /** | |
587 | * Returns the sorted profile for the given consensus data. The returned array | |
588 | * contains | |
589 | * | |
590 | * <pre> | |
591 | * [profileType, numberOfValues, totalPercent, charValue1, percentage1, charValue2, percentage2, ...] | |
592 | * in descending order of percentage value | |
593 | * </pre> | |
594 | * | |
595 | * @param profile | |
596 | * the data object from which to extract and sort values | |
597 | * @param ignoreGaps | |
598 | * if true, only non-gapped values are included in percentage | |
599 | * calculations | |
600 | * @return | |
601 | */ | |
602 | 101848 | public static int[] extractProfile(ProfileI profile, boolean ignoreGaps) |
603 | { | |
604 | 101848 | char[] symbols; |
605 | 101848 | int[] values; |
606 | ||
607 | 101848 | if (profile.getCounts() != null) |
608 | { | |
609 | 101848 | ResidueCount counts = profile.getCounts(); |
610 | 101848 | SymbolCounts symbolCounts = counts.getSymbolCounts(); |
611 | 101848 | symbols = symbolCounts.symbols; |
612 | 101848 | values = symbolCounts.values; |
613 | ||
614 | } | |
615 | 0 | else if (profile.getSSCounts() != null) |
616 | { | |
617 | 0 | SecondaryStructureCount counts = profile.getSSCounts(); |
618 | // to do | |
619 | 0 | SecondaryStructureCount.SymbolCounts symbolCounts = counts |
620 | .getSymbolCounts(); | |
621 | 0 | symbols = symbolCounts.symbols; |
622 | 0 | values = symbolCounts.values; |
623 | } | |
624 | else | |
625 | { | |
626 | 0 | return null; |
627 | } | |
628 | ||
629 | 101848 | QuickSort.sort(values, symbols); |
630 | 101848 | int totalPercentage = 0; |
631 | 101848 | final int divisor = ignoreGaps ? profile.getNonGapped() |
632 | : profile.getHeight(); | |
633 | ||
634 | /* | |
635 | * traverse the arrays in reverse order (highest counts first) | |
636 | */ | |
637 | 101848 | int[] result = new int[3 + 2 * symbols.length]; |
638 | 101848 | int nextArrayPos = 3; |
639 | 101848 | int nonZeroCount = 0; |
640 | ||
641 | 295945 | for (int i = symbols.length - 1; i >= 0; i--) |
642 | { | |
643 | 194099 | int theChar = symbols[i]; |
644 | 194099 | int charCount = values[i]; |
645 | 194099 | final int percentage = (charCount * 100) / divisor; |
646 | 194099 | if (percentage == 0) |
647 | { | |
648 | /* | |
649 | * this count (and any remaining) round down to 0% - discard | |
650 | */ | |
651 | 2 | break; |
652 | } | |
653 | 194097 | nonZeroCount++; |
654 | 194097 | result[nextArrayPos++] = theChar; |
655 | 194097 | result[nextArrayPos++] = percentage; |
656 | 194097 | totalPercentage += percentage; |
657 | } | |
658 | ||
659 | /* | |
660 | * truncate array if any zero values were discarded | |
661 | */ | |
662 | 101848 | if (nonZeroCount < symbols.length) |
663 | { | |
664 | 2 | int[] tmp = new int[3 + 2 * nonZeroCount]; |
665 | 2 | System.arraycopy(result, 0, tmp, 0, tmp.length); |
666 | 2 | result = tmp; |
667 | } | |
668 | ||
669 | /* | |
670 | * fill in 'header' values | |
671 | */ | |
672 | 101848 | result[0] = AlignmentAnnotation.SEQUENCE_PROFILE; |
673 | 101848 | result[1] = nonZeroCount; |
674 | 101848 | result[2] = totalPercentage; |
675 | ||
676 | 101848 | return result; |
677 | } | |
678 | ||
679 | /** | |
680 | * Extract a sorted extract of cDNA codon profile data. The returned array | |
681 | * contains | |
682 | * | |
683 | * <pre> | |
684 | * [profileType, numberOfValues, totalPercentage, charValue1, percentage1, charValue2, percentage2, ...] | |
685 | * in descending order of percentage value, where the character values encode codon triplets | |
686 | * </pre> | |
687 | * | |
688 | * @param hashtable | |
689 | * @return | |
690 | */ | |
691 | 2 | public static int[] extractCdnaProfile( |
692 | Hashtable<String, Object> hashtable, boolean ignoreGaps) | |
693 | { | |
694 | // this holds #seqs, #ungapped, and then codon count, indexed by encoded | |
695 | // codon triplet | |
696 | 2 | int[] codonCounts = (int[]) hashtable.get(PROFILE); |
697 | 2 | int[] sortedCounts = new int[codonCounts.length - 2]; |
698 | 2 | System.arraycopy(codonCounts, 2, sortedCounts, 0, |
699 | codonCounts.length - 2); | |
700 | ||
701 | 2 | int[] result = new int[3 + 2 * sortedCounts.length]; |
702 | // first value is just the type of profile data | |
703 | 2 | result[0] = AlignmentAnnotation.CDNA_PROFILE; |
704 | ||
705 | 2 | char[] codons = new char[sortedCounts.length]; |
706 | 130 | for (int i = 0; i < codons.length; i++) |
707 | { | |
708 | 128 | codons[i] = (char) i; |
709 | } | |
710 | 2 | QuickSort.sort(sortedCounts, codons); |
711 | 2 | int totalPercentage = 0; |
712 | 2 | int distinctValuesCount = 0; |
713 | 2 | int j = 3; |
714 | 2 | int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0]; |
715 | 8 | for (int i = codons.length - 1; i >= 0; i--) |
716 | { | |
717 | 8 | final int codonCount = sortedCounts[i]; |
718 | 8 | if (codonCount == 0) |
719 | { | |
720 | 0 | break; // nothing else of interest here |
721 | } | |
722 | 8 | final int percentage = codonCount * 100 / divisor; |
723 | 8 | if (percentage == 0) |
724 | { | |
725 | /* | |
726 | * this (and any remaining) values rounded down to 0 - discard | |
727 | */ | |
728 | 2 | break; |
729 | } | |
730 | 6 | distinctValuesCount++; |
731 | 6 | result[j++] = codons[i]; |
732 | 6 | result[j++] = percentage; |
733 | 6 | totalPercentage += percentage; |
734 | } | |
735 | 2 | result[2] = totalPercentage; |
736 | ||
737 | /* | |
738 | * Just return the non-zero values | |
739 | */ | |
740 | // todo next value is redundant if we limit the array to non-zero counts | |
741 | 2 | result[1] = distinctValuesCount; |
742 | 2 | return Arrays.copyOfRange(result, 0, j); |
743 | } | |
744 | ||
745 | /** | |
746 | * Compute a consensus for the cDNA coding for a protein alignment. | |
747 | * | |
748 | * @param alignment | |
749 | * the protein alignment (which should hold mappings to cDNA | |
750 | * sequences) | |
751 | * @param hconsensus | |
752 | * the consensus data stores to be populated (one per column) | |
753 | */ | |
754 | 4 | public static void calculateCdna(AlignmentI alignment, |
755 | Hashtable<String, Object>[] hconsensus) | |
756 | { | |
757 | 4 | final char gapCharacter = alignment.getGapCharacter(); |
758 | 4 | List<AlignedCodonFrame> mappings = alignment.getCodonFrames(); |
759 | 4 | if (mappings == null || mappings.isEmpty()) |
760 | { | |
761 | 0 | return; |
762 | } | |
763 | ||
764 | 4 | int cols = alignment.getWidth(); |
765 | 1928 | for (int col = 0; col < cols; col++) |
766 | { | |
767 | // todo would prefer a Java bean for consensus data | |
768 | 1924 | Hashtable<String, Object> columnHash = new Hashtable<>(); |
769 | // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1) | |
770 | 1924 | int[] codonCounts = new int[66]; |
771 | 1924 | codonCounts[0] = alignment.getSequences().size(); |
772 | 1924 | int ungappedCount = 0; |
773 | 1924 | for (SequenceI seq : alignment.getSequences()) |
774 | { | |
775 | 20870 | if (seq.getCharAt(col) == gapCharacter) |
776 | { | |
777 | 10166 | continue; |
778 | } | |
779 | 10704 | List<char[]> codons = MappingUtils.findCodonsFor(seq, col, |
780 | mappings); | |
781 | 10704 | for (char[] codon : codons) |
782 | { | |
783 | 10657 | int codonEncoded = CodingUtils.encodeCodon(codon); |
784 | 10657 | if (codonEncoded >= 0) |
785 | { | |
786 | 10657 | codonCounts[codonEncoded + 2]++; |
787 | 10657 | ungappedCount++; |
788 | 10657 | break; |
789 | } | |
790 | } | |
791 | } | |
792 | 1924 | codonCounts[1] = ungappedCount; |
793 | // todo: sort values here, save counts and codons? | |
794 | 1924 | columnHash.put(PROFILE, codonCounts); |
795 | 1924 | hconsensus[col] = columnHash; |
796 | } | |
797 | } | |
798 | ||
799 | /** | |
800 | * Derive displayable cDNA consensus annotation from computed consensus data. | |
801 | * | |
802 | * @param consensusAnnotation | |
803 | * the annotation row to be populated for display | |
804 | * @param consensusData | |
805 | * the computed consensus data | |
806 | * @param showProfileLogo | |
807 | * if true show all symbols present at each position, else only the | |
808 | * modal value | |
809 | * @param nseqs | |
810 | * the number of sequences in the alignment | |
811 | */ | |
812 | 4 | public static void completeCdnaConsensus( |
813 | AlignmentAnnotation consensusAnnotation, | |
814 | Hashtable<String, Object>[] consensusData, | |
815 | boolean showProfileLogo, int nseqs) | |
816 | { | |
817 | 4 | if (consensusAnnotation == null |
818 | || consensusAnnotation.annotations == null | |
819 | || consensusAnnotation.annotations.length < consensusData.length) | |
820 | { | |
821 | // called with a bad alignment annotation row - wait for it to be | |
822 | // initialised properly | |
823 | 0 | return; |
824 | } | |
825 | ||
826 | // ensure codon triplet scales with font size | |
827 | 4 | consensusAnnotation.scaleColLabel = true; |
828 | 1928 | for (int col = 0; col < consensusData.length; col++) |
829 | { | |
830 | 1924 | Hashtable<String, Object> hci = consensusData[col]; |
831 | 1924 | if (hci == null) |
832 | { | |
833 | // gapped protein column? | |
834 | 0 | continue; |
835 | } | |
836 | // array holds #seqs, #ungapped, then codon counts indexed by codon | |
837 | 1924 | final int[] codonCounts = (int[]) hci.get(PROFILE); |
838 | 1924 | int totalCount = 0; |
839 | ||
840 | /* | |
841 | * First pass - get total count and find the highest | |
842 | */ | |
843 | 1924 | final char[] codons = new char[codonCounts.length - 2]; |
844 | 125060 | for (int j = 2; j < codonCounts.length; j++) |
845 | { | |
846 | 123136 | final int codonCount = codonCounts[j]; |
847 | 123136 | codons[j - 2] = (char) (j - 2); |
848 | 123136 | totalCount += codonCount; |
849 | } | |
850 | ||
851 | /* | |
852 | * Sort array of encoded codons by count ascending - so the modal value | |
853 | * goes to the end; start by copying the count (dropping the first value) | |
854 | */ | |
855 | 1924 | int[] sortedCodonCounts = new int[codonCounts.length - 2]; |
856 | 1924 | System.arraycopy(codonCounts, 2, sortedCodonCounts, 0, |
857 | codonCounts.length - 2); | |
858 | 1924 | QuickSort.sort(sortedCodonCounts, codons); |
859 | ||
860 | 1924 | int modalCodonEncoded = codons[codons.length - 1]; |
861 | 1924 | int modalCodonCount = sortedCodonCounts[codons.length - 1]; |
862 | 1924 | String modalCodon = String |
863 | .valueOf(CodingUtils.decodeCodon(modalCodonEncoded)); | |
864 | 1924 | if (sortedCodonCounts.length > 1 && sortedCodonCounts[codons.length |
865 | - 2] == sortedCodonCounts[codons.length - 1]) | |
866 | { | |
867 | /* | |
868 | * two or more codons share the modal count | |
869 | */ | |
870 | 25 | modalCodon = "+"; |
871 | } | |
872 | 1924 | float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100 |
873 | / (float) totalCount; | |
874 | ||
875 | /* | |
876 | * todo ? Replace consensus hashtable with sorted arrays of codons and | |
877 | * counts (non-zero only). Include total count in count array [0]. | |
878 | */ | |
879 | ||
880 | /* | |
881 | * Scan sorted array backwards for most frequent values first. Show | |
882 | * repeated values compactly. | |
883 | */ | |
884 | 1924 | StringBuilder mouseOver = new StringBuilder(32); |
885 | 1924 | StringBuilder samePercent = new StringBuilder(); |
886 | 1924 | String percent = null; |
887 | 1924 | String lastPercent = null; |
888 | 1924 | int percentDecPl = getPercentageDp(nseqs); |
889 | ||
890 | 3823 | for (int j = codons.length - 1; j >= 0; j--) |
891 | { | |
892 | 3823 | int codonCount = sortedCodonCounts[j]; |
893 | 3823 | if (codonCount == 0) |
894 | { | |
895 | /* | |
896 | * remaining codons are 0% - ignore, but finish off the last one if | |
897 | * necessary | |
898 | */ | |
899 | 1924 | if (samePercent.length() > 0) |
900 | { | |
901 | 1899 | mouseOver.append(samePercent).append(": ").append(percent) |
902 | .append("% "); | |
903 | } | |
904 | 1924 | break; |
905 | } | |
906 | 1899 | int codonEncoded = codons[j]; |
907 | 1899 | final int pct = codonCount * 100 / totalCount; |
908 | 1899 | String codon = String |
909 | .valueOf(CodingUtils.decodeCodon(codonEncoded)); | |
910 | 1899 | StringBuilder sb = new StringBuilder(); |
911 | 1899 | Format.appendPercentage(sb, pct, percentDecPl); |
912 | 1899 | percent = sb.toString(); |
913 | 1899 | if (showProfileLogo || codonCount == modalCodonCount) |
914 | { | |
915 | 1899 | if (percent.equals(lastPercent) && j > 0) |
916 | { | |
917 | 0 | samePercent.append(samePercent.length() == 0 ? "" : ", "); |
918 | 0 | samePercent.append(codon); |
919 | } | |
920 | else | |
921 | { | |
922 | 1899 | if (samePercent.length() > 0) |
923 | { | |
924 | 0 | mouseOver.append(samePercent).append(": ").append(lastPercent) |
925 | .append("% "); | |
926 | } | |
927 | 1899 | samePercent.setLength(0); |
928 | 1899 | samePercent.append(codon); |
929 | } | |
930 | 1899 | lastPercent = percent; |
931 | } | |
932 | } | |
933 | ||
934 | 1924 | consensusAnnotation.annotations[col] = new Annotation(modalCodon, |
935 | mouseOver.toString(), ' ', pid); | |
936 | } | |
937 | } | |
938 | ||
939 | /** | |
940 | * Returns the number of decimal places to show for profile percentages. For | |
941 | * less than 100 sequences, returns zero (the integer percentage value will be | |
942 | * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc. | |
943 | * | |
944 | * @param nseq | |
945 | * @return | |
946 | */ | |
947 | 184625 | protected static int getPercentageDp(long nseq) |
948 | { | |
949 | 184625 | int scale = 0; |
950 | 184626 | while (nseq >= 100) |
951 | { | |
952 | 0 | scale++; |
953 | 0 | nseq /= 10; |
954 | } | |
955 | 184626 | return scale; |
956 | } | |
957 | } |