Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
AAFrequency | 55 | 316 | 112 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import jalview.datamodel.AlignedCodonFrame; | |
24 | import jalview.datamodel.AlignmentAnnotation; | |
25 | import jalview.datamodel.AlignmentI; | |
26 | import jalview.datamodel.Annotation; | |
27 | import jalview.datamodel.Profile; | |
28 | import jalview.datamodel.ProfileI; | |
29 | import jalview.datamodel.Profiles; | |
30 | import jalview.datamodel.ProfilesI; | |
31 | import jalview.datamodel.ResidueCount; | |
32 | import jalview.datamodel.ResidueCount.SymbolCounts; | |
33 | import jalview.datamodel.SecondaryStructureCount; | |
34 | import jalview.datamodel.SequenceI; | |
35 | import jalview.ext.android.SparseIntArray; | |
36 | import jalview.util.Comparison; | |
37 | import jalview.util.Format; | |
38 | import jalview.util.MappingUtils; | |
39 | import jalview.util.QuickSort; | |
40 | ||
41 | import java.awt.Color; | |
42 | import java.util.Arrays; | |
43 | import java.util.Hashtable; | |
44 | import java.util.List; | |
45 | ||
46 | /** | |
47 | * Takes in a vector or array of sequences and column start and column end and | |
48 | * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied. | |
49 | * This class is used extensively in calculating alignment colourschemes that | |
50 | * depend on the amount of conservation in each alignment column. | |
51 | * | |
52 | * @author $author$ | |
53 | * @version $Revision$ | |
54 | */ | |
55 | public class AAFrequency | |
56 | { | |
57 | public static final String PROFILE = "P"; | |
58 | ||
59 | /* | |
60 | * Quick look-up of String value of char 'A' to 'Z' | |
61 | */ | |
62 | private static final String[] CHARS = new String['Z' - 'A' + 1]; | |
63 | ||
64 | 50 | static |
65 | { | |
66 | 1350 | for (char c = 'A'; c <= 'Z'; c++) |
67 | { | |
68 | 1300 | CHARS[c - 'A'] = String.valueOf(c); |
69 | } | |
70 | } | |
71 | ||
72 | 3 | public static final ProfilesI calculate(List<SequenceI> list, int start, |
73 | int end) | |
74 | { | |
75 | 3 | return calculate(list, start, end, false); |
76 | } | |
77 | ||
78 | 384 | public static final ProfilesI calculate(List<SequenceI> sequences, |
79 | int start, int end, boolean profile) | |
80 | { | |
81 | 384 | SequenceI[] seqs = new SequenceI[sequences.size()]; |
82 | 384 | int width = 0; |
83 | 384 | synchronized (sequences) |
84 | { | |
85 | 3233 | for (int i = 0; i < sequences.size(); i++) |
86 | { | |
87 | 2849 | seqs[i] = sequences.get(i); |
88 | 2849 | int length = seqs[i].getLength(); |
89 | 2849 | if (length > width) |
90 | { | |
91 | 383 | width = length; |
92 | } | |
93 | } | |
94 | ||
95 | 384 | if (end >= width) |
96 | { | |
97 | 213 | end = width; |
98 | } | |
99 | ||
100 | 384 | ProfilesI reply = calculate(seqs, width, start, end, profile); |
101 | 384 | return reply; |
102 | } | |
103 | } | |
104 | ||
105 | /** | |
106 | * Calculate the consensus symbol(s) for each column in the given range. | |
107 | * | |
108 | * @param sequences | |
109 | * @param width | |
110 | * the full width of the alignment | |
111 | * @param start | |
112 | * start column (inclusive, base zero) | |
113 | * @param end | |
114 | * end column (exclusive) | |
115 | * @param saveFullProfile | |
116 | * if true, store all symbol counts | |
117 | */ | |
118 | 1343 | public static final ProfilesI calculate(final SequenceI[] sequences, |
119 | int width, int start, int end, boolean saveFullProfile) | |
120 | { | |
121 | // long now = System.currentTimeMillis(); | |
122 | 1343 | int seqCount = sequences.length; |
123 | 1343 | boolean nucleotide = false; |
124 | 1343 | int nucleotideCount = 0; |
125 | 1343 | int peptideCount = 0; |
126 | ||
127 | 1343 | ProfileI[] result = new ProfileI[width]; |
128 | ||
129 | 609711 | for (int column = start; column < end; column++) |
130 | { | |
131 | /* | |
132 | * Apply a heuristic to detect nucleotide data (which can | |
133 | * be counted in more compact arrays); here we test for | |
134 | * more than 90% nucleotide; recheck every 10 columns in case | |
135 | * of misleading data e.g. highly conserved Alanine in peptide! | |
136 | * Mistakenly guessing nucleotide has a small performance cost, | |
137 | * as it will result in counting in sparse arrays. | |
138 | * Mistakenly guessing peptide has a small space cost, | |
139 | * as it will use a larger than necessary array to hold counts. | |
140 | */ | |
141 | 608396 | if (nucleotideCount > 100 && column % 10 == 0) |
142 | { | |
143 | 54769 | nucleotide = (9 * peptideCount < nucleotideCount); |
144 | } | |
145 | 608399 | ResidueCount residueCounts = new ResidueCount(nucleotide); |
146 | ||
147 | 11222763 | for (int row = 0; row < seqCount; row++) |
148 | { | |
149 | 10689158 | if (sequences[row] == null) |
150 | { | |
151 | 0 | jalview.bin.Console.errPrintln( |
152 | "WARNING: Consensus skipping null sequence - possible race condition."); | |
153 | 0 | continue; |
154 | } | |
155 | 10675229 | if (sequences[row].getLength() > column) |
156 | { | |
157 | 10657331 | char c = sequences[row].getCharAt(column); |
158 | 10649032 | residueCounts.add(c); |
159 | 10656056 | if (Comparison.isNucleotide(c)) |
160 | { | |
161 | 961181 | nucleotideCount++; |
162 | } | |
163 | 9663178 | else if (!Comparison.isGap(c)) |
164 | { | |
165 | 861260 | peptideCount++; |
166 | } | |
167 | } | |
168 | else | |
169 | { | |
170 | /* | |
171 | * count a gap if the sequence doesn't reach this column | |
172 | */ | |
173 | 30127 | residueCounts.addGap(); |
174 | } | |
175 | } | |
176 | ||
177 | 608293 | int maxCount = residueCounts.getModalCount(); |
178 | 608290 | String maxResidue = residueCounts.getResiduesForCount(maxCount); |
179 | 608324 | int gapCount = residueCounts.getGapCount(); |
180 | 608309 | ProfileI profile = new Profile(seqCount, gapCount, maxCount, |
181 | maxResidue); | |
182 | ||
183 | 608400 | if (saveFullProfile) |
184 | { | |
185 | 590057 | profile.setCounts(residueCounts); |
186 | } | |
187 | ||
188 | 608339 | result[column] = profile; |
189 | } | |
190 | 1343 | return new Profiles(result); |
191 | // long elapsed = System.currentTimeMillis() - now; | |
192 | // jalview.bin.Console.outPrintln(elapsed); | |
193 | } | |
194 | ||
195 | 0 | public static final ProfilesI calculateSS(List<SequenceI> list, int start, |
196 | int end, String source) | |
197 | { | |
198 | 0 | return calculateSS(list, start, end, false, source); |
199 | } | |
200 | ||
201 | 381 | public static final ProfilesI calculateSS(List<SequenceI> sequences, |
202 | int start, int end, boolean profile, String source) | |
203 | { | |
204 | 381 | SequenceI[] seqs = new SequenceI[sequences.size()]; |
205 | 381 | int width = 0; |
206 | 381 | synchronized (sequences) |
207 | { | |
208 | 3227 | for (int i = 0; i < sequences.size(); i++) |
209 | { | |
210 | 2846 | seqs[i] = sequences.get(i); |
211 | 2846 | int length = seqs[i].getLength(); |
212 | 2846 | if (length > width) |
213 | { | |
214 | 380 | width = length; |
215 | } | |
216 | } | |
217 | ||
218 | 381 | if (end >= width) |
219 | { | |
220 | 213 | end = width; |
221 | } | |
222 | ||
223 | 381 | ProfilesI reply = calculateSS(seqs, width, start, end, profile, |
224 | source); | |
225 | 381 | return reply; |
226 | } | |
227 | } | |
228 | ||
229 | 1388 | public static final ProfilesI calculateSS(final SequenceI[] sequences, |
230 | int width, int start, int end, boolean saveFullProfile, | |
231 | String source) | |
232 | { | |
233 | ||
234 | 1388 | int seqCount = sequences.length; |
235 | ||
236 | 1388 | int seqWithSSCount = 0; |
237 | ||
238 | 1388 | ProfileI[] result = new ProfileI[width]; |
239 | ||
240 | 624560 | for (int column = start; column < end; column++) |
241 | { | |
242 | ||
243 | 623202 | int ssCount = 0; |
244 | ||
245 | 623200 | SecondaryStructureCount ssCounts = new SecondaryStructureCount(); |
246 | ||
247 | 11381069 | for (int row = 0; row < seqCount; row++) |
248 | { | |
249 | 10820205 | if (sequences[row] == null) |
250 | { | |
251 | 0 | jalview.bin.Console.errPrintln( |
252 | "WARNING: Consensus skipping null sequence - possible race condition."); | |
253 | 0 | continue; |
254 | } | |
255 | ||
256 | 10811330 | char c = sequences[row].getCharAt(column); |
257 | ||
258 | 10807145 | List<AlignmentAnnotation> annots = AlignmentUtils.getAlignmentAnnotationForSource(sequences[row], source); |
259 | 10762290 | if(annots!=null) { |
260 | 86936 | seqWithSSCount++; |
261 | 86936 | for (AlignmentAnnotation aa : annots) |
262 | { | |
263 | 95704 | if (aa != null) |
264 | { | |
265 | 95704 | ssCount++; |
266 | } | |
267 | ||
268 | 95704 | if (sequences[row].getLength() > column && !Comparison.isGap(c) |
269 | && aa != null) | |
270 | { | |
271 | ||
272 | 61280 | int seqPosition = sequences[row].findPosition(column); |
273 | ||
274 | 61280 | char ss = AlignmentUtils |
275 | .findSSAnnotationForGivenSeqposition(aa, seqPosition); | |
276 | 61280 | if (ss == '*') |
277 | { | |
278 | 0 | continue; |
279 | } | |
280 | 61280 | ssCounts.add(ss); |
281 | } | |
282 | 34424 | else if (Comparison.isGap(c) && aa != null) |
283 | { | |
284 | 34424 | ssCounts.addGap(); |
285 | } | |
286 | } | |
287 | } | |
288 | } | |
289 | ||
290 | 623067 | int maxSSCount = ssCounts.getModalCount(); |
291 | 623070 | String maxSS = ssCounts.getSSForCount(maxSSCount); |
292 | 623127 | int gapCount = ssCounts.getGapCount(); |
293 | 623116 | ProfileI profile = new Profile(maxSS, ssCount, gapCount, maxSSCount, |
294 | seqWithSSCount); | |
295 | ||
296 | 623173 | if (saveFullProfile) |
297 | { | |
298 | 604867 | profile.setSSCounts(ssCounts); |
299 | } | |
300 | ||
301 | 623147 | result[column] = profile; |
302 | } | |
303 | 1388 | return new Profiles(result); |
304 | } | |
305 | ||
306 | /** | |
307 | * Make an estimate of the profile size we are going to compute i.e. how many | |
308 | * different characters may be present in it. Overestimating has a cost of | |
309 | * using more memory than necessary. Underestimating has a cost of needing to | |
310 | * extend the SparseIntArray holding the profile counts. | |
311 | * | |
312 | * @param profileSizes | |
313 | * counts of sizes of profiles so far encountered | |
314 | * @return | |
315 | */ | |
316 | 0 | static int estimateProfileSize(SparseIntArray profileSizes) |
317 | { | |
318 | 0 | if (profileSizes.size() == 0) |
319 | { | |
320 | 0 | return 4; |
321 | } | |
322 | ||
323 | /* | |
324 | * could do a statistical heuristic here e.g. 75%ile | |
325 | * for now just return the largest value | |
326 | */ | |
327 | 0 | return profileSizes.keyAt(profileSizes.size() - 1); |
328 | } | |
329 | ||
330 | /** | |
331 | * Derive the consensus annotations to be added to the alignment for display. | |
332 | * This does not recompute the raw data, but may be called on a change in | |
333 | * display options, such as 'ignore gaps', which may in turn result in a | |
334 | * change in the derived values. | |
335 | * | |
336 | * @param consensus | |
337 | * the annotation row to add annotations to | |
338 | * @param profiles | |
339 | * the source consensus data | |
340 | * @param startCol | |
341 | * start column (inclusive) | |
342 | * @param endCol | |
343 | * end column (exclusive) | |
344 | * @param ignoreGaps | |
345 | * if true, normalise residue percentages ignoring gaps | |
346 | * @param showSequenceLogo | |
347 | * if true include all consensus symbols, else just show modal | |
348 | * residue | |
349 | * @param nseq | |
350 | * number of sequences | |
351 | */ | |
352 | 1090 | public static void completeConsensus(AlignmentAnnotation consensus, |
353 | ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps, | |
354 | boolean showSequenceLogo, long nseq) | |
355 | { | |
356 | // long now = System.currentTimeMillis(); | |
357 | 1090 | if (consensus == null || consensus.annotations == null |
358 | || consensus.annotations.length < endCol) | |
359 | { | |
360 | /* | |
361 | * called with a bad alignment annotation row | |
362 | * wait for it to be initialised properly | |
363 | */ | |
364 | 0 | return; |
365 | } | |
366 | ||
367 | 179398 | for (int i = startCol; i < endCol; i++) |
368 | { | |
369 | 178308 | ProfileI profile = profiles.get(i); |
370 | 178308 | if (profile == null) |
371 | { | |
372 | /* | |
373 | * happens if sequences calculated over were | |
374 | * shorter than alignment width | |
375 | */ | |
376 | 0 | consensus.annotations[i] = null; |
377 | 0 | return; |
378 | } | |
379 | ||
380 | 178308 | final int dp = getPercentageDp(nseq); |
381 | ||
382 | 178308 | float value = profile.getPercentageIdentity(ignoreGaps); |
383 | ||
384 | 178308 | String description = getTooltip(profile, value, showSequenceLogo, |
385 | ignoreGaps, dp); | |
386 | ||
387 | 178308 | String modalResidue = profile.getModalResidue(); |
388 | 178308 | if ("".equals(modalResidue)) |
389 | { | |
390 | 6523 | modalResidue = "-"; |
391 | } | |
392 | 171785 | else if (modalResidue.length() > 1) |
393 | { | |
394 | 7793 | modalResidue = "+"; |
395 | } | |
396 | 178308 | consensus.annotations[i] = new Annotation(modalResidue, description, |
397 | ' ', value); | |
398 | } | |
399 | // long elapsed = System.currentTimeMillis() - now; | |
400 | // jalview.bin.Console.outPrintln(-elapsed); | |
401 | } | |
402 | ||
403 | 927 | public static void completeSSConsensus(AlignmentAnnotation ssConsensus, |
404 | ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps, | |
405 | boolean showSequenceLogo, long nseq) | |
406 | { | |
407 | // long now = System.currentTimeMillis(); | |
408 | 927 | if (ssConsensus == null || ssConsensus.annotations == null |
409 | || ssConsensus.annotations.length < endCol) | |
410 | { | |
411 | /* | |
412 | * called with a bad alignment annotation row | |
413 | * wait for it to be initialised properly | |
414 | */ | |
415 | 5 | return; |
416 | } | |
417 | ||
418 | 524060 | for (int i = startCol; i < endCol; i++) |
419 | { | |
420 | 523138 | ProfileI profile = profiles.get(i); |
421 | 523138 | if (profile == null) |
422 | { | |
423 | /* | |
424 | * happens if sequences calculated over were | |
425 | * shorter than alignment width | |
426 | */ | |
427 | 0 | ssConsensus.annotations[i] = null; |
428 | 0 | return; |
429 | } | |
430 | ||
431 | 523138 | if (ssConsensus.getNoOfSequencesIncluded() < 0) |
432 | { | |
433 | 0 | ssConsensus.setNoOfSequencesIncluded(profile.getSeqWithSSCount()); |
434 | } | |
435 | ||
436 | 523138 | final int dp = getPercentageDp(nseq); |
437 | ||
438 | 523138 | float value = profile.getSSPercentageIdentity(ignoreGaps); |
439 | ||
440 | 523138 | String description = getSSTooltip(profile, value, showSequenceLogo, |
441 | ignoreGaps, dp); | |
442 | ||
443 | 523138 | String modalSS = profile.getModalSS(); |
444 | 523138 | if ("".equals(modalSS)) |
445 | { | |
446 | 506167 | modalSS = "-"; |
447 | } | |
448 | 16971 | else if (modalSS.length() > 1) |
449 | { | |
450 | 1060 | modalSS = "+"; |
451 | } | |
452 | 523138 | ssConsensus.annotations[i] = new Annotation(modalSS, description, |
453 | ' ', value); | |
454 | } | |
455 | ||
456 | //Hide consensus with no data to display | |
457 | 922 | if(ssConsensus.getNoOfSequencesIncluded()<1) |
458 | 846 | ssConsensus.visible = false; |
459 | ||
460 | // long elapsed = System.currentTimeMillis() - now; | |
461 | // jalview.bin.Console.outPrintln(-elapsed); | |
462 | } | |
463 | ||
464 | /** | |
465 | * Derive the gap count annotation row. | |
466 | * | |
467 | * @param gaprow | |
468 | * the annotation row to add annotations to | |
469 | * @param profiles | |
470 | * the source consensus data | |
471 | * @param startCol | |
472 | * start column (inclusive) | |
473 | * @param endCol | |
474 | * end column (exclusive) | |
475 | */ | |
476 | 1883 | public static void completeGapAnnot(AlignmentAnnotation gaprow, |
477 | ProfilesI profiles, int startCol, int endCol, long nseq) | |
478 | { | |
479 | 1883 | if (gaprow == null || gaprow.annotations == null |
480 | || gaprow.annotations.length < endCol) | |
481 | { | |
482 | /* | |
483 | * called with a bad alignment annotation row | |
484 | * wait for it to be initialised properly | |
485 | */ | |
486 | 0 | return; |
487 | } | |
488 | // always set ranges again | |
489 | 1883 | gaprow.graphMax = nseq; |
490 | 1883 | gaprow.graphMin = 0; |
491 | 1883 | double scale = 0.8 / nseq; |
492 | 684201 | for (int i = startCol; i < endCol; i++) |
493 | { | |
494 | 682319 | ProfileI profile = profiles.get(i); |
495 | 682340 | if (profile == null) |
496 | { | |
497 | /* | |
498 | * happens if sequences calculated over were | |
499 | * shorter than alignment width | |
500 | */ | |
501 | 0 | gaprow.annotations[i] = null; |
502 | 0 | return; |
503 | } | |
504 | ||
505 | 682353 | final int gapped = profile.getNonGapped(); |
506 | ||
507 | 682354 | String description = "" + gapped; |
508 | ||
509 | 682360 | gaprow.annotations[i] = new Annotation("", description, '\0', gapped, |
510 | jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY, | |
511 | (float) scale * gapped)); | |
512 | } | |
513 | } | |
514 | ||
515 | /** | |
516 | * Returns a tooltip showing either | |
517 | * <ul> | |
518 | * <li>the full profile (percentages of all residues present), if | |
519 | * showSequenceLogo is true, or</li> | |
520 | * <li>just the modal (most common) residue(s), if showSequenceLogo is | |
521 | * false</li> | |
522 | * </ul> | |
523 | * Percentages are as a fraction of all sequence, or only ungapped sequences | |
524 | * if ignoreGaps is true. | |
525 | * | |
526 | * @param profile | |
527 | * @param pid | |
528 | * @param showSequenceLogo | |
529 | * @param ignoreGaps | |
530 | * @param dp | |
531 | * the number of decimal places to format percentages to | |
532 | * @return | |
533 | */ | |
534 | 178308 | static String getTooltip(ProfileI profile, float pid, |
535 | boolean showSequenceLogo, boolean ignoreGaps, int dp) | |
536 | { | |
537 | 178308 | ResidueCount counts = profile.getCounts(); |
538 | ||
539 | 178308 | String description = null; |
540 | 178308 | if (counts != null && showSequenceLogo) |
541 | { | |
542 | 62511 | int normaliseBy = ignoreGaps ? profile.getNonGapped() |
543 | : profile.getHeight(); | |
544 | 62511 | description = counts.getTooltip(normaliseBy, dp); |
545 | } | |
546 | else | |
547 | { | |
548 | 115797 | StringBuilder sb = new StringBuilder(64); |
549 | 115797 | String maxRes = profile.getModalResidue(); |
550 | 115797 | if (maxRes.length() > 1) |
551 | { | |
552 | 2870 | sb.append("[").append(maxRes).append("]"); |
553 | } | |
554 | else | |
555 | { | |
556 | 112927 | sb.append(maxRes); |
557 | } | |
558 | 115797 | if (maxRes.length() > 0) |
559 | { | |
560 | 112622 | sb.append(" "); |
561 | 112622 | Format.appendPercentage(sb, pid, dp); |
562 | 112622 | sb.append("%"); |
563 | } | |
564 | 115797 | description = sb.toString(); |
565 | } | |
566 | 178308 | return description; |
567 | } | |
568 | ||
569 | 523138 | static String getSSTooltip(ProfileI profile, float pid, |
570 | boolean showSequenceLogo, boolean ignoreGaps, int dp) | |
571 | { | |
572 | 523138 | SecondaryStructureCount counts = profile.getSSCounts(); |
573 | ||
574 | 523138 | String description = null; |
575 | 523138 | if (counts != null && showSequenceLogo) |
576 | { | |
577 | 47156 | int normaliseBy = ignoreGaps ? profile.getNonGapped() |
578 | : profile.getHeight(); | |
579 | 47156 | description = counts.getTooltip(normaliseBy, dp); |
580 | } | |
581 | else | |
582 | { | |
583 | 475982 | StringBuilder sb = new StringBuilder(64); |
584 | 475982 | String maxSS = profile.getModalSS(); |
585 | 475982 | if (maxSS.length() > 1) |
586 | { | |
587 | 720 | sb.append("[").append(maxSS).append("]"); |
588 | } | |
589 | else | |
590 | { | |
591 | 475262 | sb.append(maxSS); |
592 | } | |
593 | 475982 | if (maxSS.length() > 0) |
594 | { | |
595 | 11645 | sb.append(" "); |
596 | 11645 | Format.appendPercentage(sb, pid, dp); |
597 | 11645 | sb.append("%"); |
598 | } | |
599 | 475982 | description = sb.toString(); |
600 | } | |
601 | 523138 | return description; |
602 | } | |
603 | ||
604 | /** | |
605 | * Returns the sorted profile for the given consensus data. The returned array | |
606 | * contains | |
607 | * | |
608 | * <pre> | |
609 | * [profileType, numberOfValues, totalPercent, charValue1, percentage1, charValue2, percentage2, ...] | |
610 | * in descending order of percentage value | |
611 | * </pre> | |
612 | * | |
613 | * @param profile | |
614 | * the data object from which to extract and sort values | |
615 | * @param ignoreGaps | |
616 | * if true, only non-gapped values are included in percentage | |
617 | * calculations | |
618 | * @return | |
619 | */ | |
620 | 104411 | public static int[] extractProfile(ProfileI profile, boolean ignoreGaps) |
621 | { | |
622 | 104411 | char[] symbols; |
623 | 104411 | int[] values; |
624 | ||
625 | 104411 | if (profile.getCounts() != null) |
626 | { | |
627 | 104411 | ResidueCount counts = profile.getCounts(); |
628 | 104411 | SymbolCounts symbolCounts = counts.getSymbolCounts(); |
629 | 104411 | symbols = symbolCounts.symbols; |
630 | 104411 | values = symbolCounts.values; |
631 | ||
632 | } | |
633 | 0 | else if (profile.getSSCounts() != null) |
634 | { | |
635 | 0 | SecondaryStructureCount counts = profile.getSSCounts(); |
636 | // to do | |
637 | 0 | SecondaryStructureCount.SymbolCounts symbolCounts = counts |
638 | .getSymbolCounts(); | |
639 | 0 | symbols = symbolCounts.symbols; |
640 | 0 | values = symbolCounts.values; |
641 | } | |
642 | else | |
643 | { | |
644 | 0 | return null; |
645 | } | |
646 | ||
647 | 104411 | QuickSort.sort(values, symbols); |
648 | 104411 | int totalPercentage = 0; |
649 | 104411 | final int divisor = ignoreGaps ? profile.getNonGapped() |
650 | : profile.getHeight(); | |
651 | ||
652 | /* | |
653 | * traverse the arrays in reverse order (highest counts first) | |
654 | */ | |
655 | 104411 | int[] result = new int[3 + 2 * symbols.length]; |
656 | 104411 | int nextArrayPos = 3; |
657 | 104411 | int nonZeroCount = 0; |
658 | ||
659 | 295941 | for (int i = symbols.length - 1; i >= 0; i--) |
660 | { | |
661 | 191532 | int theChar = symbols[i]; |
662 | 191532 | int charCount = values[i]; |
663 | 191532 | final int percentage = (charCount * 100) / divisor; |
664 | 191532 | if (percentage == 0) |
665 | { | |
666 | /* | |
667 | * this count (and any remaining) round down to 0% - discard | |
668 | */ | |
669 | 2 | break; |
670 | } | |
671 | 191530 | nonZeroCount++; |
672 | 191530 | result[nextArrayPos++] = theChar; |
673 | 191530 | result[nextArrayPos++] = percentage; |
674 | 191530 | totalPercentage += percentage; |
675 | } | |
676 | ||
677 | /* | |
678 | * truncate array if any zero values were discarded | |
679 | */ | |
680 | 104411 | if (nonZeroCount < symbols.length) |
681 | { | |
682 | 2 | int[] tmp = new int[3 + 2 * nonZeroCount]; |
683 | 2 | System.arraycopy(result, 0, tmp, 0, tmp.length); |
684 | 2 | result = tmp; |
685 | } | |
686 | ||
687 | /* | |
688 | * fill in 'header' values | |
689 | */ | |
690 | 104411 | result[0] = AlignmentAnnotation.SEQUENCE_PROFILE; |
691 | 104411 | result[1] = nonZeroCount; |
692 | 104411 | result[2] = totalPercentage; |
693 | ||
694 | 104411 | return result; |
695 | } | |
696 | ||
697 | /** | |
698 | * Extract a sorted extract of cDNA codon profile data. The returned array | |
699 | * contains | |
700 | * | |
701 | * <pre> | |
702 | * [profileType, numberOfValues, totalPercentage, charValue1, percentage1, charValue2, percentage2, ...] | |
703 | * in descending order of percentage value, where the character values encode codon triplets | |
704 | * </pre> | |
705 | * | |
706 | * @param hashtable | |
707 | * @return | |
708 | */ | |
709 | 2 | public static int[] extractCdnaProfile( |
710 | Hashtable<String, Object> hashtable, boolean ignoreGaps) | |
711 | { | |
712 | // this holds #seqs, #ungapped, and then codon count, indexed by encoded | |
713 | // codon triplet | |
714 | 2 | int[] codonCounts = (int[]) hashtable.get(PROFILE); |
715 | 2 | int[] sortedCounts = new int[codonCounts.length - 2]; |
716 | 2 | System.arraycopy(codonCounts, 2, sortedCounts, 0, |
717 | codonCounts.length - 2); | |
718 | ||
719 | 2 | int[] result = new int[3 + 2 * sortedCounts.length]; |
720 | // first value is just the type of profile data | |
721 | 2 | result[0] = AlignmentAnnotation.CDNA_PROFILE; |
722 | ||
723 | 2 | char[] codons = new char[sortedCounts.length]; |
724 | 130 | for (int i = 0; i < codons.length; i++) |
725 | { | |
726 | 128 | codons[i] = (char) i; |
727 | } | |
728 | 2 | QuickSort.sort(sortedCounts, codons); |
729 | 2 | int totalPercentage = 0; |
730 | 2 | int distinctValuesCount = 0; |
731 | 2 | int j = 3; |
732 | 2 | int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0]; |
733 | 8 | for (int i = codons.length - 1; i >= 0; i--) |
734 | { | |
735 | 8 | final int codonCount = sortedCounts[i]; |
736 | 8 | if (codonCount == 0) |
737 | { | |
738 | 0 | break; // nothing else of interest here |
739 | } | |
740 | 8 | final int percentage = codonCount * 100 / divisor; |
741 | 8 | if (percentage == 0) |
742 | { | |
743 | /* | |
744 | * this (and any remaining) values rounded down to 0 - discard | |
745 | */ | |
746 | 2 | break; |
747 | } | |
748 | 6 | distinctValuesCount++; |
749 | 6 | result[j++] = codons[i]; |
750 | 6 | result[j++] = percentage; |
751 | 6 | totalPercentage += percentage; |
752 | } | |
753 | 2 | result[2] = totalPercentage; |
754 | ||
755 | /* | |
756 | * Just return the non-zero values | |
757 | */ | |
758 | // todo next value is redundant if we limit the array to non-zero counts | |
759 | 2 | result[1] = distinctValuesCount; |
760 | 2 | return Arrays.copyOfRange(result, 0, j); |
761 | } | |
762 | ||
763 | /** | |
764 | * Compute a consensus for the cDNA coding for a protein alignment. | |
765 | * | |
766 | * @param alignment | |
767 | * the protein alignment (which should hold mappings to cDNA | |
768 | * sequences) | |
769 | * @param hconsensus | |
770 | * the consensus data stores to be populated (one per column) | |
771 | */ | |
772 | 4 | public static void calculateCdna(AlignmentI alignment, |
773 | Hashtable<String, Object>[] hconsensus) | |
774 | { | |
775 | 4 | final char gapCharacter = alignment.getGapCharacter(); |
776 | 4 | List<AlignedCodonFrame> mappings = alignment.getCodonFrames(); |
777 | 4 | if (mappings == null || mappings.isEmpty()) |
778 | { | |
779 | 0 | return; |
780 | } | |
781 | ||
782 | 4 | int cols = alignment.getWidth(); |
783 | 1928 | for (int col = 0; col < cols; col++) |
784 | { | |
785 | // todo would prefer a Java bean for consensus data | |
786 | 1924 | Hashtable<String, Object> columnHash = new Hashtable<>(); |
787 | // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1) | |
788 | 1924 | int[] codonCounts = new int[66]; |
789 | 1924 | codonCounts[0] = alignment.getSequences().size(); |
790 | 1924 | int ungappedCount = 0; |
791 | 1924 | for (SequenceI seq : alignment.getSequences()) |
792 | { | |
793 | 20870 | if (seq.getCharAt(col) == gapCharacter) |
794 | { | |
795 | 10166 | continue; |
796 | } | |
797 | 10704 | List<char[]> codons = MappingUtils.findCodonsFor(seq, col, |
798 | mappings); | |
799 | 10704 | for (char[] codon : codons) |
800 | { | |
801 | 10657 | int codonEncoded = CodingUtils.encodeCodon(codon); |
802 | 10657 | if (codonEncoded >= 0) |
803 | { | |
804 | 10657 | codonCounts[codonEncoded + 2]++; |
805 | 10657 | ungappedCount++; |
806 | 10657 | break; |
807 | } | |
808 | } | |
809 | } | |
810 | 1924 | codonCounts[1] = ungappedCount; |
811 | // todo: sort values here, save counts and codons? | |
812 | 1924 | columnHash.put(PROFILE, codonCounts); |
813 | 1924 | hconsensus[col] = columnHash; |
814 | } | |
815 | } | |
816 | ||
817 | /** | |
818 | * Derive displayable cDNA consensus annotation from computed consensus data. | |
819 | * | |
820 | * @param consensusAnnotation | |
821 | * the annotation row to be populated for display | |
822 | * @param consensusData | |
823 | * the computed consensus data | |
824 | * @param showProfileLogo | |
825 | * if true show all symbols present at each position, else only the | |
826 | * modal value | |
827 | * @param nseqs | |
828 | * the number of sequences in the alignment | |
829 | */ | |
830 | 3 | public static void completeCdnaConsensus( |
831 | AlignmentAnnotation consensusAnnotation, | |
832 | Hashtable<String, Object>[] consensusData, | |
833 | boolean showProfileLogo, int nseqs) | |
834 | { | |
835 | 3 | if (consensusAnnotation == null |
836 | || consensusAnnotation.annotations == null | |
837 | || consensusAnnotation.annotations.length < consensusData.length) | |
838 | { | |
839 | // called with a bad alignment annotation row - wait for it to be | |
840 | // initialised properly | |
841 | 0 | return; |
842 | } | |
843 | ||
844 | // ensure codon triplet scales with font size | |
845 | 3 | consensusAnnotation.scaleColLabel = true; |
846 | 981 | for (int col = 0; col < consensusData.length; col++) |
847 | { | |
848 | 978 | Hashtable<String, Object> hci = consensusData[col]; |
849 | 978 | if (hci == null) |
850 | { | |
851 | // gapped protein column? | |
852 | 0 | continue; |
853 | } | |
854 | // array holds #seqs, #ungapped, then codon counts indexed by codon | |
855 | 978 | final int[] codonCounts = (int[]) hci.get(PROFILE); |
856 | 978 | int totalCount = 0; |
857 | ||
858 | /* | |
859 | * First pass - get total count and find the highest | |
860 | */ | |
861 | 978 | final char[] codons = new char[codonCounts.length - 2]; |
862 | 63570 | for (int j = 2; j < codonCounts.length; j++) |
863 | { | |
864 | 62592 | final int codonCount = codonCounts[j]; |
865 | 62592 | codons[j - 2] = (char) (j - 2); |
866 | 62592 | totalCount += codonCount; |
867 | } | |
868 | ||
869 | /* | |
870 | * Sort array of encoded codons by count ascending - so the modal value | |
871 | * goes to the end; start by copying the count (dropping the first value) | |
872 | */ | |
873 | 978 | int[] sortedCodonCounts = new int[codonCounts.length - 2]; |
874 | 978 | System.arraycopy(codonCounts, 2, sortedCodonCounts, 0, |
875 | codonCounts.length - 2); | |
876 | 978 | QuickSort.sort(sortedCodonCounts, codons); |
877 | ||
878 | 978 | int modalCodonEncoded = codons[codons.length - 1]; |
879 | 978 | int modalCodonCount = sortedCodonCounts[codons.length - 1]; |
880 | 978 | String modalCodon = String |
881 | .valueOf(CodingUtils.decodeCodon(modalCodonEncoded)); | |
882 | 978 | if (sortedCodonCounts.length > 1 && sortedCodonCounts[codons.length |
883 | - 2] == sortedCodonCounts[codons.length - 1]) | |
884 | { | |
885 | /* | |
886 | * two or more codons share the modal count | |
887 | */ | |
888 | 25 | modalCodon = "+"; |
889 | } | |
890 | 978 | float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100 |
891 | / (float) totalCount; | |
892 | ||
893 | /* | |
894 | * todo ? Replace consensus hashtable with sorted arrays of codons and | |
895 | * counts (non-zero only). Include total count in count array [0]. | |
896 | */ | |
897 | ||
898 | /* | |
899 | * Scan sorted array backwards for most frequent values first. Show | |
900 | * repeated values compactly. | |
901 | */ | |
902 | 978 | StringBuilder mouseOver = new StringBuilder(32); |
903 | 978 | StringBuilder samePercent = new StringBuilder(); |
904 | 978 | String percent = null; |
905 | 978 | String lastPercent = null; |
906 | 978 | int percentDecPl = getPercentageDp(nseqs); |
907 | ||
908 | 1931 | for (int j = codons.length - 1; j >= 0; j--) |
909 | { | |
910 | 1931 | int codonCount = sortedCodonCounts[j]; |
911 | 1931 | if (codonCount == 0) |
912 | { | |
913 | /* | |
914 | * remaining codons are 0% - ignore, but finish off the last one if | |
915 | * necessary | |
916 | */ | |
917 | 978 | if (samePercent.length() > 0) |
918 | { | |
919 | 953 | mouseOver.append(samePercent).append(": ").append(percent) |
920 | .append("% "); | |
921 | } | |
922 | 978 | break; |
923 | } | |
924 | 953 | int codonEncoded = codons[j]; |
925 | 953 | final int pct = codonCount * 100 / totalCount; |
926 | 953 | String codon = String |
927 | .valueOf(CodingUtils.decodeCodon(codonEncoded)); | |
928 | 953 | StringBuilder sb = new StringBuilder(); |
929 | 953 | Format.appendPercentage(sb, pct, percentDecPl); |
930 | 953 | percent = sb.toString(); |
931 | 953 | if (showProfileLogo || codonCount == modalCodonCount) |
932 | { | |
933 | 953 | if (percent.equals(lastPercent) && j > 0) |
934 | { | |
935 | 0 | samePercent.append(samePercent.length() == 0 ? "" : ", "); |
936 | 0 | samePercent.append(codon); |
937 | } | |
938 | else | |
939 | { | |
940 | 953 | if (samePercent.length() > 0) |
941 | { | |
942 | 0 | mouseOver.append(samePercent).append(": ").append(lastPercent) |
943 | .append("% "); | |
944 | } | |
945 | 953 | samePercent.setLength(0); |
946 | 953 | samePercent.append(codon); |
947 | } | |
948 | 953 | lastPercent = percent; |
949 | } | |
950 | } | |
951 | ||
952 | 978 | consensusAnnotation.annotations[col] = new Annotation(modalCodon, |
953 | mouseOver.toString(), ' ', pid); | |
954 | } | |
955 | } | |
956 | ||
957 | /** | |
958 | * Returns the number of decimal places to show for profile percentages. For | |
959 | * less than 100 sequences, returns zero (the integer percentage value will be | |
960 | * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc. | |
961 | * | |
962 | * @param nseq | |
963 | * @return | |
964 | */ | |
965 | 702404 | protected static int getPercentageDp(long nseq) |
966 | { | |
967 | 702416 | int scale = 0; |
968 | 702411 | while (nseq >= 100) |
969 | { | |
970 | 0 | scale++; |
971 | 0 | nseq /= 10; |
972 | } | |
973 | 702414 | return scale; |
974 | } | |
975 | } |