Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.analysis

File AAFrequency.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
19% of files have more coverage

Code metrics

98
218
13
1
721
424
74
0.34
16.77
13
5.69

Classes

Class Line # Actions
AAFrequency 54 218 74 74
0.77507677.5%
 

Contributing tests

This file is covered by 82 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.AlignedCodonFrame;
24    import jalview.datamodel.AlignmentAnnotation;
25    import jalview.datamodel.AlignmentI;
26    import jalview.datamodel.Annotation;
27    import jalview.datamodel.Profile;
28    import jalview.datamodel.ProfileI;
29    import jalview.datamodel.Profiles;
30    import jalview.datamodel.ProfilesI;
31    import jalview.datamodel.ResidueCount;
32    import jalview.datamodel.ResidueCount.SymbolCounts;
33    import jalview.datamodel.SequenceI;
34    import jalview.ext.android.SparseIntArray;
35    import jalview.util.Comparison;
36    import jalview.util.Format;
37    import jalview.util.MappingUtils;
38    import jalview.util.QuickSort;
39   
40    import java.awt.Color;
41    import java.util.Arrays;
42    import java.util.Hashtable;
43    import java.util.List;
44   
45    /**
46    * Takes in a vector or array of sequences and column start and column end and
47    * returns a new Hashtable[] of size maxSeqLength, if Hashtable not supplied.
48    * This class is used extensively in calculating alignment colourschemes that
49    * depend on the amount of conservation in each alignment column.
50    *
51    * @author $author$
52    * @version $Revision$
53    */
 
54    public class AAFrequency
55    {
56    public static final String PROFILE = "P";
57   
58    /*
59    * Quick look-up of String value of char 'A' to 'Z'
60    */
61    private static final String[] CHARS = new String['Z' - 'A' + 1];
62   
 
63  1 toggle static
64    {
65  27 for (char c = 'A'; c <= 'Z'; c++)
66    {
67  26 CHARS[c - 'A'] = String.valueOf(c);
68    }
69    }
70   
 
71  3 toggle public static final ProfilesI calculate(List<SequenceI> list, int start,
72    int end)
73    {
74  3 return calculate(list, start, end, false);
75    }
76   
 
77  473 toggle public static final ProfilesI calculate(List<SequenceI> sequences,
78    int start, int end, boolean profile)
79    {
80  473 SequenceI[] seqs = new SequenceI[sequences.size()];
81  473 int width = 0;
82  473 synchronized (sequences)
83    {
84  4047 for (int i = 0; i < sequences.size(); i++)
85    {
86  3574 seqs[i] = sequences.get(i);
87  3574 int length = seqs[i].getLength();
88  3574 if (length > width)
89    {
90  483 width = length;
91    }
92    }
93   
94  473 if (end >= width)
95    {
96  281 end = width;
97    }
98   
99  473 ProfilesI reply = calculate(seqs, width, start, end, profile);
100  473 return reply;
101    }
102    }
103   
104    /**
105    * Calculate the consensus symbol(s) for each column in the given range.
106    *
107    * @param sequences
108    * @param width
109    * the full width of the alignment
110    * @param start
111    * start column (inclusive, base zero)
112    * @param end
113    * end column (exclusive)
114    * @param saveFullProfile
115    * if true, store all symbol counts
116    */
 
117  936 toggle public static final ProfilesI calculate(final SequenceI[] sequences,
118    int width, int start, int end, boolean saveFullProfile)
119    {
120    // long now = System.currentTimeMillis();
121  936 int seqCount = sequences.length;
122  936 boolean nucleotide = false;
123  936 int nucleotideCount = 0;
124  936 int peptideCount = 0;
125   
126  936 ProfileI[] result = new ProfileI[width];
127   
128  118435 for (int column = start; column < end; column++)
129    {
130    /*
131    * Apply a heuristic to detect nucleotide data (which can
132    * be counted in more compact arrays); here we test for
133    * more than 90% nucleotide; recheck every 10 columns in case
134    * of misleading data e.g. highly conserved Alanine in peptide!
135    * Mistakenly guessing nucleotide has a small performance cost,
136    * as it will result in counting in sparse arrays.
137    * Mistakenly guessing peptide has a small space cost,
138    * as it will use a larger than necessary array to hold counts.
139    */
140  117502 if (nucleotideCount > 100 && column % 10 == 0)
141    {
142  5708 nucleotide = (9 * peptideCount < nucleotideCount);
143    }
144  117498 ResidueCount residueCounts = new ResidueCount(nucleotide);
145   
146  1191097 for (int row = 0; row < seqCount; row++)
147    {
148  1073668 if (sequences[row] == null)
149    {
150  0 System.err.println(
151    "WARNING: Consensus skipping null sequence - possible race condition.");
152  0 continue;
153    }
154  1073563 if (sequences[row].getLength() > column)
155    {
156  1061259 char c = sequences[row].getCharAt(column);
157  1061199 residueCounts.add(c);
158  1062601 if (Comparison.isNucleotide(c))
159    {
160  275069 nucleotideCount++;
161    }
162  787723 else if (!Comparison.isGap(c))
163    {
164  659832 peptideCount++;
165    }
166    }
167    else
168    {
169    /*
170    * count a gap if the sequence doesn't reach this column
171    */
172  12355 residueCounts.addGap();
173    }
174    }
175   
176  117486 int maxCount = residueCounts.getModalCount();
177  117484 String maxResidue = residueCounts.getResiduesForCount(maxCount);
178  117499 int gapCount = residueCounts.getGapCount();
179  117499 ProfileI profile = new Profile(seqCount, gapCount, maxCount,
180    maxResidue);
181   
182  117505 if (saveFullProfile)
183    {
184  96615 profile.setCounts(residueCounts);
185    }
186   
187  117501 result[column] = profile;
188    }
189  936 return new Profiles(result);
190    // long elapsed = System.currentTimeMillis() - now;
191    // System.out.println(elapsed);
192    }
193   
194    /**
195    * Make an estimate of the profile size we are going to compute i.e. how many
196    * different characters may be present in it. Overestimating has a cost of
197    * using more memory than necessary. Underestimating has a cost of needing to
198    * extend the SparseIntArray holding the profile counts.
199    *
200    * @param profileSizes
201    * counts of sizes of profiles so far encountered
202    * @return
203    */
 
204  0 toggle static int estimateProfileSize(SparseIntArray profileSizes)
205    {
206  0 if (profileSizes.size() == 0)
207    {
208  0 return 4;
209    }
210   
211    /*
212    * could do a statistical heuristic here e.g. 75%ile
213    * for now just return the largest value
214    */
215  0 return profileSizes.keyAt(profileSizes.size() - 1);
216    }
217   
218    /**
219    * Derive the consensus annotations to be added to the alignment for display.
220    * This does not recompute the raw data, but may be called on a change in
221    * display options, such as 'ignore gaps', which may in turn result in a
222    * change in the derived values.
223    *
224    * @param consensus
225    * the annotation row to add annotations to
226    * @param profiles
227    * the source consensus data
228    * @param startCol
229    * start column (inclusive)
230    * @param endCol
231    * end column (exclusive)
232    * @param ignoreGaps
233    * if true, normalise residue percentages ignoring gaps
234    * @param showSequenceLogo
235    * if true include all consensus symbols, else just show modal
236    * residue
237    * @param nseq
238    * number of sequences
239    */
 
240  665 toggle public static void completeConsensus(AlignmentAnnotation consensus,
241    ProfilesI profiles, int startCol, int endCol, boolean ignoreGaps,
242    boolean showSequenceLogo, long nseq)
243    {
244    // long now = System.currentTimeMillis();
245  665 if (consensus == null || consensus.annotations == null
246    || consensus.annotations.length < endCol)
247    {
248    /*
249    * called with a bad alignment annotation row
250    * wait for it to be initialised properly
251    */
252  0 return;
253    }
254   
255  96547 for (int i = startCol; i < endCol; i++)
256    {
257  95882 ProfileI profile = profiles.get(i);
258  95884 if (profile == null)
259    {
260    /*
261    * happens if sequences calculated over were
262    * shorter than alignment width
263    */
264  0 consensus.annotations[i] = null;
265  0 return;
266    }
267   
268  95886 final int dp = getPercentageDp(nseq);
269   
270  95884 float value = profile.getPercentageIdentity(ignoreGaps);
271   
272  95885 String description = getTooltip(profile, value, showSequenceLogo,
273    ignoreGaps, dp);
274   
275  95890 String modalResidue = profile.getModalResidue();
276  95890 if ("".equals(modalResidue))
277    {
278  1498 modalResidue = "-";
279    }
280  94393 else if (modalResidue.length() > 1)
281    {
282  6608 modalResidue = "+";
283    }
284  95888 consensus.annotations[i] = new Annotation(modalResidue, description,
285    ' ', value);
286    }
287    // long elapsed = System.currentTimeMillis() - now;
288    // System.out.println(-elapsed);
289    }
290   
291    /**
292    * Derive the gap count annotation row.
293    *
294    * @param gaprow
295    * the annotation row to add annotations to
296    * @param profiles
297    * the source consensus data
298    * @param startCol
299    * start column (inclusive)
300    * @param endCol
301    * end column (exclusive)
302    */
 
303  481 toggle public static void completeGapAnnot(AlignmentAnnotation gaprow,
304    ProfilesI profiles, int startCol, int endCol, long nseq)
305    {
306  481 if (gaprow == null || gaprow.annotations == null
307    || gaprow.annotations.length < endCol)
308    {
309    /*
310    * called with a bad alignment annotation row
311    * wait for it to be initialised properly
312    */
313  0 return;
314    }
315    // always set ranges again
316  481 gaprow.graphMax = nseq;
317  481 gaprow.graphMin = 0;
318  481 double scale = 0.8 / nseq;
319  67795 for (int i = startCol; i < endCol; i++)
320    {
321  67314 ProfileI profile = profiles.get(i);
322  67314 if (profile == null)
323    {
324    /*
325    * happens if sequences calculated over were
326    * shorter than alignment width
327    */
328  0 gaprow.annotations[i] = null;
329  0 return;
330    }
331   
332  67314 final int gapped = profile.getNonGapped();
333   
334  67314 String description = "" + gapped;
335   
336  67314 gaprow.annotations[i] = new Annotation("", description, '\0', gapped,
337    jalview.util.ColorUtils.bleachColour(Color.DARK_GRAY,
338    (float) scale * gapped));
339    }
340    }
341   
342    /**
343    * Returns a tooltip showing either
344    * <ul>
345    * <li>the full profile (percentages of all residues present), if
346    * showSequenceLogo is true, or</li>
347    * <li>just the modal (most common) residue(s), if showSequenceLogo is
348    * false</li>
349    * </ul>
350    * Percentages are as a fraction of all sequence, or only ungapped sequences
351    * if ignoreGaps is true.
352    *
353    * @param profile
354    * @param pid
355    * @param showSequenceLogo
356    * @param ignoreGaps
357    * @param dp
358    * the number of decimal places to format percentages to
359    * @return
360    */
 
361  95884 toggle static String getTooltip(ProfileI profile, float pid,
362    boolean showSequenceLogo, boolean ignoreGaps, int dp)
363    {
364  95884 ResidueCount counts = profile.getCounts();
365   
366  95881 String description = null;
367  95892 if (counts != null && showSequenceLogo)
368    {
369  48518 int normaliseBy = ignoreGaps ? profile.getNonGapped()
370    : profile.getHeight();
371  48513 description = counts.getTooltip(normaliseBy, dp);
372    }
373    else
374    {
375  47380 StringBuilder sb = new StringBuilder(64);
376  47380 String maxRes = profile.getModalResidue();
377  47380 if (maxRes.length() > 1)
378    {
379  2230 sb.append("[").append(maxRes).append("]");
380    }
381    else
382    {
383  45150 sb.append(maxRes);
384    }
385  47380 if (maxRes.length() > 0)
386    {
387  46783 sb.append(" ");
388  46783 Format.appendPercentage(sb, pid, dp);
389  46783 sb.append("%");
390    }
391  47380 description = sb.toString();
392    }
393  95890 return description;
394    }
395   
396    /**
397    * Returns the sorted profile for the given consensus data. The returned array
398    * contains
399    *
400    * <pre>
401    * [profileType, numberOfValues, nonGapCount, charValue1, percentage1, charValue2, percentage2, ...]
402    * in descending order of percentage value
403    * </pre>
404    *
405    * @param profile
406    * the data object from which to extract and sort values
407    * @param ignoreGaps
408    * if true, only non-gapped values are included in percentage
409    * calculations
410    * @return
411    */
 
412  48860 toggle public static int[] extractProfile(ProfileI profile, boolean ignoreGaps)
413    {
414  48860 int[] rtnval = new int[64];
415  48860 ResidueCount counts = profile.getCounts();
416  48860 if (counts == null)
417    {
418  0 return null;
419    }
420   
421  48860 SymbolCounts symbolCounts = counts.getSymbolCounts();
422  48860 char[] symbols = symbolCounts.symbols;
423  48860 int[] values = symbolCounts.values;
424  48860 QuickSort.sort(values, symbols);
425  48860 int nextArrayPos = 2;
426  48860 int totalPercentage = 0;
427  48860 final int divisor = ignoreGaps ? profile.getNonGapped()
428    : profile.getHeight();
429   
430    /*
431    * traverse the arrays in reverse order (highest counts first)
432    */
433  128760 for (int i = symbols.length - 1; i >= 0; i--)
434    {
435  79900 int theChar = symbols[i];
436  79900 int charCount = values[i];
437   
438  79900 rtnval[nextArrayPos++] = theChar;
439  79900 final int percentage = (charCount * 100) / divisor;
440  79900 rtnval[nextArrayPos++] = percentage;
441  79900 totalPercentage += percentage;
442    }
443  48860 rtnval[0] = symbols.length;
444  48860 rtnval[1] = totalPercentage;
445  48860 int[] result = new int[rtnval.length + 1];
446  48860 result[0] = AlignmentAnnotation.SEQUENCE_PROFILE;
447  48860 System.arraycopy(rtnval, 0, result, 1, rtnval.length);
448   
449  48860 return result;
450    }
451   
452    /**
453    * Extract a sorted extract of cDNA codon profile data. The returned array
454    * contains
455    *
456    * <pre>
457    * [profileType, numberOfValues, totalCount, charValue1, percentage1, charValue2, percentage2, ...]
458    * in descending order of percentage value, where the character values encode codon triplets
459    * </pre>
460    *
461    * @param hashtable
462    * @return
463    */
 
464  0 toggle public static int[] extractCdnaProfile(Hashtable hashtable,
465    boolean ignoreGaps)
466    {
467    // this holds #seqs, #ungapped, and then codon count, indexed by encoded
468    // codon triplet
469  0 int[] codonCounts = (int[]) hashtable.get(PROFILE);
470  0 int[] sortedCounts = new int[codonCounts.length - 2];
471  0 System.arraycopy(codonCounts, 2, sortedCounts, 0,
472    codonCounts.length - 2);
473   
474  0 int[] result = new int[3 + 2 * sortedCounts.length];
475    // first value is just the type of profile data
476  0 result[0] = AlignmentAnnotation.CDNA_PROFILE;
477   
478  0 char[] codons = new char[sortedCounts.length];
479  0 for (int i = 0; i < codons.length; i++)
480    {
481  0 codons[i] = (char) i;
482    }
483  0 QuickSort.sort(sortedCounts, codons);
484  0 int totalPercentage = 0;
485  0 int distinctValuesCount = 0;
486  0 int j = 3;
487  0 int divisor = ignoreGaps ? codonCounts[1] : codonCounts[0];
488  0 for (int i = codons.length - 1; i >= 0; i--)
489    {
490  0 final int codonCount = sortedCounts[i];
491  0 if (codonCount == 0)
492    {
493  0 break; // nothing else of interest here
494    }
495  0 distinctValuesCount++;
496  0 result[j++] = codons[i];
497  0 final int percentage = codonCount * 100 / divisor;
498  0 result[j++] = percentage;
499  0 totalPercentage += percentage;
500    }
501  0 result[2] = totalPercentage;
502   
503    /*
504    * Just return the non-zero values
505    */
506    // todo next value is redundant if we limit the array to non-zero counts
507  0 result[1] = distinctValuesCount;
508  0 return Arrays.copyOfRange(result, 0, j);
509    }
510   
511    /**
512    * Compute a consensus for the cDNA coding for a protein alignment.
513    *
514    * @param alignment
515    * the protein alignment (which should hold mappings to cDNA
516    * sequences)
517    * @param hconsensus
518    * the consensus data stores to be populated (one per column)
519    */
 
520  1 toggle public static void calculateCdna(AlignmentI alignment,
521    Hashtable[] hconsensus)
522    {
523  1 final char gapCharacter = alignment.getGapCharacter();
524  1 List<AlignedCodonFrame> mappings = alignment.getCodonFrames();
525  1 if (mappings == null || mappings.isEmpty())
526    {
527  0 return;
528    }
529   
530  1 int cols = alignment.getWidth();
531  27 for (int col = 0; col < cols; col++)
532    {
533    // todo would prefer a Java bean for consensus data
534  26 Hashtable<String, int[]> columnHash = new Hashtable<String, int[]>();
535    // #seqs, #ungapped seqs, counts indexed by (codon encoded + 1)
536  26 int[] codonCounts = new int[66];
537  26 codonCounts[0] = alignment.getSequences().size();
538  26 int ungappedCount = 0;
539  26 for (SequenceI seq : alignment.getSequences())
540    {
541  52 if (seq.getCharAt(col) == gapCharacter)
542    {
543  8 continue;
544    }
545  44 List<char[]> codons = MappingUtils.findCodonsFor(seq, col,
546    mappings);
547  44 for (char[] codon : codons)
548    {
549  1 int codonEncoded = CodingUtils.encodeCodon(codon);
550  1 if (codonEncoded >= 0)
551    {
552  1 codonCounts[codonEncoded + 2]++;
553  1 ungappedCount++;
554    }
555    }
556    }
557  26 codonCounts[1] = ungappedCount;
558    // todo: sort values here, save counts and codons?
559  26 columnHash.put(PROFILE, codonCounts);
560  26 hconsensus[col] = columnHash;
561    }
562    }
563   
564    /**
565    * Derive displayable cDNA consensus annotation from computed consensus data.
566    *
567    * @param consensusAnnotation
568    * the annotation row to be populated for display
569    * @param consensusData
570    * the computed consensus data
571    * @param showProfileLogo
572    * if true show all symbols present at each position, else only the
573    * modal value
574    * @param nseqs
575    * the number of sequences in the alignment
576    */
 
577  1 toggle public static void completeCdnaConsensus(
578    AlignmentAnnotation consensusAnnotation,
579    Hashtable[] consensusData, boolean showProfileLogo, int nseqs)
580    {
581  1 if (consensusAnnotation == null
582    || consensusAnnotation.annotations == null
583    || consensusAnnotation.annotations.length < consensusData.length)
584    {
585    // called with a bad alignment annotation row - wait for it to be
586    // initialised properly
587  0 return;
588    }
589   
590    // ensure codon triplet scales with font size
591  1 consensusAnnotation.scaleColLabel = true;
592  27 for (int col = 0; col < consensusData.length; col++)
593    {
594  26 Hashtable hci = consensusData[col];
595  26 if (hci == null)
596    {
597    // gapped protein column?
598  0 continue;
599    }
600    // array holds #seqs, #ungapped, then codon counts indexed by codon
601  26 final int[] codonCounts = (int[]) hci.get(PROFILE);
602  26 int totalCount = 0;
603   
604    /*
605    * First pass - get total count and find the highest
606    */
607  26 final char[] codons = new char[codonCounts.length - 2];
608  1690 for (int j = 2; j < codonCounts.length; j++)
609    {
610  1664 final int codonCount = codonCounts[j];
611  1664 codons[j - 2] = (char) (j - 2);
612  1664 totalCount += codonCount;
613    }
614   
615    /*
616    * Sort array of encoded codons by count ascending - so the modal value
617    * goes to the end; start by copying the count (dropping the first value)
618    */
619  26 int[] sortedCodonCounts = new int[codonCounts.length - 2];
620  26 System.arraycopy(codonCounts, 2, sortedCodonCounts, 0,
621    codonCounts.length - 2);
622  26 QuickSort.sort(sortedCodonCounts, codons);
623   
624  26 int modalCodonEncoded = codons[codons.length - 1];
625  26 int modalCodonCount = sortedCodonCounts[codons.length - 1];
626  26 String modalCodon = String
627    .valueOf(CodingUtils.decodeCodon(modalCodonEncoded));
628  26 if (sortedCodonCounts.length > 1 && sortedCodonCounts[codons.length
629    - 2] == sortedCodonCounts[codons.length - 1])
630    {
631    /*
632    * two or more codons share the modal count
633    */
634  25 modalCodon = "+";
635    }
636  26 float pid = sortedCodonCounts[sortedCodonCounts.length - 1] * 100
637    / (float) totalCount;
638   
639    /*
640    * todo ? Replace consensus hashtable with sorted arrays of codons and
641    * counts (non-zero only). Include total count in count array [0].
642    */
643   
644    /*
645    * Scan sorted array backwards for most frequent values first. Show
646    * repeated values compactly.
647    */
648  26 StringBuilder mouseOver = new StringBuilder(32);
649  26 StringBuilder samePercent = new StringBuilder();
650  26 String percent = null;
651  26 String lastPercent = null;
652  26 int percentDecPl = getPercentageDp(nseqs);
653   
654  27 for (int j = codons.length - 1; j >= 0; j--)
655    {
656  27 int codonCount = sortedCodonCounts[j];
657  27 if (codonCount == 0)
658    {
659    /*
660    * remaining codons are 0% - ignore, but finish off the last one if
661    * necessary
662    */
663  26 if (samePercent.length() > 0)
664    {
665  1 mouseOver.append(samePercent).append(": ").append(percent)
666    .append("% ");
667    }
668  26 break;
669    }
670  1 int codonEncoded = codons[j];
671  1 final int pct = codonCount * 100 / totalCount;
672  1 String codon = String
673    .valueOf(CodingUtils.decodeCodon(codonEncoded));
674  1 StringBuilder sb = new StringBuilder();
675  1 Format.appendPercentage(sb, pct, percentDecPl);
676  1 percent = sb.toString();
677  1 if (showProfileLogo || codonCount == modalCodonCount)
678    {
679  1 if (percent.equals(lastPercent) && j > 0)
680    {
681  0 samePercent.append(samePercent.length() == 0 ? "" : ", ");
682  0 samePercent.append(codon);
683    }
684    else
685    {
686  1 if (samePercent.length() > 0)
687    {
688  0 mouseOver.append(samePercent).append(": ").append(lastPercent)
689    .append("% ");
690    }
691  1 samePercent.setLength(0);
692  1 samePercent.append(codon);
693    }
694  1 lastPercent = percent;
695    }
696    }
697   
698  26 consensusAnnotation.annotations[col] = new Annotation(modalCodon,
699    mouseOver.toString(), ' ', pid);
700    }
701    }
702   
703    /**
704    * Returns the number of decimal places to show for profile percentages. For
705    * less than 100 sequences, returns zero (the integer percentage value will be
706    * displayed). For 100-999 sequences, returns 1, for 1000-9999 returns 2, etc.
707    *
708    * @param nseq
709    * @return
710    */
 
711  95910 toggle protected static int getPercentageDp(long nseq)
712    {
713  95910 int scale = 0;
714  95910 while (nseq >= 100)
715    {
716  0 scale++;
717  0 nseq /= 10;
718    }
719  95910 return scale;
720    }
721    }