Clover icon

Coverage Report

  1. Project Clover database Tue Oct 29 2024 21:36:55 GMT
  2. Package jalview.util

File Comparison.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

80
155
19
1
606
362
83
0.54
8.16
19
4.37

Classes

Class Line # Actions
Comparison 33 155 83
0.7952755779.5%
 

Contributing tests

This file is covered by 751 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.util;
22   
23    import java.util.ArrayList;
24    import java.util.List;
25   
26    import jalview.bin.Cache;
27    import jalview.bin.Console;
28    import jalview.datamodel.SequenceI;
29   
30    /**
31    * Assorted methods for analysing or comparing sequences.
32    */
 
33    public class Comparison
34    {
35    private static final int EIGHTY_FIVE = 85;
36   
37    private static final int NUCLEOTIDE_COUNT_PERCENT;
38   
39    private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT;
40   
41    private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE;
42   
43    private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE;
44   
45    private static final boolean NUCLEOTIDE_AMBIGUITY_DETECTION;
46   
47    public static final char GAP_SPACE = ' ';
48   
49    public static final char GAP_DOT = '.';
50   
51    public static final char GAP_DASH = '-';
52   
53    public static final String GapChars = new String(
54    new char[]
55    { GAP_SPACE, GAP_DOT, GAP_DASH });
56   
 
57  54 toggle static
58    {
59    // these options read only at start of session
60  54 NUCLEOTIDE_COUNT_PERCENT = Cache.getDefault("NUCLEOTIDE_COUNT_PERCENT",
61    55);
62  54 NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache.getDefault(
63    "NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT", 95);
64  54 NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
65    .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
66  54 NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
67    .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
68  54 NUCLEOTIDE_AMBIGUITY_DETECTION = Cache
69    .getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true);
70    }
71   
72    /**
73    * DOCUMENT ME!
74    *
75    * @param ii
76    * DOCUMENT ME!
77    * @param jj
78    * DOCUMENT ME!
79    *
80    * @return DOCUMENT ME!
81    */
 
82  0 toggle public static final float compare(SequenceI ii, SequenceI jj)
83    {
84  0 return Comparison.compare(ii, jj, 0, ii.getLength() - 1);
85    }
86   
87    /**
88    * this was supposed to be an ungapped pid calculation
89    *
90    * @param ii
91    * SequenceI
92    * @param jj
93    * SequenceI
94    * @param start
95    * int
96    * @param end
97    * int
98    * @return float
99    */
 
100  0 toggle public static float compare(SequenceI ii, SequenceI jj, int start,
101    int end)
102    {
103  0 String si = ii.getSequenceAsString();
104  0 String sj = jj.getSequenceAsString();
105   
106  0 int ilen = si.length() - 1;
107  0 int jlen = sj.length() - 1;
108   
109  0 while (Comparison.isGap(si.charAt(start + ilen)))
110    {
111  0 ilen--;
112    }
113   
114  0 while (Comparison.isGap(sj.charAt(start + jlen)))
115    {
116  0 jlen--;
117    }
118   
119  0 int match = 0;
120  0 float pid = -1;
121   
122  0 if (ilen > jlen)
123    {
124  0 for (int j = 0; j < jlen; j++)
125    {
126  0 if (si.substring(start + j, start + j + 1)
127    .equals(sj.substring(start + j, start + j + 1)))
128    {
129  0 match++;
130    }
131    }
132   
133  0 pid = (float) match / (float) ilen * 100;
134    }
135    else
136    {
137  0 for (int j = 0; j < jlen; j++)
138    {
139  0 if (si.substring(start + j, start + j + 1)
140    .equals(sj.substring(start + j, start + j + 1)))
141    {
142  0 match++;
143    }
144    }
145   
146  0 pid = (float) match / (float) jlen * 100;
147    }
148   
149  0 return pid;
150    }
151   
152    /**
153    * this is a gapped PID calculation
154    *
155    * @param s1
156    * SequenceI
157    * @param s2
158    * SequenceI
159    * @return float
160    * @deprecated use PIDModel.computePID()
161    */
 
162  8 toggle @Deprecated
163    public final static float PID(String seq1, String seq2)
164    {
165  8 return PID(seq1, seq2, 0, seq1.length());
166    }
167   
168    static final int caseShift = 'a' - 'A';
169   
170    // Another pid with region specification
171    /**
172    * @deprecated use PIDModel.computePID()
173    */
 
174  8 toggle @Deprecated
175    public final static float PID(String seq1, String seq2, int start,
176    int end)
177    {
178  8 return PID(seq1, seq2, start, end, true, false);
179    }
180   
181    /**
182    * Calculate percent identity for a pair of sequences over a particular range,
183    * with different options for ignoring gaps.
184    *
185    * @param seq1
186    * @param seq2
187    * @param start
188    * - position in seqs
189    * @param end
190    * - position in seqs
191    * @param wcGaps
192    * - if true - gaps match any character, if false, do not match
193    * anything
194    * @param ungappedOnly
195    * - if true - only count PID over ungapped columns
196    * @return
197    * @deprecated use PIDModel.computePID()
198    */
 
199  12 toggle @Deprecated
200    public final static float PID(String seq1, String seq2, int start,
201    int end, boolean wcGaps, boolean ungappedOnly)
202    {
203  12 int s1len = seq1.length();
204  12 int s2len = seq2.length();
205   
206  12 int len = Math.min(s1len, s2len);
207   
208  12 if (end < len)
209    {
210  0 len = end;
211    }
212   
213  12 if (len < start)
214    {
215  0 start = len - 1; // we just use a single residue for the difference
216    }
217   
218  12 int elen = len - start, bad = 0;
219  12 char chr1;
220  12 char chr2;
221  12 boolean agap;
222  109 for (int i = start; i < len; i++)
223    {
224  97 chr1 = seq1.charAt(i);
225   
226  97 chr2 = seq2.charAt(i);
227  97 agap = isGap(chr1) || isGap(chr2);
228  97 if ('a' <= chr1 && chr1 <= 'z')
229    {
230    // TO UPPERCASE !!!
231    // Faster than toUpperCase
232  35 chr1 -= caseShift;
233    }
234  97 if ('a' <= chr2 && chr2 <= 'z')
235    {
236    // TO UPPERCASE !!!
237    // Faster than toUpperCase
238  48 chr2 -= caseShift;
239    }
240   
241  97 if (chr1 != chr2)
242    {
243  30 if (agap)
244    {
245  18 if (ungappedOnly)
246    {
247  4 elen--;
248    }
249  14 else if (!wcGaps)
250    {
251  2 bad++;
252    }
253    }
254    else
255    {
256  12 bad++;
257    }
258    }
259   
260    }
261  12 if (elen < 1)
262    {
263  0 return 0f;
264    }
265  12 return ((float) 100 * (elen - bad)) / elen;
266    }
267   
268    /**
269    * Answers true if the supplied character is a recognised gap character, else
270    * false. Currently hard-coded to recognise '-', '-' or ' ' (hyphen / dot /
271    * space).
272    *
273    * @param c
274    *
275    * @return
276    */
 
277  50636542 toggle public static final boolean isGap(char c)
278    {
279  50810255 return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE;
280    }
281   
282    /**
283    * Overloaded method signature to test whether a single sequence is nucleotide
284    * (that is, more than 85% CGTAUNX)
285    *
286    * @param seq
287    * @return
288    */
 
289  2427 toggle public static final boolean isNucleotide(SequenceI seq)
290    {
291  2427 if (seq == null || seq.getLength() == 0)
292    {
293  1 return false;
294    }
295  2426 long ntCount = 0; // nucleotide symbol count (does not include ntaCount)
296  2426 long aaCount = 0; // non-nucleotide, non-gap symbol count (includes nCount
297    // and ntaCount)
298  2426 long nCount = 0; // "Unknown" (N) symbol count
299  2426 long xCount = 0; // Also used as "Unknown" (X) symbol count
300  2426 long ntaCount = 0; // nucleotide ambiguity symbol count
301   
302  2426 int len = seq.getLength();
303  5383191 for (int i = 0; i < len; i++)
304    {
305  5380765 char c = seq.getCharAt(i);
306  5380765 if (isNucleotide(c))
307    {
308  1015871 ntCount++;
309    }
310  4364894 else if (!isGap(c))
311    {
312  79435 aaCount++;
313  79435 if (isN(c))
314    {
315  2997 nCount++;
316    }
317    else
318    {
319  76438 if (isX(c))
320    {
321  82 xCount++;
322    }
323  76438 if (isNucleotideAmbiguity(c))
324    {
325  47089 ntaCount++;
326    }
327    }
328    }
329    }
330  2426 long allCount = ntCount + aaCount;
331   
332  2426 if (NUCLEOTIDE_AMBIGUITY_DETECTION)
333    {
334  2426 Console.debug("Performing new nucleotide detection routine");
335  2426 if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE)
336    {
337    // a long sequence.
338    // check for at least 55% nucleotide, and nucleotide and ambiguity codes
339    // (including N) must make up 95%
340  404 return ntCount * 100 >= NUCLEOTIDE_COUNT_PERCENT * allCount
341    && 100 * (ntCount + nCount
342    + ntaCount) >= NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT
343    * allCount;
344    }
345  2022 else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
346    {
347    // a short sequence.
348    // check if a short sequence is at least 55% nucleotide and the rest of
349    // the symbols are all X or all N
350  1698 if (ntCount * 100 >= NUCLEOTIDE_COUNT_PERCENT * allCount
351    && (nCount == aaCount || xCount == aaCount))
352    {
353  850 return true;
354    }
355   
356    // a short sequence.
357    // check for at least x% nucleotide and all the rest nucleotide
358    // ambiguity codes (including N), where x slides from 75% for sequences
359    // of length 4 (i.e. only one non-nucleotide) to 55% for sequences of
360    // length 100
361  848 return myShortSequenceNucleotideProportionCount(ntCount, allCount)
362    && nCount + ntaCount == aaCount;
363    }
364    else
365    {
366    // a very short sequence. (<4)
367    // all bases must be nucleotide
368  324 return ntCount > 0 && ntCount == allCount;
369    }
370    }
371    else
372    {
373  0 Console.debug("Performing old nucleotide detection routine");
374    /*
375    * Check for nucleotide count > 85% of total count (in a form that evades
376    * int / float conversion or divide by zero).
377    */
378  0 if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount)
379    {
380  0 return ntCount > 0; // all N is considered protein. Could use a
381    // threshold here too
382    }
383    }
384  0 return false;
385    }
386   
 
387  858 toggle protected static boolean myShortSequenceNucleotideProportionCount(
388    long ntCount, long allCount)
389    {
390    /**
391    * this method is valid only for NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE <=
392    * allCount <= NUCLEOTIDE_COUNT_SHORT_SEQUENCE
393    */
394    // the following is a simplified integer version of:
395    //
396    // a := allCount # the number of bases in the sequence
397    // n : = ntCount # the number of definite nucleotide bases
398    // vs := NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE
399    // s := NUCLEOTIDE_COUNT_SHORT_SEQUENCE
400    // lp := NUCLEOTIDE_COUNT_LOWER_PERCENT
401    // vsp := 1 - (1/a) # this is the proportion of required definite
402    // nucleotides
403    // # in a VERY_SHORT Sequence (4 bases).
404    // # This should be equivalent to all but one base in the sequence.
405    // p := (a - vs)/(s - vs) # proportion of the way between
406    // # VERY_SHORT and SHORT thresholds.
407    // tp := vsp + p * (lp/100 - vsp) # the proportion of definite nucleotides
408    // # required for this length of sequence.
409    // minNt := tp * a # the minimum number of definite nucleotide bases
410    // # required for this length of sequence.
411    //
412    // We are then essentially returning:
413    // # ntCount >= 55% of allCount and the rest are all nucleotide ambiguity:
414    // ntCount >= tp * allCount && nCount + ntaCount == aaCount
415    // but without going into float/double land
416  858 long LHS = 100 * allCount
417    * (NUCLEOTIDE_COUNT_SHORT_SEQUENCE
418    - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
419    * (ntCount - allCount + 1);
420  858 long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE)
421    * (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100);
422  858 return LHS >= RHS;
423    }
424   
425    /**
426    * Answers true if more than 85% of the sequence residues (ignoring gaps) are
427    * A, G, C, T or U, else false. This is just a heuristic guess and may give a
428    * wrong answer (as AGCT are also amino acid codes).
429    *
430    * @param seqs
431    * @return
432    */
 
433  1841 toggle public static final boolean isNucleotide(SequenceI[] seqs)
434    {
435  1841 if (seqs == null)
436    {
437  1 return false;
438    }
439    // true if we have seen a nucleotide sequence
440  1840 boolean na = false;
441  1840 for (SequenceI seq : seqs)
442    {
443  2669 if (seq == null)
444    {
445  0 continue;
446    }
447  2669 na = true;
448    // TODO could possibly make an informed guess just from the first sequence
449    // to save a lengthy calculation
450  2669 if (seq.isProtein())
451    {
452    // if even one looks like protein, the alignment is protein
453  1479 return false;
454    }
455    }
456  361 return na;
457    }
458   
459    /**
460    * Answers true if the character is one of aAcCgGtTuU
461    *
462    * @param c
463    * @return
464    */
 
465  16010936 toggle public static boolean isNucleotide(char c)
466    {
467  16018654 return isNucleotide(c, false);
468    }
469   
470    /**
471    * includeAmbiguity = true also includes Nucleotide Ambiguity codes
472    */
 
473  16032935 toggle public static boolean isNucleotide(char c, boolean includeAmbiguity)
474    {
475  16034012 char C = Character.toUpperCase(c);
476  16046153 switch (C)
477    {
478  590473 case 'A':
479  373888 case 'C':
480  434691 case 'G':
481  576436 case 'T':
482  11379 case 'U':
483  1986147 return true;
484    }
485  14090247 if (includeAmbiguity)
486    {
487  16 boolean ambiguity = isNucleotideAmbiguity(C);
488  16 if (ambiguity)
489  12 return true;
490    }
491  14089253 return false;
492    }
493   
494    /**
495    * Tests *only* nucleotide ambiguity codes (and not definite nucleotide codes)
496    */
 
497  76454 toggle public static boolean isNucleotideAmbiguity(char c)
498    {
499  76454 switch (Character.toUpperCase(c))
500    {
501  5330 case 'I':
502  83 case 'X':
503  3175 case 'R':
504  3679 case 'Y':
505  1136 case 'W':
506  7975 case 'S':
507  2140 case 'M':
508  5750 case 'K':
509  175 case 'B':
510  1918 case 'H':
511  8600 case 'D':
512  7140 case 'V':
513  47101 return true;
514  1 case 'N': // not counting N as nucleotide
515    }
516  29353 return false;
517    }
518   
 
519  79435 toggle public static boolean isN(char c)
520    {
521  79435 return 'n' == Character.toLowerCase(c);
522    }
523   
 
524  76438 toggle public static boolean isX(char c)
525    {
526  76438 return 'x' == Character.toLowerCase(c);
527    }
528   
529    /**
530    * Answers true if every character in the string is one of aAcCgGtTuU, or
531    * (optionally) a gap character (dot, dash, space), else false
532    *
533    * @param s
534    * @param allowGaps
535    * @return
536    */
 
537  22 toggle public static boolean isNucleotideSequence(String s, boolean allowGaps)
538    {
539  22 return isNucleotideSequence(s, allowGaps, false);
540    }
541   
 
542  24 toggle public static boolean isNucleotideSequence(String s, boolean allowGaps,
543    boolean includeAmbiguous)
544    {
545  24 if (s == null)
546    {
547  1 return false;
548    }
549  112 for (int i = 0; i < s.length(); i++)
550    {
551  97 char c = s.charAt(i);
552  97 if (!isNucleotide(c, includeAmbiguous))
553    {
554  13 if (!allowGaps || !isGap(c))
555    {
556  8 return false;
557    }
558    }
559    }
560  15 return true;
561    }
562   
563    /**
564    * Convenience overload of isNucleotide
565    *
566    * @param seqs
567    * @return
568    */
 
569  55 toggle public static boolean isNucleotide(SequenceI[][] seqs)
570    {
571  55 if (seqs == null)
572    {
573  1 return false;
574    }
575  54 List<SequenceI> flattened = new ArrayList<SequenceI>();
576  54 for (SequenceI[] ss : seqs)
577    {
578  67 for (SequenceI s : ss)
579    {
580  94 flattened.add(s);
581    }
582    }
583  54 final SequenceI[] oneDArray = flattened
584    .toArray(new SequenceI[flattened.size()]);
585  54 return isNucleotide(oneDArray);
586    }
587   
588    /**
589    * Compares two residues either case sensitively or case insensitively
590    * depending on the caseSensitive flag
591    *
592    * @param c1
593    * first char
594    * @param c2
595    * second char to compare with
596    * @param caseSensitive
597    * if true comparison will be case sensitive otherwise its not
598    * @return
599    */
 
600  42827 toggle public static boolean isSameResidue(char c1, char c2,
601    boolean caseSensitive)
602    {
603  42827 return caseSensitive ? c1 == c2
604    : Character.toUpperCase(c1) == Character.toUpperCase(c2);
605    }
606    }