1 |
|
|
2 |
|
|
3 |
|
|
4 |
|
|
5 |
|
|
6 |
|
|
7 |
|
|
8 |
|
|
9 |
|
|
10 |
|
|
11 |
|
|
12 |
|
|
13 |
|
|
14 |
|
|
15 |
|
|
16 |
|
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
|
|
21 |
|
package jalview.util; |
22 |
|
|
23 |
|
import java.util.ArrayList; |
24 |
|
import java.util.List; |
25 |
|
|
26 |
|
import jalview.bin.Cache; |
27 |
|
import jalview.bin.Console; |
28 |
|
import jalview.datamodel.SequenceI; |
29 |
|
|
30 |
|
|
31 |
|
|
32 |
|
|
|
|
| 0% |
Uncovered Elements: 254 (254) |
Complexity: 83 |
Complexity Density: 0.54 |
|
33 |
|
public class Comparison |
34 |
|
{ |
35 |
|
private static final int EIGHTY_FIVE = 85; |
36 |
|
|
37 |
|
private static final int NUCLEOTIDE_COUNT_PERCENT; |
38 |
|
|
39 |
|
private static final int NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT; |
40 |
|
|
41 |
|
private static final int NUCLEOTIDE_COUNT_SHORT_SEQUENCE; |
42 |
|
|
43 |
|
private static final int NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE; |
44 |
|
|
45 |
|
private static final boolean NUCLEOTIDE_AMBIGUITY_DETECTION; |
46 |
|
|
47 |
|
public static final char GAP_SPACE = ' '; |
48 |
|
|
49 |
|
public static final char GAP_DOT = '.'; |
50 |
|
|
51 |
|
public static final char GAP_DASH = '-'; |
52 |
|
|
53 |
|
public static final String GapChars = new String( |
54 |
|
new char[] |
55 |
|
{ GAP_SPACE, GAP_DOT, GAP_DASH }); |
56 |
|
|
|
|
| 0% |
Uncovered Elements: 5 (5) |
Complexity: 1 |
Complexity Density: 0.2 |
|
57 |
0 |
static
{
  // The nucleotide-detection settings are looked up once, when the class
  // is initialised. The second argument of each call is presumably the
  // fallback used when the setting is absent - confirm against
  // Cache.getDefault.
  NUCLEOTIDE_COUNT_PERCENT = Cache
          .getDefault("NUCLEOTIDE_COUNT_PERCENT", 55);
  NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT = Cache
          .getDefault("NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT",
                  95);
  // NOTE: the two length settings use shorter keys than their field names
  NUCLEOTIDE_COUNT_SHORT_SEQUENCE = Cache
          .getDefault("NUCLEOTIDE_COUNT_SHORT", 100);
  NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE = Cache
          .getDefault("NUCLEOTIDE_COUNT_VERY_SHORT", 4);
  NUCLEOTIDE_AMBIGUITY_DETECTION = Cache
          .getDefault("NUCLEOTIDE_AMBIGUITY_DETECTION", true);
}
71 |
|
|
72 |
|
|
73 |
|
|
74 |
|
|
75 |
|
@param |
76 |
|
|
77 |
|
@param |
78 |
|
|
79 |
|
|
80 |
|
@return |
81 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
82 |
0 |
public static final float compare(SequenceI ii, SequenceI jj)... |
83 |
|
{ |
84 |
0 |
return Comparison.compare(ii, jj, 0, ii.getLength() - 1); |
85 |
|
} |
86 |
|
|
87 |
|
|
88 |
|
|
89 |
|
|
90 |
|
@param |
91 |
|
|
92 |
|
@param |
93 |
|
|
94 |
|
@param |
95 |
|
|
96 |
|
@param |
97 |
|
|
98 |
|
@return |
99 |
|
|
|
|
| 0% |
Uncovered Elements: 34 (34) |
Complexity: 8 |
Complexity Density: 0.4 |
|
100 |
0 |
public static float compare(SequenceI ii, SequenceI jj, int start,... |
101 |
|
int end) |
102 |
|
{ |
103 |
0 |
String si = ii.getSequenceAsString(); |
104 |
0 |
String sj = jj.getSequenceAsString(); |
105 |
|
|
106 |
0 |
int ilen = si.length() - 1; |
107 |
0 |
int jlen = sj.length() - 1; |
108 |
|
|
109 |
0 |
while (Comparison.isGap(si.charAt(start + ilen))) |
110 |
|
{ |
111 |
0 |
ilen--; |
112 |
|
} |
113 |
|
|
114 |
0 |
while (Comparison.isGap(sj.charAt(start + jlen))) |
115 |
|
{ |
116 |
0 |
jlen--; |
117 |
|
} |
118 |
|
|
119 |
0 |
int match = 0; |
120 |
0 |
float pid = -1; |
121 |
|
|
122 |
0 |
if (ilen > jlen) |
123 |
|
{ |
124 |
0 |
for (int j = 0; j < jlen; j++) |
125 |
|
{ |
126 |
0 |
if (si.substring(start + j, start + j + 1) |
127 |
|
.equals(sj.substring(start + j, start + j + 1))) |
128 |
|
{ |
129 |
0 |
match++; |
130 |
|
} |
131 |
|
} |
132 |
|
|
133 |
0 |
pid = (float) match / (float) ilen * 100; |
134 |
|
} |
135 |
|
else |
136 |
|
{ |
137 |
0 |
for (int j = 0; j < jlen; j++) |
138 |
|
{ |
139 |
0 |
if (si.substring(start + j, start + j + 1) |
140 |
|
.equals(sj.substring(start + j, start + j + 1))) |
141 |
|
{ |
142 |
0 |
match++; |
143 |
|
} |
144 |
|
} |
145 |
|
|
146 |
0 |
pid = (float) match / (float) jlen * 100; |
147 |
|
} |
148 |
|
|
149 |
0 |
return pid; |
150 |
|
} |
151 |
|
|
152 |
|
|
153 |
|
|
154 |
|
|
155 |
|
@param |
156 |
|
|
157 |
|
@param |
158 |
|
|
159 |
|
@return |
160 |
|
@deprecated |
161 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
162 |
0 |
@Deprecated... |
163 |
|
public final static float PID(String seq1, String seq2) |
164 |
|
{ |
165 |
0 |
return PID(seq1, seq2, 0, seq1.length()); |
166 |
|
} |
167 |
|
|
168 |
|
static final int caseShift = 'a' - 'A'; |
169 |
|
|
170 |
|
|
171 |
|
|
172 |
|
@deprecated |
173 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
174 |
0 |
@Deprecated... |
175 |
|
public final static float PID(String seq1, String seq2, int start, |
176 |
|
int end) |
177 |
|
{ |
178 |
0 |
return PID(seq1, seq2, start, end, true, false); |
179 |
|
} |
180 |
|
|
181 |
|
|
182 |
|
|
183 |
|
|
184 |
|
|
185 |
|
@param |
186 |
|
@param |
187 |
|
@param |
188 |
|
|
189 |
|
@param |
190 |
|
|
191 |
|
@param |
192 |
|
|
193 |
|
|
194 |
|
@param |
195 |
|
|
196 |
|
@return |
197 |
|
@deprecated |
198 |
|
|
|
|
| 0% |
Uncovered Elements: 49 (49) |
Complexity: 13 |
Complexity Density: 0.45 |
|
199 |
0 |
@Deprecated... |
200 |
|
public final static float PID(String seq1, String seq2, int start, |
201 |
|
int end, boolean wcGaps, boolean ungappedOnly) |
202 |
|
{ |
203 |
0 |
int s1len = seq1.length(); |
204 |
0 |
int s2len = seq2.length(); |
205 |
|
|
206 |
0 |
int len = Math.min(s1len, s2len); |
207 |
|
|
208 |
0 |
if (end < len) |
209 |
|
{ |
210 |
0 |
len = end; |
211 |
|
} |
212 |
|
|
213 |
0 |
if (len < start) |
214 |
|
{ |
215 |
0 |
start = len - 1; |
216 |
|
} |
217 |
|
|
218 |
0 |
int elen = len - start, bad = 0; |
219 |
0 |
char chr1; |
220 |
0 |
char chr2; |
221 |
0 |
boolean agap; |
222 |
0 |
for (int i = start; i < len; i++) |
223 |
|
{ |
224 |
0 |
chr1 = seq1.charAt(i); |
225 |
|
|
226 |
0 |
chr2 = seq2.charAt(i); |
227 |
0 |
agap = isGap(chr1) || isGap(chr2); |
228 |
0 |
if ('a' <= chr1 && chr1 <= 'z') |
229 |
|
{ |
230 |
|
|
231 |
|
|
232 |
0 |
chr1 -= caseShift; |
233 |
|
} |
234 |
0 |
if ('a' <= chr2 && chr2 <= 'z') |
235 |
|
{ |
236 |
|
|
237 |
|
|
238 |
0 |
chr2 -= caseShift; |
239 |
|
} |
240 |
|
|
241 |
0 |
if (chr1 != chr2) |
242 |
|
{ |
243 |
0 |
if (agap) |
244 |
|
{ |
245 |
0 |
if (ungappedOnly) |
246 |
|
{ |
247 |
0 |
elen--; |
248 |
|
} |
249 |
0 |
else if (!wcGaps) |
250 |
|
{ |
251 |
0 |
bad++; |
252 |
|
} |
253 |
|
} |
254 |
|
else |
255 |
|
{ |
256 |
0 |
bad++; |
257 |
|
} |
258 |
|
} |
259 |
|
|
260 |
|
} |
261 |
0 |
if (elen < 1) |
262 |
|
{ |
263 |
0 |
return 0f; |
264 |
|
} |
265 |
0 |
return ((float) 100 * (elen - bad)) / elen; |
266 |
|
} |
267 |
|
|
268 |
|
|
269 |
|
|
270 |
|
|
271 |
|
|
272 |
|
|
273 |
|
@param |
274 |
|
|
275 |
|
@return |
276 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
277 |
0 |
public static final boolean isGap(char c)... |
278 |
|
{ |
279 |
0 |
return c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE; |
280 |
|
} |
281 |
|
|
282 |
|
|
283 |
|
|
284 |
|
|
285 |
|
|
286 |
|
@param |
287 |
|
@return |
288 |
|
|
|
|
| 0% |
Uncovered Elements: 58 (58) |
Complexity: 16 |
Complexity Density: 0.47 |
|
289 |
0 |
public static final boolean isNucleotide(SequenceI seq)... |
290 |
|
{ |
291 |
0 |
if (seq == null || seq.getLength() == 0) |
292 |
|
{ |
293 |
0 |
return false; |
294 |
|
} |
295 |
0 |
long ntCount = 0; |
296 |
0 |
long aaCount = 0; |
297 |
|
|
298 |
0 |
long nCount = 0; |
299 |
0 |
long xCount = 0; |
300 |
0 |
long ntaCount = 0; |
301 |
|
|
302 |
0 |
int len = seq.getLength(); |
303 |
0 |
for (int i = 0; i < len; i++) |
304 |
|
{ |
305 |
0 |
char c = seq.getCharAt(i); |
306 |
0 |
if (isNucleotide(c)) |
307 |
|
{ |
308 |
0 |
ntCount++; |
309 |
|
} |
310 |
0 |
else if (!isGap(c)) |
311 |
|
{ |
312 |
0 |
aaCount++; |
313 |
0 |
if (isN(c)) |
314 |
|
{ |
315 |
0 |
nCount++; |
316 |
|
} |
317 |
|
else |
318 |
|
{ |
319 |
0 |
if (isX(c)) |
320 |
|
{ |
321 |
0 |
xCount++; |
322 |
|
} |
323 |
0 |
if (isNucleotideAmbiguity(c)) |
324 |
|
{ |
325 |
0 |
ntaCount++; |
326 |
|
} |
327 |
|
} |
328 |
|
} |
329 |
|
} |
330 |
0 |
long allCount = ntCount + aaCount; |
331 |
|
|
332 |
0 |
if (NUCLEOTIDE_AMBIGUITY_DETECTION) |
333 |
|
{ |
334 |
0 |
Console.debug("Performing new nucleotide detection routine"); |
335 |
0 |
if (allCount > NUCLEOTIDE_COUNT_SHORT_SEQUENCE) |
336 |
|
{ |
337 |
|
|
338 |
|
|
339 |
|
|
340 |
0 |
return ntCount * 100 >= NUCLEOTIDE_COUNT_PERCENT * allCount |
341 |
|
&& 100 * (ntCount + nCount |
342 |
|
+ ntaCount) >= NUCLEOTIDE_COUNT_LONG_SEQUENCE_AMBIGUITY_PERCENT |
343 |
|
* allCount; |
344 |
|
} |
345 |
0 |
else if (allCount > NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) |
346 |
|
{ |
347 |
|
|
348 |
|
|
349 |
|
|
350 |
0 |
if (ntCount * 100 >= NUCLEOTIDE_COUNT_PERCENT * allCount |
351 |
|
&& (nCount == aaCount || xCount == aaCount)) |
352 |
|
{ |
353 |
0 |
return true; |
354 |
|
} |
355 |
|
|
356 |
|
|
357 |
|
|
358 |
|
|
359 |
|
|
360 |
|
|
361 |
0 |
return myShortSequenceNucleotideProportionCount(ntCount, allCount) |
362 |
|
&& nCount + ntaCount == aaCount; |
363 |
|
} |
364 |
|
else |
365 |
|
{ |
366 |
|
|
367 |
|
|
368 |
0 |
return ntCount > 0 && ntCount == allCount; |
369 |
|
} |
370 |
|
} |
371 |
|
else |
372 |
|
{ |
373 |
0 |
Console.debug("Performing old nucleotide detection routine"); |
374 |
|
|
375 |
|
|
376 |
|
|
377 |
|
|
378 |
0 |
if ((ntCount + nCount) * 100 > EIGHTY_FIVE * allCount) |
379 |
|
{ |
380 |
0 |
return ntCount > 0; |
381 |
|
|
382 |
|
} |
383 |
|
} |
384 |
0 |
return false; |
385 |
|
} |
386 |
|
|
|
|
| 0% |
Uncovered Elements: 3 (3) |
Complexity: 1 |
Complexity Density: 0.33 |
|
387 |
0 |
protected static boolean myShortSequenceNucleotideProportionCount(... |
388 |
|
long ntCount, long allCount) |
389 |
|
{ |
390 |
|
|
391 |
|
|
392 |
|
|
393 |
|
|
394 |
|
|
395 |
|
|
396 |
|
|
397 |
|
|
398 |
|
|
399 |
|
|
400 |
|
|
401 |
|
|
402 |
|
|
403 |
|
|
404 |
|
|
405 |
|
|
406 |
|
|
407 |
|
|
408 |
|
|
409 |
|
|
410 |
|
|
411 |
|
|
412 |
|
|
413 |
|
|
414 |
|
|
415 |
|
|
416 |
0 |
long LHS = 100 * allCount |
417 |
|
* (NUCLEOTIDE_COUNT_SHORT_SEQUENCE |
418 |
|
- NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) |
419 |
|
* (ntCount - allCount + 1); |
420 |
0 |
long RHS = allCount * (allCount - NUCLEOTIDE_COUNT_VERY_SHORT_SEQUENCE) |
421 |
|
* (allCount * NUCLEOTIDE_COUNT_PERCENT - 100 * allCount + 100); |
422 |
0 |
return LHS >= RHS; |
423 |
|
} |
424 |
|
|
425 |
|
|
426 |
|
|
427 |
|
|
428 |
|
|
429 |
|
|
430 |
|
@param |
431 |
|
@return |
432 |
|
|
|
|
| 0% |
Uncovered Elements: 16 (16) |
Complexity: 4 |
Complexity Density: 0.4 |
|
433 |
0 |
public static final boolean isNucleotide(SequenceI[] seqs)... |
434 |
|
{ |
435 |
0 |
if (seqs == null) |
436 |
|
{ |
437 |
0 |
return false; |
438 |
|
} |
439 |
|
|
440 |
0 |
boolean na = false; |
441 |
0 |
for (SequenceI seq : seqs) |
442 |
|
{ |
443 |
0 |
if (seq == null) |
444 |
|
{ |
445 |
0 |
continue; |
446 |
|
} |
447 |
0 |
na = true; |
448 |
|
|
449 |
|
|
450 |
0 |
if (seq.isProtein()) |
451 |
|
{ |
452 |
|
|
453 |
0 |
return false; |
454 |
|
} |
455 |
|
} |
456 |
0 |
return na; |
457 |
|
} |
458 |
|
|
459 |
|
|
460 |
|
|
461 |
|
|
462 |
|
@param |
463 |
|
@return |
464 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
465 |
0 |
public static boolean isNucleotide(char c)... |
466 |
|
{ |
467 |
0 |
return isNucleotide(c, false); |
468 |
|
} |
469 |
|
|
470 |
|
|
471 |
|
|
472 |
|
|
|
|
| 0% |
Uncovered Elements: 17 (17) |
Complexity: 8 |
Complexity Density: 0.62 |
|
473 |
0 |
public static boolean isNucleotide(char c, boolean includeAmbiguity)... |
474 |
|
{ |
475 |
0 |
char C = Character.toUpperCase(c); |
476 |
0 |
switch (C) |
477 |
|
{ |
478 |
0 |
case 'A': |
479 |
0 |
case 'C': |
480 |
0 |
case 'G': |
481 |
0 |
case 'T': |
482 |
0 |
case 'U': |
483 |
0 |
return true; |
484 |
|
} |
485 |
0 |
if (includeAmbiguity) |
486 |
|
{ |
487 |
0 |
boolean ambiguity = isNucleotideAmbiguity(C); |
488 |
0 |
if (ambiguity) |
489 |
0 |
return true; |
490 |
|
} |
491 |
0 |
return false; |
492 |
|
} |
493 |
|
|
494 |
|
|
495 |
|
|
496 |
|
|
|
|
| 0% |
Uncovered Elements: 16 (16) |
Complexity: 14 |
Complexity Density: 0.88 |
|
497 |
0 |
public static boolean isNucleotideAmbiguity(char c)... |
498 |
|
{ |
499 |
0 |
switch (Character.toUpperCase(c)) |
500 |
|
{ |
501 |
0 |
case 'I': |
502 |
0 |
case 'X': |
503 |
0 |
case 'R': |
504 |
0 |
case 'Y': |
505 |
0 |
case 'W': |
506 |
0 |
case 'S': |
507 |
0 |
case 'M': |
508 |
0 |
case 'K': |
509 |
0 |
case 'B': |
510 |
0 |
case 'H': |
511 |
0 |
case 'D': |
512 |
0 |
case 'V': |
513 |
0 |
return true; |
514 |
0 |
case 'N': |
515 |
|
} |
516 |
0 |
return false; |
517 |
|
} |
518 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
519 |
0 |
public static boolean isN(char c)... |
520 |
|
{ |
521 |
0 |
return 'n' == Character.toLowerCase(c); |
522 |
|
} |
523 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
524 |
0 |
public static boolean isX(char c)... |
525 |
|
{ |
526 |
0 |
return 'x' == Character.toLowerCase(c); |
527 |
|
} |
528 |
|
|
529 |
|
|
530 |
|
|
531 |
|
|
532 |
|
|
533 |
|
@param |
534 |
|
@param |
535 |
|
@return |
536 |
|
|
|
|
| 0% |
Uncovered Elements: 1 (1) |
Complexity: 1 |
Complexity Density: 1 |
|
537 |
0 |
public static boolean isNucleotideSequence(String s, boolean allowGaps)... |
538 |
|
{ |
539 |
0 |
return isNucleotideSequence(s, allowGaps, false); |
540 |
|
} |
541 |
|
|
|
|
| 0% |
Uncovered Elements: 16 (16) |
Complexity: 6 |
Complexity Density: 0.75 |
|
542 |
0 |
public static boolean isNucleotideSequence(String s, boolean allowGaps,... |
543 |
|
boolean includeAmbiguous) |
544 |
|
{ |
545 |
0 |
if (s == null) |
546 |
|
{ |
547 |
0 |
return false; |
548 |
|
} |
549 |
0 |
for (int i = 0; i < s.length(); i++) |
550 |
|
{ |
551 |
0 |
char c = s.charAt(i); |
552 |
0 |
if (!isNucleotide(c, includeAmbiguous)) |
553 |
|
{ |
554 |
0 |
if (!allowGaps || !isGap(c)) |
555 |
|
{ |
556 |
0 |
return false; |
557 |
|
} |
558 |
|
} |
559 |
|
} |
560 |
0 |
return true; |
561 |
|
} |
562 |
|
|
563 |
|
|
564 |
|
|
565 |
|
|
566 |
|
@param |
567 |
|
@return |
568 |
|
|
|
|
| 0% |
Uncovered Elements: 10 (10) |
Complexity: 2 |
Complexity Density: 0.25 |
|
569 |
0 |
public static boolean isNucleotide(SequenceI[][] seqs)... |
570 |
|
{ |
571 |
0 |
if (seqs == null) |
572 |
|
{ |
573 |
0 |
return false; |
574 |
|
} |
575 |
0 |
List<SequenceI> flattened = new ArrayList<SequenceI>(); |
576 |
0 |
for (SequenceI[] ss : seqs) |
577 |
|
{ |
578 |
0 |
for (SequenceI s : ss) |
579 |
|
{ |
580 |
0 |
flattened.add(s); |
581 |
|
} |
582 |
|
} |
583 |
0 |
final SequenceI[] oneDArray = flattened |
584 |
|
.toArray(new SequenceI[flattened.size()]); |
585 |
0 |
return isNucleotide(oneDArray); |
586 |
|
} |
587 |
|
|
588 |
|
|
589 |
|
|
590 |
|
|
591 |
|
|
592 |
|
@param |
593 |
|
|
594 |
|
@param |
595 |
|
|
596 |
|
@param |
597 |
|
|
598 |
|
@return |
599 |
|
|
|
|
| 0% |
Uncovered Elements: 3 (3) |
Complexity: 2 |
Complexity Density: 2 |
|
600 |
0 |
public static boolean isSameResidue(char c1, char c2,... |
601 |
|
boolean caseSensitive) |
602 |
|
{ |
603 |
0 |
return caseSensitive ? c1 == c2 |
604 |
|
: Character.toUpperCase(c1) == Character.toUpperCase(c2); |
605 |
|
} |
606 |
|
} |