Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
AlignSeq | 54 | 512 | 179 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import jalview.analysis.scoremodels.PIDModel; | |
24 | import jalview.analysis.scoremodels.ScoreMatrix; | |
25 | import jalview.analysis.scoremodels.ScoreModels; | |
26 | import jalview.analysis.scoremodels.SimilarityParams; | |
27 | import jalview.api.analysis.SimilarityParamsI; | |
28 | import jalview.datamodel.AlignmentAnnotation; | |
29 | import jalview.datamodel.AlignmentI; | |
30 | import jalview.datamodel.Mapping; | |
31 | import jalview.datamodel.Sequence; | |
32 | import jalview.datamodel.SequenceI; | |
33 | import jalview.math.MiscMath; | |
34 | import jalview.util.Comparison; | |
35 | import jalview.util.Format; | |
36 | import jalview.util.MapList; | |
37 | import jalview.util.MessageManager; | |
38 | ||
39 | import java.awt.Color; | |
40 | import java.awt.Graphics; | |
41 | import java.io.PrintStream; | |
42 | import java.lang.IllegalArgumentException; | |
43 | import java.util.ArrayList; | |
44 | import java.util.Arrays; | |
45 | import java.util.HashMap; | |
46 | import java.util.List; | |
47 | import java.util.StringTokenizer; | |
48 | import java.util.Locale; | |
49 | ||
50 | /** | |
51 | * @author $author$ | |
52 | * @version $Revision$ | |
53 | */ | |
54 | public class AlignSeq | |
55 | { | |
56 | private static final int MAX_NAME_LENGTH = 30; | |
57 | ||
58 | private static final int DEFAULT_OPENCOST = 120; | |
59 | ||
60 | private static final int DEFAULT_EXTENDCOST = 20; | |
61 | ||
62 | private int GAP_OPEN_COST = DEFAULT_OPENCOST; | |
63 | ||
64 | private int GAP_EXTEND_COST = DEFAULT_EXTENDCOST; | |
65 | ||
66 | private static final int GAP_INDEX = -1; | |
67 | ||
68 | public static final String PEP = "pep"; | |
69 | ||
70 | public static final String DNA = "dna"; | |
71 | ||
72 | private static final String NEWLINE = System.lineSeparator(); | |
73 | ||
74 | float[][] score; | |
75 | ||
76 | float alignmentScore; | |
77 | ||
78 | float[][] E; | |
79 | ||
80 | float[][] F; | |
81 | ||
82 | int[][] traceback; // todo is this actually used? | |
83 | ||
84 | int[] seq1; | |
85 | ||
86 | int[] seq2; | |
87 | ||
88 | SequenceI s1; | |
89 | ||
90 | SequenceI s2; | |
91 | ||
92 | public String s1str; | |
93 | ||
94 | public String s2str; | |
95 | ||
96 | int maxi; | |
97 | ||
98 | int maxj; | |
99 | ||
100 | int[] aseq1; | |
101 | ||
102 | int[] aseq2; | |
103 | ||
104 | /* | |
105 | * matches in alignment | |
106 | */ | |
107 | int match = -1; | |
108 | ||
109 | public String astr1 = ""; | |
110 | ||
111 | public String astr2 = ""; | |
112 | ||
113 | public String indelfreeAstr1 = ""; | |
114 | ||
115 | public String indelfreeAstr2 = ""; | |
116 | ||
117 | /** DOCUMENT ME!! */ | |
118 | public int seq1start; | |
119 | ||
120 | /** DOCUMENT ME!! */ | |
121 | public int seq1end; | |
122 | ||
123 | /** DOCUMENT ME!! */ | |
124 | public int seq2start; | |
125 | ||
126 | public int seq2end; | |
127 | ||
128 | int count; | |
129 | ||
130 | public float maxscore; | |
131 | ||
132 | public float meanScore; // needed for PaSiMap | |
133 | ||
134 | public int hypotheticMaxScore; // needed for PaSiMap | |
135 | ||
136 | int prev = 0; | |
137 | ||
138 | StringBuffer output = new StringBuffer(); | |
139 | ||
140 | String type; // AlignSeq.PEP or AlignSeq.DNA | |
141 | ||
142 | private ScoreMatrix scoreMatrix; | |
143 | ||
144 | /** | |
145 | * Creates a new AlignSeq object. | |
146 | * | |
147 | * @param s1 | |
148 | * first sequence for alignment | |
149 | * @param s2 | |
150 | * second sequence for alignment | |
151 | * @param type | |
152 | * molecule type, either AlignSeq.PEP or AlignSeq.DNA | |
153 | */ | |
154 | 0 | public AlignSeq(int opencost, int extcost) |
155 | { | |
156 | 0 | GAP_OPEN_COST = opencost; |
157 | 0 | GAP_EXTEND_COST = extcost; |
158 | } | |
159 | ||
160 | 267 | public AlignSeq(SequenceI s1, SequenceI s2, String type) |
161 | { | |
162 | 267 | seqInit(s1, s1.getSequenceAsString(), s2, s2.getSequenceAsString(), |
163 | type); | |
164 | } | |
165 | ||
166 | /** | |
167 | * Creates a new AlignSeq object. | |
168 | * | |
169 | * @param s1,string1 | |
170 | * s1 reference sequence for string1 | |
171 | * @param s2,string2 | |
172 | * s2 reference sequence for string2 | |
173 | * @param type | |
174 | * molecule type, either AlignSeq.PEP or AlignSeq.DNA | |
175 | */ | |
176 | 2 | public AlignSeq(SequenceI s1, String string1, SequenceI s2, |
177 | String string2, String type) | |
178 | { | |
179 | 2 | seqInit(s1, string1.toUpperCase(Locale.ROOT), s2, |
180 | string2.toUpperCase(Locale.ROOT), type); | |
181 | } | |
182 | ||
183 | 265 | public AlignSeq(SequenceI s1, SequenceI s2, String type, int opencost, |
184 | int extcost) | |
185 | { | |
186 | 265 | this(s1, s2, type); |
187 | 265 | GAP_OPEN_COST = opencost; |
188 | 265 | GAP_EXTEND_COST = extcost; |
189 | } | |
190 | ||
191 | 2 | public AlignSeq(SequenceI s12, String string1, SequenceI s22, |
192 | String string2, String type2, int defaultOpencost, | |
193 | int defaultExtendcost) | |
194 | { | |
195 | 2 | this(s12, string1, s22, string2, type2); |
196 | 2 | GAP_OPEN_COST = defaultOpencost; |
197 | 2 | GAP_EXTEND_COST = defaultExtendcost; |
198 | } | |
199 | ||
200 | /** | |
201 | * DOCUMENT ME! | |
202 | * | |
203 | * @return DOCUMENT ME! | |
204 | */ | |
205 | 32 | public float getMaxScore() |
206 | { | |
207 | 32 | return maxscore; |
208 | } | |
209 | ||
210 | /** | |
211 | * returns the overall score of the alignment | |
212 | * | |
213 | * @return | |
214 | */ | |
215 | 2 | public float getAlignmentScore() |
216 | { | |
217 | 2 | return alignmentScore; |
218 | } | |
219 | ||
220 | /** | |
221 | * DOCUMENT ME! | |
222 | * | |
223 | * @return DOCUMENT ME! | |
224 | */ | |
225 | 617 | public int getSeq2Start() |
226 | { | |
227 | 617 | return seq2start; |
228 | } | |
229 | ||
230 | /** | |
231 | * DOCUMENT ME! | |
232 | * | |
233 | * @return DOCUMENT ME! | |
234 | */ | |
235 | 199 | public int getSeq2End() |
236 | { | |
237 | 199 | return seq2end; |
238 | } | |
239 | ||
240 | /** | |
241 | * DOCUMENT ME! | |
242 | * | |
243 | * @return DOCUMENT ME! | |
244 | */ | |
245 | 617 | public int getSeq1Start() |
246 | { | |
247 | 617 | return seq1start; |
248 | } | |
249 | ||
250 | /** | |
251 | * DOCUMENT ME! | |
252 | * | |
253 | * @return DOCUMENT ME! | |
254 | */ | |
255 | 199 | public int getSeq1End() |
256 | { | |
257 | 199 | return seq1end; |
258 | } | |
259 | ||
260 | /** | |
261 | * DOCUMENT ME! | |
262 | * | |
263 | * @return DOCUMENT ME! | |
264 | */ | |
265 | 4 | public String getOutput() |
266 | { | |
267 | 4 | return output.toString(); |
268 | } | |
269 | ||
270 | /** | |
271 | * DOCUMENT ME! | |
272 | * | |
273 | * @return DOCUMENT ME! | |
274 | */ | |
275 | 206 | public String getAStr1() |
276 | { | |
277 | 206 | return astr1; |
278 | } | |
279 | ||
280 | /** | |
281 | * DOCUMENT ME! | |
282 | * | |
283 | * @return DOCUMENT ME! | |
284 | */ | |
285 | 205 | public String getAStr2() |
286 | { | |
287 | 205 | return astr2; |
288 | } | |
289 | ||
290 | /** | |
291 | * DOCUMENT ME! | |
292 | * | |
293 | * @return DOCUMENT ME! | |
294 | */ | |
295 | 2 | public int[] getASeq1() |
296 | { | |
297 | 2 | return aseq1; |
298 | } | |
299 | ||
300 | /** | |
301 | * DOCUMENT ME! | |
302 | * | |
303 | * @return DOCUMENT ME! | |
304 | */ | |
305 | 0 | public int[] getASeq2() |
306 | { | |
307 | 0 | return aseq2; |
308 | } | |
309 | ||
310 | /** | |
311 | * | |
312 | * @return aligned instance of Seq 1 | |
313 | */ | |
314 | 198 | public SequenceI getAlignedSeq1() |
315 | { | |
316 | 198 | SequenceI alSeq1 = new Sequence(s1.getName(), getAStr1()); |
317 | 198 | alSeq1.setStart(s1.getStart() + getSeq1Start() - 1); |
318 | 198 | alSeq1.setEnd(s1.getStart() + getSeq1End() - 1); |
319 | 198 | alSeq1.setDatasetSequence( |
320 | 198 | s1.getDatasetSequence() == null ? s1 : s1.getDatasetSequence()); |
321 | 198 | return alSeq1; |
322 | } | |
323 | ||
324 | /** | |
325 | * | |
326 | * @return aligned instance of Seq 2 | |
327 | */ | |
328 | 198 | public SequenceI getAlignedSeq2() |
329 | { | |
330 | 198 | SequenceI alSeq2 = new Sequence(s2.getName(), getAStr2()); |
331 | 198 | alSeq2.setStart(s2.getStart() + getSeq2Start() - 1); |
332 | 198 | alSeq2.setEnd(s2.getStart() + getSeq2End() - 1); |
333 | 198 | alSeq2.setDatasetSequence( |
334 | 198 | s2.getDatasetSequence() == null ? s2 : s2.getDatasetSequence()); |
335 | 198 | return alSeq2; |
336 | } | |
337 | ||
338 | /** | |
339 | * fraction of seq2 matched in the alignment | |
340 | * | |
341 | * @return NaN or [0..1] | |
342 | */ | |
343 | 0 | public double getS2Coverage() |
344 | { | |
345 | 0 | if (match >= 0) |
346 | { | |
347 | 0 | return ((double) match) / ((double) s2.getEnd() - s2.getStart() + 1); |
348 | } | |
349 | 0 | return Double.NaN; |
350 | } | |
351 | ||
352 | /** | |
353 | * fraction of seq1 matched in the alignment | |
354 | * | |
355 | * @return NaN or [0..1] | |
356 | */ | |
357 | 0 | public double getS1Coverage() |
358 | { | |
359 | 0 | if (match >= 0) |
360 | { | |
361 | 0 | return ((double) match) / ((double) s1.getEnd() - s1.getStart() + 1); |
362 | } | |
363 | 0 | return Double.NaN; |
364 | } | |
365 | ||
366 | /** | |
367 | * Construct score matrix for sequences with standard DNA or PEPTIDE matrix | |
368 | * | |
369 | * @param s1 | |
370 | * - sequence 1 | |
371 | * @param string1 | |
372 | * - string to use for s1 | |
373 | * @param s2 | |
374 | * - sequence 2 | |
375 | * @param string2 | |
376 | * - string to use for s2 | |
377 | * @param type | |
378 | * DNA or PEPTIDE | |
379 | */ | |
380 | 269 | public void seqInit(SequenceI s1, String string1, SequenceI s2, |
381 | String string2, String type) | |
382 | { | |
383 | 269 | seqInit(s1, string1, s2, string2, type, GAP_OPEN_COST, GAP_EXTEND_COST); |
384 | } | |
385 | ||
386 | 269 | public void seqInit(SequenceI s1, String string1, SequenceI s2, |
387 | String string2, String type, int opening, int extension) | |
388 | { | |
389 | 269 | GAP_OPEN_COST = opening; |
390 | 269 | GAP_EXTEND_COST = extension; |
391 | 269 | this.s1 = s1; |
392 | 269 | this.s2 = s2; |
393 | 269 | setDefaultParams(type); |
394 | 269 | seqInit(string1, string2); |
395 | } | |
396 | ||
397 | /** | |
398 | * construct score matrix for string1 and string2 (after removing any existing | |
399 | * gaps | |
400 | * | |
401 | * @param string1 | |
402 | * @param string2 | |
403 | */ | |
404 | 269 | private void seqInit(String string1, String string2) |
405 | { | |
406 | 269 | s1str = extractGaps(jalview.util.Comparison.GapChars, string1); |
407 | 269 | s2str = extractGaps(jalview.util.Comparison.GapChars, string2); |
408 | ||
409 | 269 | if (s1str.length() == 0 || s2str.length() == 0) |
410 | { | |
411 | 0 | output.append( |
412 | 0 | "ALL GAPS: " + (s1str.length() == 0 ? s1.getName() : " ") |
413 | 0 | + (s2str.length() == 0 ? s2.getName() : "")); |
414 | 0 | return; |
415 | } | |
416 | ||
417 | 269 | score = new float[s1str.length()][s2str.length()]; |
418 | ||
419 | 269 | E = new float[s1str.length()][s2str.length()]; |
420 | ||
421 | 269 | F = new float[s1str.length()][s2str.length()]; |
422 | 269 | traceback = new int[s1str.length()][s2str.length()]; |
423 | ||
424 | 269 | seq1 = indexEncode(s1str); |
425 | ||
426 | 269 | seq2 = indexEncode(s2str); |
427 | } | |
428 | ||
429 | 269 | private void setDefaultParams(String moleculeType) |
430 | { | |
431 | 269 | if (!PEP.equals(moleculeType) && !DNA.equals(moleculeType)) |
432 | { | |
433 | 0 | output.append("Wrong type = dna or pep only"); |
434 | 0 | throw new Error(MessageManager |
435 | .formatMessage("error.unknown_type_dna_or_pep", new String[] | |
436 | { moleculeType })); | |
437 | } | |
438 | ||
439 | 269 | type = moleculeType; |
440 | 269 | scoreMatrix = ScoreModels.getInstance() |
441 | .getDefaultModel(PEP.equals(type)); | |
442 | } | |
443 | ||
444 | /** | |
445 | * DOCUMENT ME! | |
446 | */ | |
447 | 267 | public void traceAlignment() |
448 | { | |
449 | // Find the maximum score along the rhs or bottom row | |
450 | 267 | float max = -Float.MAX_VALUE; |
451 | ||
452 | 37023 | for (int i = 0; i < seq1.length; i++) |
453 | { | |
454 | 36756 | if (score[i][seq2.length - 1] > max) |
455 | { | |
456 | 21198 | max = score[i][seq2.length - 1]; |
457 | 21198 | maxi = i; |
458 | 21198 | maxj = seq2.length - 1; |
459 | } | |
460 | } | |
461 | ||
462 | 34394 | for (int j = 0; j < seq2.length; j++) |
463 | { | |
464 | 34127 | if (score[seq1.length - 1][j] > max) |
465 | { | |
466 | 23 | max = score[seq1.length - 1][j]; |
467 | 23 | maxi = seq1.length - 1; |
468 | 23 | maxj = j; |
469 | } | |
470 | } | |
471 | ||
472 | 267 | int i = maxi; |
473 | 267 | int j = maxj; |
474 | 267 | int trace; |
475 | 267 | maxscore = score[i][j] / 10f; |
476 | ||
477 | 267 | seq1end = maxi + 1; |
478 | 267 | seq2end = maxj + 1; |
479 | ||
480 | 267 | aseq1 = new int[seq1.length + seq2.length]; |
481 | 267 | aseq2 = new int[seq1.length + seq2.length]; |
482 | 267 | match = 0; |
483 | 267 | StringBuilder sb1 = new StringBuilder(aseq1.length); |
484 | 267 | StringBuilder sb2 = new StringBuilder(aseq2.length); |
485 | ||
486 | 267 | count = (seq1.length + seq2.length) - 1; |
487 | ||
488 | 29206 | while (i > 0 && j > 0) |
489 | { | |
490 | 28939 | aseq1[count] = seq1[i]; |
491 | 28939 | sb1.append(s1str.charAt(i)); |
492 | 28939 | aseq2[count] = seq2[j]; |
493 | 28939 | sb2.append(s2str.charAt(j)); |
494 | 28939 | trace = findTrace(i, j); |
495 | ||
496 | 28939 | if (trace == 0) |
497 | { | |
498 | 28936 | match++; |
499 | 28936 | i--; |
500 | 28936 | j--; |
501 | } | |
502 | 3 | else if (trace == 1) |
503 | { | |
504 | 0 | j--; |
505 | 0 | aseq1[count] = GAP_INDEX; |
506 | 0 | sb1.replace(sb1.length() - 1, sb1.length(), "-"); |
507 | } | |
508 | 3 | else if (trace == -1) |
509 | { | |
510 | 3 | i--; |
511 | 3 | aseq2[count] = GAP_INDEX; |
512 | 3 | sb2.replace(sb2.length() - 1, sb2.length(), "-"); |
513 | } | |
514 | ||
515 | 28939 | count--; |
516 | } | |
517 | ||
518 | 267 | seq1start = i + 1; |
519 | 267 | seq2start = j + 1; |
520 | ||
521 | 267 | if (aseq1[count] != GAP_INDEX) |
522 | { | |
523 | 267 | aseq1[count] = seq1[i]; |
524 | 267 | sb1.append(s1str.charAt(i)); |
525 | } | |
526 | ||
527 | 267 | if (aseq2[count] != GAP_INDEX) |
528 | { | |
529 | 267 | aseq2[count] = seq2[j]; |
530 | 267 | sb2.append(s2str.charAt(j)); |
531 | 267 | if (aseq1[count] != GAP_INDEX) |
532 | { | |
533 | 267 | match++; |
534 | } | |
535 | } | |
536 | ||
537 | /* | |
538 | * we built the character strings backwards, so now | |
539 | * reverse them to convert to sequence strings | |
540 | */ | |
541 | 267 | astr1 = sb1.reverse().toString(); |
542 | 267 | astr2 = sb2.reverse().toString(); |
543 | } | |
544 | ||
545 | /** | |
546 | * DOCUMENT ME! | |
547 | */ | |
548 | 0 | public void traceAlignmentWithEndGaps() |
549 | { | |
550 | // Find the maximum score along the rhs or bottom row | |
551 | 0 | float max = -Float.MAX_VALUE; |
552 | ||
553 | 0 | for (int i = 0; i < seq1.length; i++) |
554 | { | |
555 | 0 | if (score[i][seq2.length - 1] > max) |
556 | { | |
557 | 0 | max = score[i][seq2.length - 1]; |
558 | 0 | maxi = i; |
559 | 0 | maxj = seq2.length - 1; |
560 | } | |
561 | } | |
562 | ||
563 | 0 | for (int j = 0; j < seq2.length; j++) |
564 | { | |
565 | 0 | if (score[seq1.length - 1][j] > max) |
566 | { | |
567 | 0 | max = score[seq1.length - 1][j]; |
568 | 0 | maxi = seq1.length - 1; |
569 | 0 | maxj = j; |
570 | } | |
571 | } | |
572 | ||
573 | 0 | int i = maxi; |
574 | 0 | int j = maxj; |
575 | 0 | int trace; |
576 | 0 | maxscore = score[i][j] / 10f; |
577 | ||
578 | // prepare trailing gaps | |
579 | 0 | while ((i < seq1.length - 1) || (j < seq2.length - 1)) |
580 | { | |
581 | 0 | i++; |
582 | 0 | j++; |
583 | } | |
584 | 0 | seq1end = i + 1; |
585 | 0 | seq2end = j + 1; |
586 | ||
587 | 0 | aseq1 = new int[seq1.length + seq2.length]; |
588 | 0 | aseq2 = new int[seq1.length + seq2.length]; |
589 | ||
590 | 0 | StringBuilder sb1 = new StringBuilder(aseq1.length); |
591 | 0 | StringBuilder sb2 = new StringBuilder(aseq2.length); |
592 | ||
593 | 0 | count = (seq1.length + seq2.length) - 1; |
594 | ||
595 | // get trailing gaps | |
596 | 0 | while ((i >= seq1.length) || (j >= seq2.length)) |
597 | { | |
598 | 0 | if (i >= seq1.length) |
599 | { | |
600 | 0 | aseq1[count] = GAP_INDEX; |
601 | 0 | sb1.append("-"); |
602 | 0 | aseq2[count] = seq2[j]; |
603 | 0 | sb2.append(s2str.charAt(j)); |
604 | } | |
605 | 0 | else if (j >= seq2.length) |
606 | { | |
607 | 0 | aseq1[count] = seq1[i]; |
608 | 0 | sb1.append(s1str.charAt(i)); |
609 | 0 | aseq2[count] = GAP_INDEX; |
610 | 0 | sb2.append("-"); |
611 | } | |
612 | 0 | i--; |
613 | 0 | j--; |
614 | } | |
615 | ||
616 | 0 | while (i > 0 && j > 0) |
617 | { | |
618 | 0 | aseq1[count] = seq1[i]; |
619 | 0 | sb1.append(s1str.charAt(i)); |
620 | 0 | aseq2[count] = seq2[j]; |
621 | 0 | sb2.append(s2str.charAt(j)); |
622 | ||
623 | 0 | trace = findTrace(i, j); |
624 | ||
625 | 0 | if (trace == 0) |
626 | { | |
627 | 0 | i--; |
628 | 0 | j--; |
629 | } | |
630 | 0 | else if (trace == 1) |
631 | { | |
632 | 0 | j--; |
633 | 0 | aseq1[count] = GAP_INDEX; |
634 | 0 | sb1.replace(sb1.length() - 1, sb1.length(), "-"); |
635 | } | |
636 | 0 | else if (trace == -1) |
637 | { | |
638 | 0 | i--; |
639 | 0 | aseq2[count] = GAP_INDEX; |
640 | 0 | sb2.replace(sb2.length() - 1, sb2.length(), "-"); |
641 | } | |
642 | ||
643 | 0 | count--; |
644 | } | |
645 | ||
646 | 0 | seq1start = i + 1; |
647 | 0 | seq2start = j + 1; |
648 | ||
649 | 0 | aseq1[count] = seq1[i]; |
650 | 0 | sb1.append(s1str.charAt(i)); |
651 | 0 | aseq2[count] = seq2[j]; |
652 | 0 | sb2.append(s2str.charAt(j)); |
653 | ||
654 | // get initial gaps | |
655 | 0 | while (j > 0 || i > 0) |
656 | { | |
657 | 0 | if (j > 0) |
658 | { | |
659 | 0 | j--; |
660 | 0 | sb1.append("-"); |
661 | 0 | sb2.append(s2str.charAt(j)); |
662 | } | |
663 | 0 | else if (i > 0) |
664 | { | |
665 | 0 | i--; |
666 | 0 | sb1.append(s1str.charAt(i)); |
667 | 0 | sb2.append("-"); |
668 | } | |
669 | } | |
670 | ||
671 | /* | |
672 | * we built the character strings backwards, so now | |
673 | * reverse them to convert to sequence strings | |
674 | */ | |
675 | 0 | astr1 = sb1.reverse().toString(); |
676 | 0 | astr2 = sb2.reverse().toString(); |
677 | } | |
678 | ||
679 | /** | |
680 | * DOCUMENT ME! | |
681 | */ | |
682 | 196 | public void printAlignment(PrintStream os) |
683 | { | |
684 | // TODO: Use original sequence characters rather than re-translated | |
685 | // characters in output | |
686 | // Find the biggest id length for formatting purposes | |
687 | 196 | String s1id = getAlignedSeq1().getDisplayId(true); |
688 | 196 | String s2id = getAlignedSeq2().getDisplayId(true); |
689 | 196 | int nameLength = Math.max(s1id.length(), s2id.length()); |
690 | 196 | if (nameLength > MAX_NAME_LENGTH) |
691 | { | |
692 | 39 | int truncateBy = nameLength - MAX_NAME_LENGTH; |
693 | 39 | nameLength = MAX_NAME_LENGTH; |
694 | // JAL-527 - truncate the sequence ids | |
695 | 39 | if (s1id.length() > nameLength) |
696 | { | |
697 | 0 | int slashPos = s1id.lastIndexOf('/'); |
698 | 0 | s1id = s1id.substring(0, slashPos - truncateBy) |
699 | + s1id.substring(slashPos); | |
700 | } | |
701 | 39 | if (s2id.length() > nameLength) |
702 | { | |
703 | 39 | int slashPos = s2id.lastIndexOf('/'); |
704 | 39 | s2id = s2id.substring(0, slashPos - truncateBy) |
705 | + s2id.substring(slashPos); | |
706 | } | |
707 | } | |
708 | 196 | int len = 72 - nameLength - 1; |
709 | 196 | int nochunks = ((aseq1.length - count) / len) |
710 | 196 | + ((aseq1.length - count) % len > 0 ? 1 : 0); |
711 | 196 | float pid = 0f; |
712 | ||
713 | 196 | output.append("Score = ").append(score[maxi][maxj]).append(NEWLINE); |
714 | 196 | output.append("Length of alignment = ") |
715 | .append(String.valueOf(aseq1.length - count)).append(NEWLINE); | |
716 | 196 | output.append("Sequence "); |
717 | 196 | Format nameFormat = new Format("%" + nameLength + "s"); |
718 | 196 | output.append(nameFormat.form(s1id)); |
719 | 196 | output.append(" (Sequence length = ") |
720 | .append(String.valueOf(s1str.length())).append(")") | |
721 | .append(NEWLINE); | |
722 | 196 | output.append("Sequence "); |
723 | 196 | output.append(nameFormat.form(s2id)); |
724 | 196 | output.append(" (Sequence length = ") |
725 | .append(String.valueOf(s2str.length())).append(")") | |
726 | .append(NEWLINE).append(NEWLINE); | |
727 | ||
728 | 196 | ScoreMatrix pam250 = ScoreModels.getInstance().getPam250(); |
729 | ||
730 | 698 | for (int j = 0; j < nochunks; j++) |
731 | { | |
732 | // Print the first aligned sequence | |
733 | 502 | output.append(nameFormat.form(s1id)).append(" "); |
734 | ||
735 | 26439 | for (int i = 0; i < len; i++) |
736 | { | |
737 | 25937 | if ((i + (j * len)) < astr1.length()) |
738 | { | |
739 | 21224 | output.append(astr1.charAt(i + (j * len))); |
740 | } | |
741 | } | |
742 | ||
743 | 502 | output.append(NEWLINE); |
744 | 502 | output.append(nameFormat.form(" ")).append(" "); |
745 | ||
746 | /* | |
747 | * Print out the match symbols: | |
748 | * | for exact match (ignoring case) | |
749 | * . if PAM250 score is positive | |
750 | * else a space | |
751 | */ | |
752 | 26439 | for (int i = 0; i < len; i++) |
753 | { | |
754 | 25937 | if ((i + (j * len)) < astr1.length()) |
755 | { | |
756 | 21224 | char c1 = astr1.charAt(i + (j * len)); |
757 | 21224 | char c2 = astr2.charAt(i + (j * len)); |
758 | 21224 | boolean sameChar = Comparison.isSameResidue(c1, c2, false); |
759 | 21224 | if (sameChar && !Comparison.isGap(c1)) |
760 | { | |
761 | 21172 | pid++; |
762 | 21172 | output.append("|"); |
763 | } | |
764 | 52 | else if (PEP.equals(type)) |
765 | { | |
766 | 52 | if (pam250.getPairwiseScore(c1, c2) > 0) |
767 | { | |
768 | 2 | output.append("."); |
769 | } | |
770 | else | |
771 | { | |
772 | 50 | output.append(" "); |
773 | } | |
774 | } | |
775 | else | |
776 | { | |
777 | 0 | output.append(" "); |
778 | } | |
779 | } | |
780 | } | |
781 | ||
782 | // Now print the second aligned sequence | |
783 | 502 | output = output.append(NEWLINE); |
784 | 502 | output = output.append(nameFormat.form(s2id)).append(" "); |
785 | ||
786 | 26439 | for (int i = 0; i < len; i++) |
787 | { | |
788 | 25937 | if ((i + (j * len)) < astr2.length()) |
789 | { | |
790 | 21224 | output.append(astr2.charAt(i + (j * len))); |
791 | } | |
792 | } | |
793 | ||
794 | 502 | output.append(NEWLINE).append(NEWLINE); |
795 | } | |
796 | ||
797 | 196 | pid = pid / (aseq1.length - count) * 100; |
798 | 196 | output.append(new Format("Percentage ID = %3.2f\n").form(pid)); |
799 | 196 | output.append(NEWLINE); |
800 | 196 | try |
801 | { | |
802 | 196 | os.print(output.toString()); |
803 | } catch (Exception ex) | |
804 | { | |
805 | } | |
806 | } | |
807 | ||
808 | /** | |
809 | * DOCUMENT ME! | |
810 | * | |
811 | * @param i | |
812 | * DOCUMENT ME! | |
813 | * @param j | |
814 | * DOCUMENT ME! | |
815 | * | |
816 | * @return DOCUMENT ME! | |
817 | */ | |
818 | 6509600 | public int findTrace(int i, int j) |
819 | { | |
820 | 6509600 | int t = 0; |
821 | 6509600 | float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), |
822 | s2str.charAt(j)); | |
823 | 6509600 | float max = score[i - 1][j - 1] + (pairwiseScore * 10); |
824 | ||
825 | 6509600 | if (F[i][j] > max) |
826 | { | |
827 | 2114494 | max = F[i][j]; |
828 | 2114494 | t = -1; |
829 | } | |
830 | 4395106 | else if (F[i][j] == max) |
831 | { | |
832 | 241653 | if (prev == -1) |
833 | { | |
834 | 143951 | max = F[i][j]; |
835 | 143951 | t = -1; |
836 | } | |
837 | } | |
838 | ||
839 | 6509600 | if (E[i][j] >= max) |
840 | { | |
841 | 2324713 | max = E[i][j]; |
842 | 2324713 | t = 1; |
843 | } | |
844 | 4184887 | else if (E[i][j] == max) |
845 | { | |
846 | 0 | if (prev == 1) |
847 | { | |
848 | 0 | max = E[i][j]; |
849 | 0 | t = 1; |
850 | } | |
851 | } | |
852 | ||
853 | 6509600 | prev = t; |
854 | ||
855 | 6509600 | return t; |
856 | } | |
857 | ||
858 | /** | |
859 | * DOCUMENT ME! | |
860 | */ | |
861 | 267 | public void calcScoreMatrix() |
862 | { | |
863 | 267 | int n = seq1.length; |
864 | 267 | int m = seq2.length; |
865 | 267 | final int GAP_EX_COST = GAP_EXTEND_COST; |
866 | 267 | final int GAP_OP_COST = GAP_OPEN_COST; |
867 | // top left hand element | |
868 | 267 | score[0][0] = scoreMatrix.getPairwiseScore(s1str.charAt(0), |
869 | s2str.charAt(0)) * 10; | |
870 | 267 | E[0][0] = -GAP_EX_COST; |
871 | 267 | F[0][0] = 0; |
872 | ||
873 | // Calculate the top row first | |
874 | 34127 | for (int j = 1; j < m; j++) |
875 | { | |
876 | // What should these values be? 0 maybe | |
877 | 33860 | E[0][j] = max(score[0][j - 1] - GAP_OP_COST, |
878 | E[0][j - 1] - GAP_EX_COST); | |
879 | 33860 | F[0][j] = -GAP_EX_COST; |
880 | ||
881 | 33860 | float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(0), |
882 | s2str.charAt(j)); | |
883 | 33860 | score[0][j] = max(pairwiseScore * 10, -GAP_OP_COST, -GAP_EX_COST); |
884 | ||
885 | 33860 | traceback[0][j] = 1; |
886 | } | |
887 | ||
888 | // Now do the left hand column | |
889 | 36756 | for (int i = 1; i < n; i++) |
890 | { | |
891 | 36489 | E[i][0] = -GAP_OP_COST; |
892 | 36489 | F[i][0] = max(score[i - 1][0] - GAP_OP_COST, |
893 | F[i - 1][0] - GAP_EX_COST); | |
894 | ||
895 | 36489 | float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), |
896 | s2str.charAt(0)); | |
897 | 36489 | score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]); |
898 | 36489 | traceback[i][0] = -1; |
899 | } | |
900 | ||
901 | // Now do all the other rows | |
902 | 36756 | for (int i = 1; i < n; i++) |
903 | { | |
904 | 6517150 | for (int j = 1; j < m; j++) |
905 | { | |
906 | 6480661 | E[i][j] = max(score[i][j - 1] - GAP_OP_COST, |
907 | E[i][j - 1] - GAP_EX_COST); | |
908 | 6480661 | F[i][j] = max(score[i - 1][j] - GAP_OP_COST, |
909 | F[i - 1][j] - GAP_EX_COST); | |
910 | ||
911 | 6480661 | float pairwiseScore = scoreMatrix.getPairwiseScore(s1str.charAt(i), |
912 | s2str.charAt(j)); | |
913 | 6480661 | score[i][j] = max(score[i - 1][j - 1] + (pairwiseScore * 10), |
914 | E[i][j], F[i][j]); | |
915 | 6480661 | traceback[i][j] = findTrace(i, j); |
916 | } | |
917 | } | |
918 | } | |
919 | ||
920 | /** | |
921 | * Returns the given sequence with all of the given gap characters removed. | |
922 | * | |
923 | * @param gapChars | |
924 | * a string of characters to be treated as gaps | |
925 | * @param seq | |
926 | * the input sequence | |
927 | * | |
928 | * @return | |
929 | */ | |
930 | 7183 | public static String extractGaps(String gapChars, String seq) |
931 | { | |
932 | 7183 | if (gapChars == null || seq == null) |
933 | { | |
934 | 6 | return null; |
935 | } | |
936 | 7177 | StringTokenizer str = new StringTokenizer(seq, gapChars); |
937 | 7177 | StringBuilder newString = new StringBuilder(seq.length()); |
938 | ||
939 | 51363 | while (str.hasMoreTokens()) |
940 | { | |
941 | 44186 | newString.append(str.nextToken()); |
942 | } | |
943 | ||
944 | 7177 | return newString.toString(); |
945 | } | |
946 | ||
947 | /** | |
948 | * DOCUMENT ME! | |
949 | * | |
950 | * @param f1 | |
951 | * DOCUMENT ME! | |
952 | * @param f2 | |
953 | * DOCUMENT ME! | |
954 | * @param f3 | |
955 | * DOCUMENT ME! | |
956 | * | |
957 | * @return DOCUMENT ME! | |
958 | */ | |
959 | 6551010 | private static float max(float f1, float f2, float f3) |
960 | { | |
961 | 6551010 | float max = f1; |
962 | ||
963 | 6551010 | if (f2 > f1) |
964 | { | |
965 | 2118202 | max = f2; |
966 | } | |
967 | ||
968 | 6551010 | if (f3 > max) |
969 | { | |
970 | 2099168 | max = f3; |
971 | } | |
972 | ||
973 | 6551010 | return max; |
974 | } | |
975 | ||
976 | /** | |
977 | * DOCUMENT ME! | |
978 | * | |
979 | * @param f1 | |
980 | * DOCUMENT ME! | |
981 | * @param f2 | |
982 | * DOCUMENT ME! | |
983 | * | |
984 | * @return DOCUMENT ME! | |
985 | */ | |
986 | 13031671 | private static float max(float f1, float f2) |
987 | { | |
988 | 13031671 | float max = f1; |
989 | ||
990 | 13031671 | if (f2 > f1) |
991 | { | |
992 | 6942308 | max = f2; |
993 | } | |
994 | ||
995 | 13031671 | return max; |
996 | } | |
997 | ||
998 | /** | |
999 | * Converts the character string to an array of integers which are the | |
1000 | * corresponding indices to the characters in the score matrix | |
1001 | * | |
1002 | * @param s | |
1003 | * | |
1004 | * @return | |
1005 | */ | |
1006 | 540 | int[] indexEncode(String s) |
1007 | { | |
1008 | 540 | int[] encoded = new int[s.length()]; |
1009 | ||
1010 | 71476 | for (int i = 0; i < s.length(); i++) |
1011 | { | |
1012 | 70936 | char c = s.charAt(i); |
1013 | 70936 | encoded[i] = scoreMatrix.getMatrixIndex(c); |
1014 | } | |
1015 | ||
1016 | 540 | return encoded; |
1017 | } | |
1018 | ||
1019 | /** | |
1020 | * DOCUMENT ME! | |
1021 | * | |
1022 | * @param g | |
1023 | * DOCUMENT ME! | |
1024 | * @param mat | |
1025 | * DOCUMENT ME! | |
1026 | * @param n | |
1027 | * DOCUMENT ME! | |
1028 | * @param m | |
1029 | * DOCUMENT ME! | |
1030 | * @param psize | |
1031 | * DOCUMENT ME! | |
1032 | */ | |
1033 | 0 | public static void displayMatrix(Graphics g, int[][] mat, int n, int m, |
1034 | int psize) | |
1035 | { | |
1036 | // TODO method doesn't seem to be referenced anywhere delete?? | |
1037 | 0 | int max = -1000; |
1038 | 0 | int min = 1000; |
1039 | ||
1040 | 0 | for (int i = 0; i < n; i++) |
1041 | { | |
1042 | 0 | for (int j = 0; j < m; j++) |
1043 | { | |
1044 | 0 | if (mat[i][j] >= max) |
1045 | { | |
1046 | 0 | max = mat[i][j]; |
1047 | } | |
1048 | ||
1049 | 0 | if (mat[i][j] <= min) |
1050 | { | |
1051 | 0 | min = mat[i][j]; |
1052 | } | |
1053 | } | |
1054 | } | |
1055 | ||
1056 | 0 | jalview.bin.Console.outPrintln(max + " " + min); |
1057 | ||
1058 | 0 | for (int i = 0; i < n; i++) |
1059 | { | |
1060 | 0 | for (int j = 0; j < m; j++) |
1061 | { | |
1062 | 0 | int x = psize * i; |
1063 | 0 | int y = psize * j; |
1064 | ||
1065 | // jalview.bin.Console.outPrintln(mat[i][j]); | |
1066 | 0 | float score = (float) (mat[i][j] - min) / (float) (max - min); |
1067 | 0 | g.setColor(new Color(score, 0, 0)); |
1068 | 0 | g.fillRect(x, y, psize, psize); |
1069 | ||
1070 | // jalview.bin.Console.outPrintln(x + " " + y + " " + score); | |
1071 | } | |
1072 | } | |
1073 | } | |
1074 | ||
1075 | /** | |
1076 | * Compute a globally optimal needleman and wunsch alignment between two | |
1077 | * sequences | |
1078 | * | |
1079 | * @param s1 | |
1080 | * @param s2 | |
1081 | * @param type | |
1082 | * AlignSeq.DNA or AlignSeq.PEP | |
1083 | */ | |
1084 | 265 | public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2, |
1085 | String type) | |
1086 | { | |
1087 | 265 | return doGlobalNWAlignment(s1, s2, type, DEFAULT_OPENCOST, |
1088 | DEFAULT_EXTENDCOST); | |
1089 | } | |
1090 | ||
1091 | 265 | public static AlignSeq doGlobalNWAlignment(SequenceI s1, SequenceI s2, |
1092 | String type, int opencost, int extcost) | |
1093 | { | |
1094 | ||
1095 | 265 | AlignSeq as = new AlignSeq(s1, s2, type, opencost, extcost); |
1096 | ||
1097 | 265 | as.calcScoreMatrix(); |
1098 | 265 | as.traceAlignment(); |
1099 | 265 | return as; |
1100 | } | |
1101 | ||
1102 | /** | |
1103 | * | |
1104 | * @return mapping from positions in S1 to corresponding positions in S2 | |
1105 | */ | |
1106 | 224 | public jalview.datamodel.Mapping getMappingFromS1(boolean allowmismatch) |
1107 | { | |
1108 | 224 | ArrayList<Integer> as1 = new ArrayList<Integer>(), |
1109 | as2 = new ArrayList<Integer>(); | |
1110 | 224 | int pdbpos = s2.getStart() + getSeq2Start() - 2; |
1111 | 224 | int alignpos = s1.getStart() + getSeq1Start() - 2; |
1112 | 224 | int lp2 = pdbpos - 3, lp1 = alignpos - 3; |
1113 | 224 | boolean lastmatch = false; |
1114 | // and now trace the alignment onto the atom set. | |
1115 | 28149 | for (int i = 0; i < astr1.length(); i++) |
1116 | { | |
1117 | 27925 | char c1 = astr1.charAt(i), c2 = astr2.charAt(i); |
1118 | 27925 | if (c1 != '-') |
1119 | { | |
1120 | 27925 | alignpos++; |
1121 | } | |
1122 | ||
1123 | 27925 | if (c2 != '-') |
1124 | { | |
1125 | 27925 | pdbpos++; |
1126 | } | |
1127 | ||
1128 | // ignore case differences | |
1129 | 27925 | if (allowmismatch || (c1 == c2) || (Math.abs(c2 - c1) == ('a' - 'A'))) |
1130 | { | |
1131 | // extend mapping interval | |
1132 | 27879 | if (lp1 + 1 != alignpos || lp2 + 1 != pdbpos) |
1133 | { | |
1134 | 265 | as1.add(Integer.valueOf(alignpos)); |
1135 | 265 | as2.add(Integer.valueOf(pdbpos)); |
1136 | } | |
1137 | 27879 | lastmatch = true; |
1138 | 27879 | lp1 = alignpos; |
1139 | 27879 | lp2 = pdbpos; |
1140 | } | |
1141 | else | |
1142 | { | |
1143 | // extend mapping interval | |
1144 | 46 | if (lastmatch) |
1145 | { | |
1146 | 41 | as1.add(Integer.valueOf(lp1)); |
1147 | 41 | as2.add(Integer.valueOf(lp2)); |
1148 | } | |
1149 | 46 | lastmatch = false; |
1150 | } | |
1151 | } | |
1152 | // construct range pairs | |
1153 | ||
1154 | 224 | int[] mapseq1 = new int[as1.size() + (lastmatch ? 1 : 0)], |
1155 | 224 | mapseq2 = new int[as2.size() + (lastmatch ? 1 : 0)]; |
1156 | 224 | int i = 0; |
1157 | 224 | for (Integer ip : as1) |
1158 | { | |
1159 | 306 | mapseq1[i++] = ip; |
1160 | } | |
1161 | 224 | ; |
1162 | 224 | i = 0; |
1163 | 224 | for (Integer ip : as2) |
1164 | { | |
1165 | 306 | mapseq2[i++] = ip; |
1166 | } | |
1167 | 224 | ; |
1168 | 224 | if (lastmatch) |
1169 | { | |
1170 | 224 | mapseq1[mapseq1.length - 1] = alignpos; |
1171 | 224 | mapseq2[mapseq2.length - 1] = pdbpos; |
1172 | } | |
1173 | 224 | MapList map = new MapList(mapseq1, mapseq2, 1, 1); |
1174 | ||
1175 | 224 | jalview.datamodel.Mapping mapping = new Mapping(map); |
1176 | 224 | mapping.setTo(s2); |
1177 | 224 | return mapping; |
1178 | } | |
1179 | ||
1180 | /** | |
1181 | * matches ochains against al and populates seqs with the best match between | |
1182 | * each ochain and the set in al | |
1183 | * | |
1184 | * @param ochains | |
1185 | * @param al | |
1186 | * @param dnaOrProtein | |
1187 | * @param removeOldAnnots | |
1188 | * when true, old annotation is cleared before new annotation | |
1189 | * transferred | |
1190 | * @return List<List<SequenceI> originals, List<SequenceI> replacement, | |
1191 | * List<AlignSeq> alignment between each> | |
1192 | */ | |
1193 | 5 | public static List<List<? extends Object>> replaceMatchingSeqsWith( |
1194 | List<SequenceI> seqs, List<AlignmentAnnotation> annotations, | |
1195 | List<SequenceI> ochains, AlignmentI al, String dnaOrProtein, | |
1196 | boolean removeOldAnnots) | |
1197 | { | |
1198 | 5 | List<SequenceI> orig = new ArrayList<SequenceI>(), |
1199 | repl = new ArrayList<SequenceI>(); | |
1200 | 5 | List<AlignSeq> aligs = new ArrayList<AlignSeq>(); |
1201 | 5 | if (al != null && al.getHeight() > 0) |
1202 | { | |
1203 | 5 | ArrayList<SequenceI> matches = new ArrayList<SequenceI>(); |
1204 | 5 | ArrayList<AlignSeq> aligns = new ArrayList<AlignSeq>(); |
1205 | ||
1206 | 5 | for (SequenceI sq : ochains) |
1207 | { | |
1208 | 14 | SequenceI bestm = null; |
1209 | 14 | AlignSeq bestaseq = null; |
1210 | 14 | float bestscore = 0; |
1211 | 14 | for (SequenceI msq : al.getSequences()) |
1212 | { | |
1213 | 30 | AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein); |
1214 | 30 | if (bestm == null || aseq.getMaxScore() > bestscore) |
1215 | { | |
1216 | 14 | bestscore = aseq.getMaxScore(); |
1217 | 14 | bestaseq = aseq; |
1218 | 14 | bestm = msq; |
1219 | } | |
1220 | } | |
1221 | // jalview.bin.Console.outPrintln("Best Score for " + (matches.size() + | |
1222 | // 1) + " :" | |
1223 | // + bestscore); | |
1224 | 14 | matches.add(bestm); |
1225 | 14 | aligns.add(bestaseq); |
1226 | 14 | al.deleteSequence(bestm); |
1227 | } | |
1228 | 19 | for (int p = 0, pSize = seqs.size(); p < pSize; p++) |
1229 | { | |
1230 | 14 | SequenceI sq, sp = seqs.get(p); |
1231 | 14 | int q; |
1232 | ? | if ((q = ochains.indexOf(sp)) > -1) |
1233 | { | |
1234 | 14 | seqs.set(p, sq = matches.get(q)); |
1235 | 14 | orig.add(sp); |
1236 | 14 | repl.add(sq); |
1237 | 14 | sq.setName(sp.getName()); |
1238 | 14 | sq.setDescription(sp.getDescription()); |
1239 | 14 | Mapping sp2sq; |
1240 | 14 | sq.transferAnnotation(sp, |
1241 | sp2sq = aligns.get(q).getMappingFromS1(false)); | |
1242 | 14 | aligs.add(aligns.get(q)); |
1243 | 14 | int inspos = -1; |
1244 | 46 | for (int ap = 0; ap < annotations.size();) |
1245 | { | |
1246 | 32 | if (annotations.get(ap).sequenceRef == sp) |
1247 | { | |
1248 | 4 | if (inspos == -1) |
1249 | { | |
1250 | 4 | inspos = ap; |
1251 | } | |
1252 | 4 | if (removeOldAnnots) |
1253 | { | |
1254 | 0 | annotations.remove(ap); |
1255 | } | |
1256 | else | |
1257 | { | |
1258 | 4 | AlignmentAnnotation alan = annotations.remove(ap); |
1259 | 4 | alan.liftOver(sq, sp2sq); |
1260 | 4 | alan.setSequenceRef(sq); |
1261 | 4 | sq.addAlignmentAnnotation(alan); |
1262 | } | |
1263 | } | |
1264 | else | |
1265 | { | |
1266 | 28 | ap++; |
1267 | } | |
1268 | } | |
1269 | 14 | if (sq.getAnnotation() != null && sq.getAnnotation().length > 0) |
1270 | { | |
1271 | 14 | annotations.addAll(inspos == -1 ? annotations.size() : inspos, |
1272 | Arrays.asList(sq.getAnnotation())); | |
1273 | } | |
1274 | } | |
1275 | } | |
1276 | } | |
1277 | 5 | return Arrays.asList(orig, repl, aligs); |
1278 | } | |
1279 | ||
1280 | /** | |
1281 | * compute the PID vector used by the redundancy filter. | |
1282 | * | |
1283 | * @param originalSequences | |
1284 | * - sequences in alignment that are to filtered | |
1285 | * @param omitHidden | |
1286 | * - null or strings to be analysed (typically, visible portion of | |
1287 | * each sequence in alignment) | |
1288 | * @param start | |
1289 | * - first column in window for calculation | |
1290 | * @param end | |
1291 | * - last column in window for calculation | |
1292 | * @param ungapped | |
1293 | * - if true then use ungapped sequence to compute PID | |
1294 | * @return vector containing maximum PID for i-th sequence and any sequences | |
1295 | * longer than that seuqence | |
1296 | */ | |
1297 | 0 | public static float[] computeRedundancyMatrix( |
1298 | SequenceI[] originalSequences, String[] omitHidden, int start, | |
1299 | int end, boolean ungapped) | |
1300 | { | |
1301 | 0 | int height = originalSequences.length; |
1302 | 0 | float[] redundancy = new float[height]; |
1303 | 0 | int[] lngth = new int[height]; |
1304 | 0 | for (int i = 0; i < height; i++) |
1305 | { | |
1306 | 0 | redundancy[i] = 0f; |
1307 | 0 | lngth[i] = -1; |
1308 | } | |
1309 | ||
1310 | // long start = System.currentTimeMillis(); | |
1311 | ||
1312 | 0 | SimilarityParams pidParams = new SimilarityParams(true, true, true, |
1313 | true); | |
1314 | 0 | float pid; |
1315 | 0 | String seqi, seqj; |
1316 | 0 | for (int i = 0; i < height; i++) |
1317 | { | |
1318 | ||
1319 | 0 | for (int j = 0; j < i; j++) |
1320 | { | |
1321 | 0 | if (i == j) |
1322 | { | |
1323 | 0 | continue; |
1324 | } | |
1325 | ||
1326 | 0 | if (omitHidden == null) |
1327 | { | |
1328 | 0 | seqi = originalSequences[i].getSequenceAsString(start, end); |
1329 | 0 | seqj = originalSequences[j].getSequenceAsString(start, end); |
1330 | } | |
1331 | else | |
1332 | { | |
1333 | 0 | seqi = omitHidden[i]; |
1334 | 0 | seqj = omitHidden[j]; |
1335 | } | |
1336 | 0 | if (lngth[i] == -1) |
1337 | { | |
1338 | 0 | String ug = AlignSeq.extractGaps(Comparison.GapChars, seqi); |
1339 | 0 | lngth[i] = ug.length(); |
1340 | 0 | if (ungapped) |
1341 | { | |
1342 | 0 | seqi = ug; |
1343 | } | |
1344 | } | |
1345 | 0 | if (lngth[j] == -1) |
1346 | { | |
1347 | 0 | String ug = AlignSeq.extractGaps(Comparison.GapChars, seqj); |
1348 | 0 | lngth[j] = ug.length(); |
1349 | 0 | if (ungapped) |
1350 | { | |
1351 | 0 | seqj = ug; |
1352 | } | |
1353 | } | |
1354 | 0 | pid = (float) PIDModel.computePID(seqi, seqj, pidParams); |
1355 | ||
1356 | // use real sequence length rather than string length | |
1357 | 0 | if (lngth[j] < lngth[i]) |
1358 | { | |
1359 | 0 | redundancy[j] = Math.max(pid, redundancy[j]); |
1360 | } | |
1361 | else | |
1362 | { | |
1363 | 0 | redundancy[i] = Math.max(pid, redundancy[i]); |
1364 | } | |
1365 | ||
1366 | } | |
1367 | } | |
1368 | 0 | return redundancy; |
1369 | } | |
1370 | ||
1371 | /** | |
1372 | * calculate the mean score of the alignment mean score is equal to the score | |
1373 | * of an alignmenet of two sequences with randomly shuffled AA sequence | |
1374 | * composited of the same AA as the two original sequences | |
1375 | * | |
1376 | */ | |
1377 | 2 | public void meanScore() |
1378 | { | |
1379 | 2 | int length = indelfreeAstr1.length(); // both have the same length |
1380 | // create HashMap for counting residues in each sequence | |
1381 | 2 | HashMap<Character, Integer> seq1ResCount = new HashMap<Character, Integer>(); |
1382 | 2 | HashMap<Character, Integer> seq2ResCount = new HashMap<Character, Integer>(); |
1383 | ||
1384 | // for both sequences (String indelfreeAstr1 or 2) create a key for the | |
1385 | // residue and add 1 each time its encountered | |
1386 | 2 | for (char residue : indelfreeAstr1.toCharArray()) |
1387 | { | |
1388 | 8 | seq1ResCount.putIfAbsent(residue, 0); |
1389 | 8 | seq1ResCount.replace(residue, seq1ResCount.get(residue) + 1); |
1390 | } | |
1391 | 2 | for (char residue : indelfreeAstr2.toCharArray()) |
1392 | { | |
1393 | 8 | seq2ResCount.putIfAbsent(residue, 0); |
1394 | 8 | seq2ResCount.replace(residue, seq2ResCount.get(residue) + 1); |
1395 | } | |
1396 | ||
1397 | // meanscore = for each residue pair get the number of appearance and add | |
1398 | // (countA * countB * pairwiseScore(AB)) | |
1399 | // divide the meanscore by the sequence length afterwards | |
1400 | 2 | float _meanscore = 0; |
1401 | 2 | for (char resA : seq1ResCount.keySet()) |
1402 | { | |
1403 | 8 | for (char resB : seq2ResCount.keySet()) |
1404 | { | |
1405 | 32 | int countA = seq1ResCount.get(resA); |
1406 | 32 | int countB = seq2ResCount.get(resB); |
1407 | ||
1408 | 32 | float scoreAB = scoreMatrix.getPairwiseScore(resA, resB); |
1409 | ||
1410 | 32 | _meanscore += countA * countB * scoreAB; |
1411 | } | |
1412 | } | |
1413 | 2 | _meanscore /= length; |
1414 | 2 | this.meanScore = _meanscore; |
1415 | } | |
1416 | ||
1417 | 0 | public float getMeanScore() |
1418 | { | |
1419 | 0 | return this.meanScore; |
1420 | } | |
1421 | ||
1422 | /** | |
1423 | * calculate the hypothetic max score using the self-alignment of the | |
1424 | * sequences | |
1425 | */ | |
1426 | 2 | public void hypotheticMaxScore() |
1427 | { | |
1428 | 2 | int _hmsA = 0; |
1429 | 2 | int _hmsB = 0; |
1430 | 2 | for (char residue : indelfreeAstr1.toCharArray()) |
1431 | { | |
1432 | 8 | _hmsA += scoreMatrix.getPairwiseScore(residue, residue); |
1433 | } | |
1434 | 2 | for (char residue : indelfreeAstr2.toCharArray()) |
1435 | { | |
1436 | 8 | _hmsB += scoreMatrix.getPairwiseScore(residue, residue); |
1437 | } | |
1438 | 2 | this.hypotheticMaxScore = (_hmsA < _hmsB) ? _hmsA : _hmsB; // take the lower |
1439 | // self alignment | |
1440 | ||
1441 | } | |
1442 | ||
1443 | 0 | public int getHypotheticMaxScore() |
1444 | { | |
1445 | 0 | return this.hypotheticMaxScore; |
1446 | } | |
1447 | ||
1448 | /** | |
1449 | * create strings based of astr1 and astr2 but without gaps | |
1450 | */ | |
1451 | 2 | public void getIndelfreeAstr() |
1452 | { | |
1453 | 2 | int n = astr1.length(); // both have the same length |
1454 | 10 | for (int i = 0; i < n; i++) |
1455 | { | |
1456 | 8 | if (Character.isLetter(astr1.charAt(i)) |
1457 | && Character.isLetter(astr2.charAt(i))) // if both sequences dont | |
1458 | // have a gap -> add to | |
1459 | // indelfreeAstr | |
1460 | { | |
1461 | 8 | this.indelfreeAstr1 += astr1.charAt(i); |
1462 | 8 | this.indelfreeAstr2 += astr2.charAt(i); |
1463 | } | |
1464 | } | |
1465 | } | |
1466 | ||
1467 | /** | |
1468 | * calculates the overall score of the alignment preprescore = sum of all | |
1469 | * scores - all penalties if preprescore < 1 ~ alignmentScore = Float.NaN > | |
1470 | * alignmentScore = ((preprescore - meanScore) / (hypotheticMaxScore - | |
1471 | * meanScore)) * coverage | |
1472 | */ | |
1473 | 2 | public void scoreAlignment() |
1474 | { | |
1475 | ||
1476 | 2 | getIndelfreeAstr(); |
1477 | 2 | meanScore(); |
1478 | 2 | hypotheticMaxScore(); |
1479 | // cannot calculate score because denominator would be zero | |
1480 | 2 | if (this.hypotheticMaxScore == this.meanScore) |
1481 | { | |
1482 | 0 | this.alignmentScore = Float.NaN; |
1483 | 0 | return; |
1484 | } | |
1485 | 2 | int n = indelfreeAstr1.length(); |
1486 | ||
1487 | 2 | float score = 0; |
1488 | 2 | boolean aGapOpen = false; |
1489 | 2 | boolean bGapOpen = false; |
1490 | 10 | for (int i = 0; i < n; i++) |
1491 | { | |
1492 | 8 | char char1 = indelfreeAstr1.charAt(i); |
1493 | 8 | char char2 = indelfreeAstr2.charAt(i); |
1494 | 8 | boolean aIsLetter = Character.isLetter(char1); |
1495 | 8 | boolean bIsLetter = Character.isLetter(char2); |
1496 | 8 | if (aIsLetter && bIsLetter) // if pair -> get score |
1497 | { | |
1498 | 8 | score += scoreMatrix.getPairwiseScore(char1, char2); |
1499 | } | |
1500 | 0 | else if (!aIsLetter && !bIsLetter) |
1501 | { // both are gap -> skip | |
1502 | } | |
1503 | 0 | else if ((!aIsLetter && aGapOpen) || (!bIsLetter && bGapOpen)) |
1504 | { // one side gapopen -> score - gap_extend | |
1505 | 0 | score -= GAP_EXTEND_COST; |
1506 | } | |
1507 | else | |
1508 | { // no gap open -> score - gap_open | |
1509 | 0 | score -= GAP_OPEN_COST; |
1510 | } | |
1511 | // adjust GapOpen status in both sequences | |
1512 | 8 | aGapOpen = (!aIsLetter) ? true : false; |
1513 | 8 | bGapOpen = (!bIsLetter) ? true : false; |
1514 | } | |
1515 | ||
1516 | 2 | float preprescore = score; // if this score < 1 --> alignment score = |
1517 | // Float.NaN | |
1518 | 2 | score = (score - this.meanScore) |
1519 | / (this.hypotheticMaxScore - this.meanScore); | |
1520 | 2 | int[] _max = MiscMath |
1521 | .findMax(new int[] | |
1522 | { astr1.replace("-", "").length(), | |
1523 | astr2.replace("-", "").length() }); // {index of max, max} | |
1524 | 2 | float coverage = (float) n / (float) _max[1]; // indelfreeAstr length / |
1525 | // longest sequence length | |
1526 | 2 | float prescore = score; // only debug |
1527 | 2 | score *= coverage; |
1528 | ||
1529 | // System.out.println(String.format("prepre-score: %f, pre-score: %f, | |
1530 | // longlength: %d\nscore: %1.16f, mean: %f, max: %d", preprescore, prescore, | |
1531 | // _max[1], score, this.meanScore, this.hypotheticMaxScore)); | |
1532 | 2 | float minScore = 0f; |
1533 | 2 | this.alignmentScore = (score <= minScore) ? Float.NaN : score; |
1534 | } | |
1535 | ||
1536 | 0 | public void setScoreMatrix(ScoreMatrix sm) |
1537 | { | |
1538 | 0 | if (sm != null) |
1539 | { | |
1540 | 0 | scoreMatrix = sm; |
1541 | } | |
1542 | } | |
1543 | } |