Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
MappedFeatures | 42 | 69 | 30 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.datamodel; | |
22 | ||
23 | import java.util.Locale; | |
24 | ||
25 | import java.util.HashSet; | |
26 | import java.util.List; | |
27 | import java.util.Set; | |
28 | ||
29 | import jalview.io.gff.Gff3Helper; | |
30 | import jalview.schemes.ResidueProperties; | |
31 | import jalview.util.MapList; | |
32 | import jalview.util.MappingUtils; | |
33 | import jalview.util.StringUtils; | |
34 | ||
35 | /** | |
36 | * A data bean to hold a list of mapped sequence features (e.g. CDS features | |
37 | * mapped from protein), and the mapping between the sequences. It also provides | |
38 | * a method to derive peptide variants from codon variants. | |
39 | * | |
40 | * @author gmcarstairs | |
41 | */ | |
42 | public class MappedFeatures | |
43 | { | |
44 | /* | |
45 | * VEP CSQ:HGVSp (if present) is a short-cut to the protein variant consequence | |
46 | */ | |
47 | private static final String HGV_SP = "HGVSp"; | |
48 | ||
49 | private static final String CSQ = "CSQ"; | |
50 | ||
51 | /* | |
52 | * the sequence the mapped features are on | |
53 | */ | |
54 | private final SequenceI featureSequence; | |
55 | ||
56 | /* | |
57 | * the mapping between sequences; | |
58 | * NB this could be in either sense (from or to featureSequence) | |
59 | */ | |
60 | private final Mapping mapping; | |
61 | ||
62 | /* | |
63 | * features on featureSequence that overlap the mapped positions | |
64 | */ | |
65 | public final List<SequenceFeature> features; | |
66 | ||
67 | /* | |
68 | * the residue position in the sequence mapped to | |
69 | */ | |
70 | private final int toPosition; | |
71 | ||
72 | /* | |
73 | * the residue at toPosition | |
74 | */ | |
75 | private final char toResidue; | |
76 | ||
77 | /* | |
78 | * if the mapping is 3:1 or 1:3 (peptide to CDS), this holds the | |
79 | * mapped positions i.e. codon base positions in CDS; to | |
80 | * support calculation of peptide variants from alleles | |
81 | */ | |
82 | private final int[] codonPos; | |
83 | ||
84 | private final char[] baseCodon; | |
85 | ||
86 | /** | |
87 | * Constructor | |
88 | * | |
89 | * @param theMapping | |
90 | * sequence mapping (which may be either to, or from, the sequence | |
91 | * holding the linked features) | |
92 | * @param featureSeq | |
93 | * the sequence hosting the virtual features | |
94 | * @param pos | |
95 | * the residue position in the sequence mapped to | |
96 | * @param res | |
97 | * the residue character at position pos | |
98 | * @param theFeatures | |
99 | * list of mapped features found in the 'featureSeq' sequence at the | |
100 | * mapped position(s) | |
101 | */ | |
102 | 7 | public MappedFeatures(Mapping theMapping, SequenceI featureSeq, int pos, |
103 | char res, List<SequenceFeature> theFeatures) | |
104 | { | |
105 | 7 | mapping = theMapping; |
106 | 7 | featureSequence = featureSeq; |
107 | 7 | toPosition = pos; |
108 | 7 | toResidue = res; |
109 | 7 | features = theFeatures; |
110 | ||
111 | /* | |
112 | * determine codon positions and canonical codon | |
113 | * for a peptide-to-CDS mapping | |
114 | */ | |
115 | 7 | int[] codonIntervals = mapping.getMap().locateInFrom(toPosition, |
116 | toPosition); | |
117 | 7 | int[] codonPositions = codonIntervals == null ? null |
118 | : MappingUtils.flattenRanges(codonIntervals); | |
119 | 7 | if (codonPositions != null && codonPositions.length == 3) |
120 | { | |
121 | 6 | codonPos = codonPositions; |
122 | 6 | baseCodon = new char[3]; |
123 | 6 | int cdsStart = featureSequence.getStart(); |
124 | 6 | baseCodon[0] = Character.toUpperCase( |
125 | featureSequence.getCharAt(codonPos[0] - cdsStart)); | |
126 | 6 | baseCodon[1] = Character.toUpperCase( |
127 | featureSequence.getCharAt(codonPos[1] - cdsStart)); | |
128 | 6 | baseCodon[2] = Character.toUpperCase( |
129 | featureSequence.getCharAt(codonPos[2] - cdsStart)); | |
130 | } | |
131 | else | |
132 | { | |
133 | 1 | codonPos = null; |
134 | 1 | baseCodon = null; |
135 | } | |
136 | } | |
137 | ||
138 | /** | |
139 | * Computes and returns comma-delimited HGVS notation peptide variants derived | |
140 | * from codon allele variants. If no variants are found, answers an empty | |
141 | * string. The peptide variant is either simply read from the "CSQ:HGVSp" | |
142 | * attribute if present, else computed based on the "alleles" attribute if | |
143 | * present. If neither attribute is found, no variant (empty string) is | |
144 | * returned. | |
145 | * | |
146 | * @param sf | |
147 | * a sequence feature (which must be one of those held in this | |
148 | * object) | |
149 | * @return | |
150 | */ | |
151 | 14 | public String findProteinVariants(SequenceFeature sf) |
152 | { | |
153 | 14 | if (!features.contains(sf) || baseCodon == null) |
154 | { | |
155 | 2 | return ""; |
156 | } | |
157 | ||
158 | /* | |
159 | * VCF data may already contain the protein consequence | |
160 | */ | |
161 | 12 | String hgvsp = sf.getValueAsString(CSQ, HGV_SP); |
162 | 12 | if (hgvsp != null) |
163 | { | |
164 | 3 | int colonPos = hgvsp.lastIndexOf(':'); |
165 | 3 | if (colonPos >= 0) |
166 | { | |
167 | 2 | String var = hgvsp.substring(colonPos + 1); |
168 | 2 | if (var.contains("p.")) // sanity check |
169 | { | |
170 | 1 | return var; |
171 | } | |
172 | } | |
173 | } | |
174 | ||
175 | /* | |
176 | * otherwise, compute codon and peptide variant | |
177 | */ | |
178 | 11 | int cdsPos = sf.getBegin(); |
179 | 11 | if (cdsPos != sf.getEnd()) |
180 | { | |
181 | // not handling multi-locus variant features | |
182 | 1 | return ""; |
183 | } | |
184 | 10 | if (cdsPos != codonPos[0] && cdsPos != codonPos[1] |
185 | && cdsPos != codonPos[2]) | |
186 | { | |
187 | // e.g. feature on intron within spliced codon! | |
188 | 0 | return ""; |
189 | } | |
190 | ||
191 | 10 | String alls = (String) sf.getValue(Gff3Helper.ALLELES); |
192 | 10 | if (alls == null) |
193 | { | |
194 | 1 | return ""; |
195 | } | |
196 | ||
197 | 9 | String from3 = StringUtils.toSentenceCase( |
198 | ResidueProperties.aa2Triplet.get(String.valueOf(toResidue))); | |
199 | ||
200 | /* | |
201 | * make a peptide variant for each SNP allele | |
202 | * e.g. C,G,T gives variants G and T for base C | |
203 | */ | |
204 | 9 | Set<String> variantPeptides = new HashSet<>(); |
205 | 9 | String[] alleles = alls.toUpperCase(Locale.ROOT).split(","); |
206 | 9 | StringBuilder vars = new StringBuilder(); |
207 | ||
208 | 9 | for (String allele : alleles) |
209 | { | |
210 | 23 | allele = allele.trim().toUpperCase(Locale.ROOT); |
211 | 23 | if (allele.length() > 1 || "-".equals(allele)) |
212 | { | |
213 | 4 | continue; // multi-locus variant |
214 | } | |
215 | 19 | char[] variantCodon = new char[3]; |
216 | 19 | variantCodon[0] = baseCodon[0]; |
217 | 19 | variantCodon[1] = baseCodon[1]; |
218 | 19 | variantCodon[2] = baseCodon[2]; |
219 | ||
220 | /* | |
221 | * poke variant base into canonical codon; | |
222 | * ignore first 'allele' (canonical base) | |
223 | */ | |
224 | 19 | final int i = cdsPos == codonPos[0] ? 0 |
225 | 12 | : (cdsPos == codonPos[1] ? 1 : 2); |
226 | 19 | variantCodon[i] = allele.toUpperCase(Locale.ROOT).charAt(0); |
227 | 19 | if (variantCodon[i] == baseCodon[i]) |
228 | { | |
229 | 9 | continue; |
230 | } | |
231 | 10 | String codon = new String(variantCodon); |
232 | 10 | String peptide = ResidueProperties.codonTranslate(codon); |
233 | 10 | boolean synonymous = toResidue == peptide.charAt(0); |
234 | 10 | StringBuilder var = new StringBuilder(); |
235 | 10 | if (synonymous) |
236 | { | |
237 | /* | |
238 | * synonymous variant notation e.g. c.1062C>A(p.=) | |
239 | */ | |
240 | 3 | var.append("c.").append(String.valueOf(cdsPos)) |
241 | .append(String.valueOf(baseCodon[i])).append(">") | |
242 | .append(String.valueOf(variantCodon[i])).append("(p.=)"); | |
243 | } | |
244 | else | |
245 | { | |
246 | /* | |
247 | * missense variant notation e.g. p.Arg355Met | |
248 | */ | |
249 | 7 | String to3 = ResidueProperties.STOP.equals(peptide) ? "Ter" |
250 | : StringUtils.toSentenceCase( | |
251 | ResidueProperties.aa2Triplet.get(peptide)); | |
252 | 7 | var.append("p.").append(from3).append(String.valueOf(toPosition)) |
253 | .append(to3); | |
254 | } | |
255 | 10 | if (!variantPeptides.contains(peptide)) // duplicate consequence |
256 | { | |
257 | 10 | variantPeptides.add(peptide); |
258 | 10 | if (vars.length() > 0) |
259 | { | |
260 | 1 | vars.append(","); |
261 | } | |
262 | 10 | vars.append(var); |
263 | } | |
264 | } | |
265 | ||
266 | 9 | return vars.toString(); |
267 | } | |
268 | ||
269 | /** | |
270 | * Answers the name of the linked sequence holding any mapped features | |
271 | * | |
272 | * @return | |
273 | */ | |
274 | 2 | public String getLinkedSequenceName() |
275 | { | |
276 | 2 | return featureSequence == null ? null : featureSequence.getName(); |
277 | } | |
278 | ||
279 | /** | |
280 | * Answers the mapped ranges (as one or more [start, end] positions) which | |
281 | * correspond to the given [begin, end] range of the linked sequence. | |
282 | * | |
283 | * <pre> | |
284 | * Example: MappedFeatures with CDS features mapped to peptide | |
285 | * CDS/200-220 gtc aac TGa acGt att AAC tta | |
286 | * mapped to PEP/6-7 WN by mapping [206, 207, 210, 210, 215, 217] to [6, 7] | |
287 | * getMappedPositions(206, 206) should return [6, 6] | |
288 | * getMappedPositions(200, 214) should return [6, 6] | |
289 | * getMappedPositions(210, 215) should return [6, 7] | |
290 | * </pre> | |
291 | * | |
292 | * @param begin | |
293 | * @param end | |
294 | * @return | |
295 | */ | |
296 | 5 | public int[] getMappedPositions(int begin, int end) |
297 | { | |
298 | 5 | MapList map = mapping.getMap(); |
299 | 5 | return mapping.to == featureSequence ? map.getOverlapsInFrom(begin, end) |
300 | : map.getOverlapsInTo(begin, end); | |
301 | } | |
302 | ||
303 | /** | |
304 | * Answers true if the linked features are on coding sequence, false if on | |
305 | * peptide | |
306 | * | |
307 | * @return | |
308 | */ | |
309 | 4 | public boolean isFromCds() |
310 | { | |
311 | 4 | if (mapping.getMap().getFromRatio() == 3) |
312 | { | |
313 | 4 | return mapping.to != featureSequence; |
314 | } | |
315 | 0 | return mapping.to == featureSequence; |
316 | } | |
317 | } |