Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
EmblFlatFileTest | 49 | 178 | 16 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import static org.testng.Assert.assertEquals; | |
24 | import static org.testng.Assert.assertTrue; | |
25 | import static org.testng.AssertJUnit.assertNotNull; | |
26 | import static org.testng.AssertJUnit.assertNull; | |
27 | import static org.testng.AssertJUnit.assertSame; | |
28 | import static org.testng.AssertJUnit.fail; | |
29 | ||
30 | import java.io.File; | |
31 | import java.io.IOException; | |
32 | import java.net.MalformedURLException; | |
33 | import java.util.Arrays; | |
34 | import java.util.List; | |
35 | import java.util.Set; | |
36 | ||
37 | import org.testng.annotations.BeforeClass; | |
38 | import org.testng.annotations.Test; | |
39 | ||
40 | import jalview.bin.Console; | |
41 | import jalview.datamodel.DBRefEntry; | |
42 | import jalview.datamodel.Mapping; | |
43 | import jalview.datamodel.Sequence.DBModList; | |
44 | import jalview.datamodel.SequenceFeature; | |
45 | import jalview.datamodel.SequenceI; | |
46 | import jalview.datamodel.features.SequenceFeatures; | |
47 | import jalview.util.MapList; | |
48 | ||
49 | public class EmblFlatFileTest | |
50 | { | |
51 | 1 | @BeforeClass(alwaysRun = true) |
52 | public void setUp() | |
53 | { | |
54 | 1 | Console.initLogger(); |
55 | } | |
56 | ||
57 | /** | |
58 | * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features, | |
59 | * one of them reverse strand | |
60 | * | |
61 | * @throws MalformedURLException | |
62 | * @throws IOException | |
63 | */ | |
64 | 1 | @Test(groups = "Functional") |
65 | public void testParse() throws MalformedURLException, IOException | |
66 | { | |
67 | 1 | File dataFile = new File("test/jalview/io/J03321.embl.txt"); |
68 | 1 | FileParse fp = new FileParse(dataFile, DataSourceType.FILE); |
69 | 1 | EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); |
70 | 1 | List<SequenceI> seqs = parser.getSeqs(); |
71 | ||
72 | 1 | assertEquals(seqs.size(), 1); |
73 | 1 | SequenceI seq = seqs.get(0); |
74 | 1 | assertEquals(seq.getName(), "EmblTest|J03321"); |
75 | 1 | assertEquals(seq.getLength(), 7502); |
76 | 1 | assertEquals(seq.getDescription(), |
77 | "Chlamydia trachomatis plasmid pCHL1, complete sequence"); | |
78 | ||
79 | /* | |
80 | * should be 9 CDS features (one is a 'join' of two exons) | |
81 | */ | |
82 | 1 | Set<String> featureTypes = seq.getFeatures().getFeatureTypes(); |
83 | 1 | assertEquals(featureTypes.size(), 1); |
84 | 1 | assertTrue(featureTypes.contains("CDS")); |
85 | ||
86 | /* | |
87 | * inspect some features (sorted just for convenience of test assertions) | |
88 | */ | |
89 | 1 | List<SequenceFeature> features = seq.getFeatures() |
90 | .getAllFeatures("CDS"); | |
91 | 1 | SequenceFeatures.sortFeatures(features, true); |
92 | 1 | assertEquals(features.size(), 9); |
93 | ||
94 | 1 | SequenceFeature sf = features.get(0); |
95 | 1 | assertEquals(sf.getBegin(), 1); |
96 | 1 | assertEquals(sf.getEnd(), 437); |
97 | 1 | assertEquals(sf.getDescription(), |
98 | "Exon 2 for protein EMBLCDS:AAA91567.1"); | |
99 | 1 | assertEquals(sf.getFeatureGroup(), "EmblTest"); |
100 | 1 | assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); |
101 | 1 | assertEquals(sf.getPhase(), "0"); |
102 | 1 | assertEquals(sf.getStrand(), 1); |
103 | 1 | assertEquals(sf.getValue("note"), "pGP7-D"); |
104 | // this is the second exon of circular CDS! | |
105 | 1 | assertEquals(sf.getValue("exon number"), 2); |
106 | 1 | assertEquals(sf.getValue("product"), "hypothetical protein"); |
107 | 1 | assertEquals(sf.getValue("transl_table"), "11"); |
108 | ||
109 | 1 | sf = features.get(1); |
110 | 1 | assertEquals(sf.getBegin(), 488); |
111 | 1 | assertEquals(sf.getEnd(), 1480); |
112 | 1 | assertEquals(sf.getDescription(), |
113 | "Exon 1 for protein EMBLCDS:AAA91568.1"); | |
114 | 1 | assertEquals(sf.getFeatureGroup(), "EmblTest"); |
115 | 1 | assertEquals(sf.getEnaLocation(), "complement(488..1480)"); |
116 | 1 | assertEquals(sf.getPhase(), "0"); |
117 | 1 | assertEquals(sf.getStrand(), -1); // reverse strand! |
118 | 1 | assertEquals(sf.getValue("note"), "pGP8-D"); |
119 | 1 | assertEquals(sf.getValue("exon number"), 1); |
120 | 1 | assertEquals(sf.getValue("product"), "hypothetical protein"); |
121 | ||
122 | 1 | sf = features.get(7); |
123 | 1 | assertEquals(sf.getBegin(), 6045); |
124 | 1 | assertEquals(sf.getEnd(), 6788); |
125 | 1 | assertEquals(sf.getDescription(), |
126 | "Exon 1 for protein EMBLCDS:AAA91574.1"); | |
127 | 1 | assertEquals(sf.getFeatureGroup(), "EmblTest"); |
128 | 1 | assertEquals(sf.getEnaLocation(), "6045..6788"); |
129 | 1 | assertEquals(sf.getPhase(), "0"); |
130 | 1 | assertEquals(sf.getStrand(), 1); |
131 | 1 | assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)"); |
132 | 1 | assertEquals(sf.getValue("exon number"), 1); |
133 | 1 | assertEquals(sf.getValue("product"), "hypothetical protein"); |
134 | ||
135 | /* | |
136 | * CDS at 7022-7502 is the first exon of the circular CDS | |
137 | */ | |
138 | 1 | sf = features.get(8); |
139 | 1 | assertEquals(sf.getBegin(), 7022); |
140 | 1 | assertEquals(sf.getEnd(), 7502); |
141 | 1 | assertEquals(sf.getDescription(), |
142 | "Exon 1 for protein EMBLCDS:AAA91567.1"); | |
143 | 1 | assertEquals(sf.getFeatureGroup(), "EmblTest"); |
144 | 1 | assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)"); |
145 | 1 | assertEquals(sf.getPhase(), "0"); |
146 | 1 | assertEquals(sf.getStrand(), 1); |
147 | 1 | assertEquals(sf.getValue("note"), "pGP7-D"); |
148 | 1 | assertEquals(sf.getValue("exon number"), 1); |
149 | 1 | assertEquals(sf.getValue("product"), "hypothetical protein"); |
150 | ||
151 | /* | |
152 | * Verify DBRefs, whether declared in the file or added by Jalview. | |
153 | * There are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries | |
154 | * (some e.g. INTERPRO are duplicates). Jalview adds a dbref to 'self'. | |
155 | * Sample a few here. Note DBRefEntry constructor capitalises source. | |
156 | */ | |
157 | 1 | List<DBRefEntry> dbrefs = seq.getDBRefs(); |
158 | 1 | assertEquals(dbrefs.size(), 32); |
159 | // xref to 'self': | |
160 | 1 | DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321"); |
161 | 1 | int[] range = new int[] { 1, seq.getLength() }; |
162 | 1 | selfRef.setMap(new Mapping(null, range, range, 1, 1)); |
163 | 1 | assertTrue(dbrefs.contains(selfRef)); |
164 | ||
165 | // 1st DR line; note trailing period is removed | |
166 | 1 | assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0", |
167 | "d4c4942a634e3df4995fd5ac75c26a61"))); | |
168 | // the 4th DR line: | |
169 | 1 | assertTrue( |
170 | dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941"))); | |
171 | // from the first CDS feature | |
172 | 1 | assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19"))); |
173 | // from the last CDS feature | |
174 | 1 | assertTrue( |
175 | dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350"))); | |
176 | ||
177 | /* | |
178 | * verify mappings to, and sequences for, UNIPROT proteins | |
179 | */ | |
180 | 1 | int uniprotCount = 0; |
181 | 1 | List<int[]> ranges; |
182 | 1 | for (DBRefEntry dbref : dbrefs) |
183 | { | |
184 | 32 | if ("UNIPROT".equals(dbref.getSource())) |
185 | { | |
186 | 8 | uniprotCount++; |
187 | 8 | Mapping mapping = dbref.getMap(); |
188 | 8 | assertNotNull(mapping); |
189 | 8 | MapList map = mapping.getMap(); |
190 | 8 | String mappedToName = mapping.getTo().getName(); |
191 | 8 | if ("UNIPROT|P0CE16".equals(mappedToName)) |
192 | { | |
193 | 1 | assertEquals((ranges = map.getFromRanges()).size(), 1); |
194 | 1 | assertEquals(ranges.get(0)[0], 1579); |
195 | 1 | assertEquals(ranges.get(0)[1], 2931); // excludes stop 2934 |
196 | 1 | assertEquals((ranges = map.getToRanges()).size(), 1); |
197 | 1 | assertEquals(ranges.get(0)[0], 1); |
198 | 1 | assertEquals(ranges.get(0)[1], 451); |
199 | // CDS /product carries over as protein product description | |
200 | 1 | assertEquals(mapping.getTo().getDescription(), |
201 | "hypothetical protein"); | |
202 | } | |
203 | 7 | else if ("UNIPROT|P0CE17".equals(mappedToName)) |
204 | { | |
205 | 1 | assertEquals((ranges = map.getFromRanges()).size(), 1); |
206 | 1 | assertEquals(ranges.get(0)[0], 2928); |
207 | 1 | assertEquals(ranges.get(0)[1], 3989); // excludes stop 3992 |
208 | 1 | assertEquals((ranges = map.getToRanges()).size(), 1); |
209 | 1 | assertEquals(ranges.get(0)[0], 1); |
210 | 1 | assertEquals(ranges.get(0)[1], 354); |
211 | } | |
212 | 6 | else if ("UNIPROT|P0CE18".equals(mappedToName)) |
213 | { | |
214 | 1 | assertEquals((ranges = map.getFromRanges()).size(), 1); |
215 | 1 | assertEquals(ranges.get(0)[0], 4054); |
216 | 1 | assertEquals(ranges.get(0)[1], 4845); // excludes stop 4848 |
217 | 1 | assertEquals((ranges = map.getToRanges()).size(), 1); |
218 | 1 | assertEquals(ranges.get(0)[0], 1); |
219 | 1 | assertEquals(ranges.get(0)[1], 264); |
220 | } | |
221 | 5 | else if ("UNIPROT|P0CE19".equals(mappedToName)) |
222 | { | |
223 | // join(7022..7502,1..437) | |
224 | 1 | assertEquals((ranges = map.getFromRanges()).size(), 2); |
225 | 1 | assertEquals(ranges.get(0)[0], 7022); |
226 | 1 | assertEquals(ranges.get(0)[1], 7502); |
227 | 1 | assertEquals(ranges.get(1)[0], 1); |
228 | 1 | assertEquals(ranges.get(1)[1], 434); // excludes stop at 437 |
229 | 1 | assertEquals((ranges = map.getToRanges()).size(), 1); |
230 | 1 | assertEquals(ranges.get(0)[0], 1); |
231 | 1 | assertEquals(ranges.get(0)[1], 305); |
232 | } | |
233 | 4 | else if ("UNIPROT|P0CE20".equals(mappedToName)) |
234 | { | |
235 | // complement(488..1480) | |
236 | 1 | assertEquals((ranges = map.getFromRanges()).size(), 1); |
237 | 1 | assertEquals(ranges.get(0)[0], 1480); |
238 | 1 | assertEquals(ranges.get(0)[1], 491); // excludes stop at 488 |
239 | 1 | assertEquals((ranges = map.getToRanges()).size(), 1); |
240 | 1 | assertEquals(ranges.get(0)[0], 1); |
241 | 1 | assertEquals(ranges.get(0)[1], 330); |
242 | } | |
243 | 3 | else if (!"UNIPROT|P0CE23".equals(mappedToName) |
244 | && !"UNIPROT|P10559".equals(mappedToName) | |
245 | && !"UNIPROT|P10560".equals(mappedToName)) | |
246 | { | |
247 | 0 | fail("Unexpected UNIPROT dbref to " + mappedToName); |
248 | } | |
249 | } | |
250 | } | |
251 | 1 | assertEquals(uniprotCount, 8); |
252 | } | |
253 | ||
254 | /** | |
255 | * Parses the RNA test file J03321_rna.embl.txt and checks that the first parsed | |
256 | * sequence contains at least one 'u' residue | |
257 | * | |
258 | * @throws MalformedURLException | |
259 | * @throws IOException | |
260 | */ | |
261 | 1 | @Test(groups = "Functional") |
262 | public void testParseToRNA() throws MalformedURLException, IOException | |
263 | { | |
264 | 1 | File dataFile = new File("test/jalview/io/J03321_rna.embl.txt"); |
265 | 1 | FileParse fp = new FileParse(dataFile, DataSourceType.FILE); |
266 | 1 | EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); |
267 | 1 | List<SequenceI> seqs = parser.getSeqs(); |
268 | 1 | assertTrue(seqs.get(0).getSequenceAsString().indexOf("u") > -1); |
269 | } | |
270 | ||
271 | 1 | @Test(groups = "Functional") |
272 | public void testParse_codonStartNot1() | |
273 | { | |
274 | // TODO verify CDS-to-protein mapping for CDS with /codon_start=2 | |
275 | // example: https://www.ebi.ac.uk/ena/browser/api/embl/EU498516 | |
276 | } | |
277 | ||
278 | /** | |
279 | * Test for the case that the EMBL CDS has no UNIPROT xref. In this case | |
280 | * Jalview should synthesize an xref to EMBLCDSPROTEIN in the hope this will | |
281 | * allow Get Cross-References. | |
282 | * | |
283 | * @throws IOException | |
284 | */ | |
285 | 1 | @Test(groups = "Functional") |
286 | public void testParse_noUniprotXref() throws IOException | |
287 | { | |
288 | // MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes | |
289 | // plus an additional (invented) test case: | |
290 | // - multi-line /product qualifier including escaped quotes | |
291 | 1 | String data = "ID MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n" |
292 | + "DE Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n" | |
293 | + "FT CDS 3..17\n" | |
294 | + "FT /protein_id=\"QHD43415.1\"\n" | |
295 | + "FT /product=\"orf1ab polyprotein\n" | |
296 | + "FT \"\"foobar\"\" \"\n" | |
297 | + "FT /translation=\"MRKLD\n" | |
298 | + "SQ Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n" | |
299 | + " ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga 40\n"; | |
300 | 1 | FileParse fp = new FileParse(data, DataSourceType.PASTE); |
301 | 1 | EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest"); |
302 | 1 | List<SequenceI> seqs = parser.getSeqs(); |
303 | 1 | assertEquals(seqs.size(), 1); |
304 | 1 | SequenceI seq = seqs.get(0); |
305 | 1 | DBModList<DBRefEntry> dbrefs = seq.getDBRefs(); |
306 | ||
307 | /* | |
308 | * dna should have dbref to itself, and to inferred EMBLCDSPROTEIN:QHD43415.1 | |
309 | */ | |
310 | 1 | assertEquals(dbrefs.size(), 2); |
311 | ||
312 | // dbref to self | |
313 | 1 | DBRefEntry dbref = dbrefs.get(0); |
314 | 1 | assertEquals(dbref.getSource(), "EMBLTEST"); |
315 | 1 | assertEquals(dbref.getAccessionId(), "MN908947"); |
316 | 1 | Mapping mapping = dbref.getMap(); |
317 | 1 | assertNull(mapping.getTo()); |
318 | 1 | MapList map = mapping.getMap(); |
319 | 1 | assertEquals(map.getFromLowest(), 1); |
320 | 1 | assertEquals(map.getFromHighest(), 40); |
321 | 1 | assertEquals(map.getToLowest(), 1); |
322 | 1 | assertEquals(map.getToHighest(), 40); |
323 | 1 | assertEquals(map.getFromRatio(), 1); |
324 | 1 | assertEquals(map.getToRatio(), 1); |
325 | ||
326 | // dbref to inferred EMBLCDSPROTEIN: | |
327 | 1 | dbref = dbrefs.get(1); |
328 | 1 | assertEquals(dbref.getSource(), "EMBLCDSPROTEIN"); |
329 | 1 | assertEquals(dbref.getAccessionId(), "QHD43415.1"); |
330 | 1 | mapping = dbref.getMap(); |
331 | 1 | SequenceI mapTo = mapping.getTo(); |
332 | 1 | assertEquals(mapTo.getName(), "QHD43415.1"); |
333 | // the /product qualifier transfers to protein product description | |
334 | 1 | assertEquals(mapTo.getDescription(), "orf1ab polyprotein \"foobar\""); |
335 | 1 | assertEquals(mapTo.getSequenceAsString(), "MRKLD"); |
336 | 1 | map = mapping.getMap(); |
337 | 1 | assertEquals(map.getFromLowest(), 3); |
338 | 1 | assertEquals(map.getFromHighest(), 17); |
339 | 1 | assertEquals(map.getToLowest(), 1); |
340 | 1 | assertEquals(map.getToHighest(), 5); |
341 | 1 | assertEquals(map.getFromRatio(), 3); |
342 | 1 | assertEquals(map.getToRatio(), 1); |
343 | } | |
344 | ||
345 | 1 | @Test(groups = "Functional") |
346 | public void testAdjustForProteinLength() | |
347 | { | |
348 | 1 | int[] exons = new int[] { 11, 15, 21, 25, 31, 38 }; // 18 bp |
349 | ||
350 | // exact length match: | |
351 | 1 | assertSame(exons, EmblFlatFile.adjustForProteinLength(6, exons)); |
352 | ||
353 | // patch from JAL-3725 in EmblXmlSource propagated to Flatfile | |
354 | // match if we assume exons include stop codon not in protein: | |
355 | 1 | int[] truncated = EmblFlatFile.adjustForProteinLength(5, exons); |
356 | 1 | assertEquals(Arrays.toString(truncated), "[11, 15, 21, 25, 31, 35]"); |
357 | ||
358 | // truncate last exon by 6bp | |
359 | 1 | truncated = EmblFlatFile.adjustForProteinLength(4, exons); |
360 | 1 | assertEquals(Arrays.toString(truncated), "[11, 15, 21, 25, 31, 32]"); |
361 | ||
362 | // remove last exon and truncate preceding by 1bp (leaving 9bp, i.e. 3 codons) | |
363 | 1 | truncated = EmblFlatFile.adjustForProteinLength(3, exons); |
364 | 1 | assertEquals(Arrays.toString(truncated), "[11, 15, 21, 24]"); |
365 | ||
366 | // exact removal of exon case: | |
367 | 1 | exons = new int[] { 11, 15, 21, 27, 33, 38 }; // 18 bp |
368 | 1 | truncated = EmblFlatFile.adjustForProteinLength(4, exons); |
369 | 1 | assertEquals(Arrays.toString(truncated), "[11, 15, 21, 27]"); |
370 | ||
371 | // what if exons are too short for protein? | |
372 | 1 | truncated = EmblFlatFile.adjustForProteinLength(7, exons); |
373 | 1 | assertSame(exons, truncated); |
374 | } | |
375 | ||
376 | 1 | @Test(groups = "Functional") |
377 | public void testRemoveQuotes() | |
378 | { | |
379 | 1 | assertNull(EmblFlatFile.removeQuotes(null)); |
380 | 1 | assertEquals(EmblFlatFile.removeQuotes("No quotes here"), |
381 | "No quotes here"); | |
382 | 1 | assertEquals(EmblFlatFile.removeQuotes("\"Enclosing quotes\""), |
383 | "Enclosing quotes"); | |
384 | 1 | assertEquals( |
385 | EmblFlatFile.removeQuotes("\"Escaped \"\"quotes\"\" example\""), | |
386 | "Escaped \"quotes\" example"); | |
387 | } | |
388 | } |