Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
PhylipFile | 60 | 70 | 26 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.io; | |
22 | ||
23 | import jalview.datamodel.Sequence; | |
24 | import jalview.datamodel.SequenceI; | |
25 | ||
26 | import java.io.IOException; | |
27 | ||
28 | /** | |
29 | * <p> | |
30 | * Parser and exporter for PHYLIP file format, as defined | |
31 | * <a href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in | |
32 | * the documentation</a>. The parser imports PHYLIP files in both sequential and | |
33 | * interleaved format, and (currently) exports in interleaved format (using 60 | |
34 | * characters per matrix for the sequence). | |
35 | * </p> | |
36 | * | |
37 | * <p> | |
38 | * The following assumptions have been made for input | |
39 | * <ul> | |
40 | * <li>Sequences are expressed as letters, not real numbers with decimal points | |
41 | * separated by blanks (which is a valid option according to the | |
42 | * specification)</li> | |
43 | * </ul> | |
44 | * | |
45 | * The following assumptions have been made for output | |
46 | * <ul> | |
47 | * <li>Interleaved format is used, with each matrix consisting of 60 characters; | |
48 | * </li> | |
49 | * <li>a blank line is added between each matrix;</li> | |
50 | * <li>no spacing is added between the sequence characters.</li> | |
51 | * </ul> | |
52 | * | |
53 | * | |
54 | * </p> | |
55 | * | |
56 | * @author David Corsar | |
57 | * | |
58 | * | |
59 | */ | |
60 | public class PhylipFile extends AlignFile | |
61 | { | |
62 | ||
63 | public static final String FILE_DESC = "PHYLIP"; | |
64 | ||
65 | /** | |
66 | * | |
67 | * @see {@link AlignFile#AlignFile()} | |
68 | */ | |
69 | 3 | public PhylipFile() |
70 | { | |
71 | 3 | super(); |
72 | } | |
73 | ||
74 | /** | |
75 | * | |
76 | * @param source | |
77 | * @throws IOException | |
78 | */ | |
79 | 7 | public PhylipFile(FileParse source) throws IOException |
80 | { | |
81 | 7 | super(source); |
82 | } | |
83 | ||
84 | /** | |
85 | * @param inFile | |
86 | * @param sourceType | |
87 | * @throws IOException | |
88 | * @see {@link AlignFile#AlignFile(FileParse)} | |
89 | */ | |
90 | 0 | public PhylipFile(String inFile, DataSourceType sourceType) |
91 | throws IOException | |
92 | { | |
93 | 0 | super(inFile, sourceType); |
94 | } | |
95 | ||
96 | /** | |
97 | * Parses the input source | |
98 | * | |
99 | * @see {@link AlignFile#parse()} | |
100 | */ | |
101 | 7 | @Override |
102 | public void parse() throws IOException | |
103 | { | |
104 | 7 | try |
105 | { | |
106 | // First line should contain number of species and number of | |
107 | // characters, separated by blanks | |
108 | 7 | String line = nextLine(); |
109 | 7 | String[] lineElements = line.trim().split("\\s+"); |
110 | 7 | if (lineElements.length < 2) |
111 | { | |
112 | 0 | throw new IOException( |
113 | "First line must contain the number of specifies and number of characters"); | |
114 | } | |
115 | ||
116 | 7 | int numberSpecies = Integer.parseInt(lineElements[0]), |
117 | numberCharacters = Integer.parseInt(lineElements[1]); | |
118 | ||
119 | 7 | if (numberSpecies <= 0) |
120 | { | |
121 | // there are no sequences in this file so exit a nothing to | |
122 | // parse | |
123 | 0 | return; |
124 | } | |
125 | ||
126 | 7 | SequenceI[] sequenceElements = new Sequence[numberSpecies]; |
127 | 7 | StringBuffer[] sequences = new StringBuffer[numberSpecies]; |
128 | ||
129 | // if file is in sequential format there is only one data matrix, | |
130 | // else there are multiple | |
131 | ||
132 | // read the first data matrix | |
133 | 82 | for (int i = 0; i < numberSpecies; i++) |
134 | { | |
135 | 75 | line = nextLine(); |
136 | // lines start with the name - a maximum of 10 characters | |
137 | // if less, then padded out or terminated with a tab | |
138 | 75 | String potentialName = line.substring(0, 10); |
139 | 75 | int tabIndex = potentialName.indexOf('\t'); |
140 | 75 | if (tabIndex == -1) |
141 | { | |
142 | 75 | sequenceElements[i] = parseId(validateName(potentialName)); |
143 | 75 | sequences[i] = new StringBuffer( |
144 | removeWhitespace(line.substring(10))); | |
145 | } | |
146 | else | |
147 | { | |
148 | 0 | sequenceElements[i] = parseId( |
149 | validateName(potentialName.substring(0, tabIndex))); | |
150 | 0 | sequences[i] = new StringBuffer( |
151 | removeWhitespace(line.substring(tabIndex))); | |
152 | } | |
153 | } | |
154 | ||
155 | // determine if interleaved | |
156 | 7 | if ((sequences[0]).length() != numberCharacters) |
157 | { | |
158 | // interleaved file, so have to read the remainder | |
159 | 5 | int i = 0; |
160 | 521 | for (line = nextLine(); line != null; line = nextLine()) |
161 | { | |
162 | // ignore blank lines, as defined by the specification | |
163 | 516 | if (line.length() > 0) |
164 | { | |
165 | 470 | sequences[i++].append(removeWhitespace(line)); |
166 | } | |
167 | // reached end of matrix, so get ready for the next one | |
168 | 516 | if (i == sequences.length) |
169 | { | |
170 | 46 | i = 0; |
171 | } | |
172 | } | |
173 | } | |
174 | ||
175 | // file parsed completely, now store sequences | |
176 | 82 | for (int i = 0; i < numberSpecies; i++) |
177 | { | |
178 | // first check sequence is the expected length | |
179 | 75 | if (sequences[i].length() != numberCharacters) |
180 | { | |
181 | 0 | throw new IOException(sequenceElements[i].getName() |
182 | + " sequence is incorrect length - should be " | |
183 | + numberCharacters + " but is " + sequences[i].length()); | |
184 | } | |
185 | 75 | sequenceElements[i].setSequence(sequences[i].toString()); |
186 | 75 | seqs.add(sequenceElements[i]); |
187 | } | |
188 | ||
189 | } catch (IOException e) | |
190 | { | |
191 | 0 | jalview.bin.Console.errPrintln("Exception parsing PHYLIP file " + e); |
192 | 0 | e.printStackTrace(System.err); |
193 | 0 | throw e; |
194 | } | |
195 | ||
196 | } | |
197 | ||
198 | /** | |
199 | * Removes any whitespace from txt, used to strip and spaces added to | |
200 | * sequences to improve human readability | |
201 | * | |
202 | * @param txt | |
203 | * @return | |
204 | */ | |
205 | 545 | private String removeWhitespace(String txt) |
206 | { | |
207 | 545 | return txt.replaceAll("\\s*", ""); |
208 | } | |
209 | ||
210 | /** | |
211 | * According to the specification, the name cannot have parentheses, square | |
212 | * brackets, colon, semicolon, comma | |
213 | * | |
214 | * @param name | |
215 | * @return | |
216 | * @throws IOException | |
217 | */ | |
218 | 75 | private String validateName(String name) throws IOException |
219 | { | |
220 | 75 | char[] invalidCharacters = new char[] { '(', ')', '[', ']', ':', ';', |
221 | ',' }; | |
222 | 75 | for (char c : invalidCharacters) |
223 | { | |
224 | 525 | if (name.indexOf(c) > -1) |
225 | { | |
226 | 0 | throw new IOException( |
227 | "Species name contains illegal character " + c); | |
228 | } | |
229 | } | |
230 | 75 | return name; |
231 | } | |
232 | ||
233 | /** | |
234 | * <p> | |
235 | * Prints the seqs in interleaved format, with each matrix consisting of 60 | |
236 | * characters; a blank line is added between each matrix; no spacing is added | |
237 | * between the sequence characters. | |
238 | * </p> | |
239 | * | |
240 | * | |
241 | * @see {@link AlignFile#print()} | |
242 | */ | |
243 | 3 | @Override |
244 | public String print(SequenceI[] sqs, boolean jvsuffix) | |
245 | { | |
246 | ||
247 | 3 | StringBuffer sb = new StringBuffer(Integer.toString(sqs.length)); |
248 | 3 | sb.append(" "); |
249 | // if there are no sequences, then define the number of characters as 0 | |
250 | 3 | sb.append((sqs.length > 0) ? Integer.toString(sqs[0].getLength()) : "0") |
251 | .append(newline); | |
252 | ||
253 | // Due to how IO is handled, there doesn't appear to be a way to store | |
254 | // if the original file was sequential or interleaved; if there is, then | |
255 | // use that to set the value of the following variable | |
256 | 3 | boolean sequential = false; |
257 | ||
258 | // maximum number of columns for each row of interleaved format | |
259 | 3 | int numInterleavedColumns = 60; |
260 | ||
261 | 3 | int sequenceLength = 0; |
262 | 3 | for (SequenceI s : sqs) |
263 | { | |
264 | ||
265 | // ensure name is only 10 characters | |
266 | 35 | String name = s.getName(); |
267 | 35 | if (name.length() > 10) |
268 | { | |
269 | 4 | name = name.substring(0, 10); |
270 | } | |
271 | else | |
272 | { | |
273 | // add padding 10 characters | |
274 | 31 | name = String.format("%1$-" + 10 + "s", s.getName()); |
275 | } | |
276 | 35 | sb.append(name); |
277 | ||
278 | // sequential has the entire sequence following the name | |
279 | 35 | if (sequential) |
280 | { | |
281 | 0 | sb.append(s.getSequenceAsString()); |
282 | } | |
283 | else | |
284 | { | |
285 | // Jalview ensures all sequences are of same length so no need | |
286 | // to keep track of min/max length | |
287 | 35 | sequenceLength = s.getLength(); |
288 | // interleaved breaks the sequence into chunks for | |
289 | // interleavedColumns characters | |
290 | 35 | sb.append(s.getSequence(0, |
291 | Math.min(numInterleavedColumns, sequenceLength))); | |
292 | } | |
293 | 35 | sb.append(newline); |
294 | } | |
295 | ||
296 | // add the remaining matrixes if interleaved and there is something to | |
297 | // add | |
298 | 3 | if (!sequential && sequenceLength > numInterleavedColumns) |
299 | { | |
300 | // determine number of remaining matrixes | |
301 | 3 | int numMatrics = sequenceLength / numInterleavedColumns; |
302 | 3 | if ((sequenceLength % numInterleavedColumns) > 0) |
303 | { | |
304 | 3 | numMatrics++; |
305 | } | |
306 | ||
307 | // start i = 1 as first matrix has already been printed | |
308 | 27 | for (int i = 1; i < numMatrics; i++) |
309 | { | |
310 | // add blank line to separate this matrix from previous | |
311 | 24 | sb.append(newline); |
312 | 24 | int start = i * numInterleavedColumns; |
313 | 24 | for (SequenceI s : sqs) |
314 | { | |
315 | 250 | sb.append(s.getSequence(start, |
316 | Math.min(start + numInterleavedColumns, sequenceLength))) | |
317 | .append(newline); | |
318 | } | |
319 | } | |
320 | ||
321 | } | |
322 | ||
323 | 3 | return sb.toString(); |
324 | } | |
325 | } |