Clover icon

Coverage Report

  1. Project Clover database Thu Nov 28 2024 11:45:30 GMT
  2. Package jalview.io

File PhylipFile.java

 

Coverage histogram

../../img/srcFileCovDistChart9.png
13% of files have more coverage

Code metrics

34
70
7
1
325
166
26
0.37
10
7
3.71

Classes

Class Line # Actions
PhylipFile 60 70 26
0.8108108 81.1%
 

Contributing tests

This file is covered by 5 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.io;
22   
23    import jalview.datamodel.Sequence;
24    import jalview.datamodel.SequenceI;
25   
26    import java.io.IOException;
27   
28    /**
29    * <p>
30    * Parser and exporter for PHYLIP file format, as defined
31    * <a href="http://evolution.genetics.washington.edu/phylip/doc/main.html">in
32    * the documentation</a>. The parser imports PHYLIP files in both sequential and
33    * interleaved format, and (currently) exports in interleaved format (using 60
34    * characters per matrix for the sequence).
35    * </p>
36    *
37    * <p>
38    * The following assumptions have been made for input
39    * <ul>
40    * <li>Sequences are expressed as letters, not real numbers with decimal points
41    * separated by blanks (which is a valid option according to the
42    * specification)</li>
43    * </ul>
44    *
45    * The following assumptions have been made for output
46    * <ul>
47    * <li>Interleaved format is used, with each matrix consisting of 60 characters;
48    * </li>
49    * <li>a blank line is added between each matrix;</li>
50    * <li>no spacing is added between the sequence characters.</li>
51    * </ul>
52    *
53    *
54    * </p>
55    *
56    * @author David Corsar
57    *
58    *
59    */
 
60    public class PhylipFile extends AlignFile
61    {
62   
63    public static final String FILE_DESC = "PHYLIP";
64   
65    /**
66    *
67    * @see {@link AlignFile#AlignFile()}
68    */
 
69  3 toggle public PhylipFile()
70    {
71  3 super();
72    }
73   
74    /**
75    *
76    * @param source
77    * @throws IOException
78    */
 
79  7 toggle public PhylipFile(FileParse source) throws IOException
80    {
81  7 super(source);
82    }
83   
84    /**
85    * @param inFile
86    * @param sourceType
87    * @throws IOException
88    * @see {@link AlignFile#AlignFile(FileParse)}
89    */
 
90  0 toggle public PhylipFile(String inFile, DataSourceType sourceType)
91    throws IOException
92    {
93  0 super(inFile, sourceType);
94    }
95   
96    /**
97    * Parses the input source
98    *
99    * @see {@link AlignFile#parse()}
100    */
 
101  7 toggle @Override
102    public void parse() throws IOException
103    {
104  7 try
105    {
106    // First line should contain number of species and number of
107    // characters, separated by blanks
108  7 String line = nextLine();
109  7 String[] lineElements = line.trim().split("\\s+");
110  7 if (lineElements.length < 2)
111    {
112  0 throw new IOException(
113    "First line must contain the number of specifies and number of characters");
114    }
115   
116  7 int numberSpecies = Integer.parseInt(lineElements[0]),
117    numberCharacters = Integer.parseInt(lineElements[1]);
118   
119  7 if (numberSpecies <= 0)
120    {
121    // there are no sequences in this file so exit a nothing to
122    // parse
123  0 return;
124    }
125   
126  7 SequenceI[] sequenceElements = new Sequence[numberSpecies];
127  7 StringBuffer[] sequences = new StringBuffer[numberSpecies];
128   
129    // if file is in sequential format there is only one data matrix,
130    // else there are multiple
131   
132    // read the first data matrix
133  82 for (int i = 0; i < numberSpecies; i++)
134    {
135  75 line = nextLine();
136    // lines start with the name - a maximum of 10 characters
137    // if less, then padded out or terminated with a tab
138  75 String potentialName = line.substring(0, 10);
139  75 int tabIndex = potentialName.indexOf('\t');
140  75 if (tabIndex == -1)
141    {
142  75 sequenceElements[i] = parseId(validateName(potentialName));
143  75 sequences[i] = new StringBuffer(
144    removeWhitespace(line.substring(10)));
145    }
146    else
147    {
148  0 sequenceElements[i] = parseId(
149    validateName(potentialName.substring(0, tabIndex)));
150  0 sequences[i] = new StringBuffer(
151    removeWhitespace(line.substring(tabIndex)));
152    }
153    }
154   
155    // determine if interleaved
156  7 if ((sequences[0]).length() != numberCharacters)
157    {
158    // interleaved file, so have to read the remainder
159  5 int i = 0;
160  521 for (line = nextLine(); line != null; line = nextLine())
161    {
162    // ignore blank lines, as defined by the specification
163  516 if (line.length() > 0)
164    {
165  470 sequences[i++].append(removeWhitespace(line));
166    }
167    // reached end of matrix, so get ready for the next one
168  516 if (i == sequences.length)
169    {
170  46 i = 0;
171    }
172    }
173    }
174   
175    // file parsed completely, now store sequences
176  82 for (int i = 0; i < numberSpecies; i++)
177    {
178    // first check sequence is the expected length
179  75 if (sequences[i].length() != numberCharacters)
180    {
181  0 throw new IOException(sequenceElements[i].getName()
182    + " sequence is incorrect length - should be "
183    + numberCharacters + " but is " + sequences[i].length());
184    }
185  75 sequenceElements[i].setSequence(sequences[i].toString());
186  75 seqs.add(sequenceElements[i]);
187    }
188   
189    } catch (IOException e)
190    {
191  0 jalview.bin.Console.errPrintln("Exception parsing PHYLIP file " + e);
192  0 e.printStackTrace(System.err);
193  0 throw e;
194    }
195   
196    }
197   
198    /**
199    * Removes any whitespace from txt, used to strip and spaces added to
200    * sequences to improve human readability
201    *
202    * @param txt
203    * @return
204    */
 
205  545 toggle private String removeWhitespace(String txt)
206    {
207  545 return txt.replaceAll("\\s*", "");
208    }
209   
210    /**
211    * According to the specification, the name cannot have parentheses, square
212    * brackets, colon, semicolon, comma
213    *
214    * @param name
215    * @return
216    * @throws IOException
217    */
 
218  75 toggle private String validateName(String name) throws IOException
219    {
220  75 char[] invalidCharacters = new char[] { '(', ')', '[', ']', ':', ';',
221    ',' };
222  75 for (char c : invalidCharacters)
223    {
224  525 if (name.indexOf(c) > -1)
225    {
226  0 throw new IOException(
227    "Species name contains illegal character " + c);
228    }
229    }
230  75 return name;
231    }
232   
233    /**
234    * <p>
235    * Prints the seqs in interleaved format, with each matrix consisting of 60
236    * characters; a blank line is added between each matrix; no spacing is added
237    * between the sequence characters.
238    * </p>
239    *
240    *
241    * @see {@link AlignFile#print()}
242    */
 
243  3 toggle @Override
244    public String print(SequenceI[] sqs, boolean jvsuffix)
245    {
246   
247  3 StringBuffer sb = new StringBuffer(Integer.toString(sqs.length));
248  3 sb.append(" ");
249    // if there are no sequences, then define the number of characters as 0
250  3 sb.append((sqs.length > 0) ? Integer.toString(sqs[0].getLength()) : "0")
251    .append(newline);
252   
253    // Due to how IO is handled, there doesn't appear to be a way to store
254    // if the original file was sequential or interleaved; if there is, then
255    // use that to set the value of the following variable
256  3 boolean sequential = false;
257   
258    // maximum number of columns for each row of interleaved format
259  3 int numInterleavedColumns = 60;
260   
261  3 int sequenceLength = 0;
262  3 for (SequenceI s : sqs)
263    {
264   
265    // ensure name is only 10 characters
266  35 String name = s.getName();
267  35 if (name.length() > 10)
268    {
269  4 name = name.substring(0, 10);
270    }
271    else
272    {
273    // add padding 10 characters
274  31 name = String.format("%1$-" + 10 + "s", s.getName());
275    }
276  35 sb.append(name);
277   
278    // sequential has the entire sequence following the name
279  35 if (sequential)
280    {
281  0 sb.append(s.getSequenceAsString());
282    }
283    else
284    {
285    // Jalview ensures all sequences are of same length so no need
286    // to keep track of min/max length
287  35 sequenceLength = s.getLength();
288    // interleaved breaks the sequence into chunks for
289    // interleavedColumns characters
290  35 sb.append(s.getSequence(0,
291    Math.min(numInterleavedColumns, sequenceLength)));
292    }
293  35 sb.append(newline);
294    }
295   
296    // add the remaining matrixes if interleaved and there is something to
297    // add
298  3 if (!sequential && sequenceLength > numInterleavedColumns)
299    {
300    // determine number of remaining matrixes
301  3 int numMatrics = sequenceLength / numInterleavedColumns;
302  3 if ((sequenceLength % numInterleavedColumns) > 0)
303    {
304  3 numMatrics++;
305    }
306   
307    // start i = 1 as first matrix has already been printed
308  27 for (int i = 1; i < numMatrics; i++)
309    {
310    // add blank line to separate this matrix from previous
311  24 sb.append(newline);
312  24 int start = i * numInterleavedColumns;
313  24 for (SequenceI s : sqs)
314    {
315  250 sb.append(s.getSequence(start,
316    Math.min(start + numInterleavedColumns, sequenceLength)))
317    .append(newline);
318    }
319    }
320   
321    }
322   
323  3 return sb.toString();
324    }
325    }