Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.analysis

File AlignmentGenerator.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
19% of files have more coverage

Code metrics

38
84
8
1
320
178
28
0.33
10.5
8
3.5

Classes

Class Line # Actions
AlignmentGenerator 47 84 28 35
0.7307692 73.1%
 

Contributing tests

This file is covered by 17 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.Alignment;
24    import jalview.datamodel.AlignmentI;
25    import jalview.datamodel.Sequence;
26    import jalview.datamodel.SequenceI;
27    import jalview.gui.JvOptionPane;
28    import jalview.io.FastaFile;
29   
30    import java.io.File;
31    import java.io.FileNotFoundException;
32    import java.io.PrintStream;
33    import java.util.Arrays;
34    import java.util.Random;
35   
36    import org.testng.annotations.BeforeClass;
37   
38    /**
39    * Generates, and outputs in Fasta format, a random peptide or nucleotide alignment for given
40    * sequence length and count. Will regenerate the same alignment each time if
41    * the same random seed is used (so may be used for reproducible unit tests).
42    * Not guaranteed to reproduce the same results between versions, as the rules
43    * may get tweaked to produce more 'realistic' results.
44    *
45    * @author gmcarstairs
46    */
 
47    public class AlignmentGenerator
48    {
49    private static final char GAP = '-';
50   
51    private static final char ZERO = '0';
52   
53    private static final char[] NUCS = "GTCA".toCharArray();
54   
55    private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray();
56   
57    private static char[] BASES;
58   
59    private Random random;
60   
61    private PrintStream ps;
62   
63    /**
64    * Outputs a pseudo-randomly generated nucleotide or peptide alignment
65    * Arguments:
66    * <ul>
67    * <li>n (for nucleotide) or p (for peptide)</li>
68    * <li>length (number of bases in each sequence)</li>
69    * <li>height (number of sequences)</li>
70    * <li>a whole number random seed</li>
71    * <li>percentage of gaps to include (0-100)</li>
72    * <li>percentage chance of variation of each position (0-100)</li>
73    * <li>(optional) path to a file to write the alignment to</li>
74    * </ul>
75    *
76    *
77    * @param args
78    * @throws FileNotFoundException
79    */
 
80  0 toggle public static void main(String[] args) throws FileNotFoundException
81    {
82  0 if (args.length != 6 && args.length != 7)
83    {
84  0 usage();
85  0 return;
86    }
87   
88  0 PrintStream ps = System.out;
89  0 if (args.length == 7)
90    {
91  0 ps = new PrintStream(new File(args[6]));
92    }
93   
94  0 boolean nucleotide = args[0].toLowerCase().startsWith("n");
95  0 int width = Integer.parseInt(args[1]);
96  0 int height = Integer.parseInt(args[2]);
97  0 long randomSeed = Long.valueOf(args[3]);
98  0 int gapPercentage = Integer.valueOf(args[4]);
99  0 int changePercentage = Integer.valueOf(args[5]);
100   
101  0 ps.println("; " + height + " sequences of " + width
102    + " bases with " + gapPercentage + "% gaps and "
103    + changePercentage + "% mutations (random seed = " + randomSeed
104    + ")");
105   
106  0 new AlignmentGenerator(nucleotide, ps).generate(width, height,
107    randomSeed, gapPercentage, changePercentage);
108   
109  0 if (ps != System.out)
110    {
111  0 ps.close();
112    }
113    }
114   
115    /**
116    * Prints parameter help
117    */
 
118  0 toggle private static void usage()
119    {
120  0 System.out.println("Usage:");
121  0 System.out.println("arg0: n (for nucleotide) or p (for peptide)");
122  0 System.out.println("arg1: number of (non-gap) bases per sequence");
123  0 System.out.println("arg2: number of sequences");
124  0 System.out
125    .println("arg3: an integer as random seed (same seed = same results)");
126  0 System.out.println("arg4: percentage of gaps to (randomly) generate");
127  0 System.out
128    .println("arg5: percentage of 'mutations' to (randomly) generate");
129  0 System.out
130    .println("arg6: (optional) path to output file (default is sysout)");
131  0 System.out.println("Example: AlignmentGenerator n 12 15 387 10 5");
132  0 System.out
133    .println("- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
134   
135    }
136   
137    /**
138    * Constructor that sets nucleotide or peptide symbol set, and also writes the
139    * generated alignment to sysout
140    */
 
141  16 toggle public AlignmentGenerator(boolean nuc)
142    {
143  16 this(nuc, System.out);
144    }
145   
146    /**
147    * Constructor that sets nucleotide or peptide symbol set, and also writes the
148    * generated alignment to the specified output stream (if not null). This can
149    * be used to write the alignment to a file or sysout.
150    */
 
151  16 toggle public AlignmentGenerator(boolean nucleotide, PrintStream printStream)
152    {
153  16 BASES = nucleotide ? NUCS : PEPS;
154  16 ps = printStream;
155    }
156   
157    /**
158    * Outputs an 'alignment' of given width and height, where each position is a
159    * random choice from the symbol alphabet, or - for gap
160    *
161    * @param width
162    * @param height
163    * @param randomSeed
164    * @param changePercentage
165    * @param gapPercentage
166    */
 
167  29 toggle public AlignmentI generate(int width, int height, long randomSeed,
168    int gapPercentage, int changePercentage)
169    {
170  29 SequenceI[] seqs = new SequenceI[height];
171  29 random = new Random(randomSeed);
172  29 seqs[0] = generateSequence(1, width, gapPercentage);
173  1577 for (int seqno = 1; seqno < height; seqno++)
174    {
175  1548 seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
176    seqno + 1, width, changePercentage);
177    }
178  29 AlignmentI al = new Alignment(seqs);
179   
180  29 if (ps != null)
181    {
182  29 ps.println(new FastaFile().print(al.getSequencesArray(), true));
183    }
184   
185  29 return al;
186    }
187   
188    /**
189    * Outputs a DNA 'sequence' of given length, with some random gaps included.
190    *
191    * @param seqno
192    * @param length
193    * @param gapPercentage
194    */
 
195  29 toggle private SequenceI generateSequence(int seqno, int length,
196    int gapPercentage)
197    {
198  29 StringBuilder seq = new StringBuilder(length);
199   
200    /*
201    * Loop till we've added 'length' bases (excluding gaps)
202    */
203  1588 for (int count = 0; count < length;)
204    {
205  1559 boolean addGap = random.nextInt(100) < gapPercentage;
206  1559 char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE)
207    % BASES.length];
208  1559 seq.append(c);
209  1559 if (!addGap)
210    {
211  1517 count++;
212    }
213    }
214  29 final String seqName = "SEQ" + seqno;
215  29 final String seqString = seq.toString();
216  29 SequenceI sq = new Sequence(seqName, seqString);
217  29 sq.createDatasetSequence();
218  29 return sq;
219    }
220   
221    /**
222    * Generate a sequence approximately aligned to the first one.
223    *
224    * @param ds
225    * @param seqno
226    * @param width
227    * number of bases
228    * @param changePercentage
229    * @return
230    */
 
231  1548 toggle private SequenceI generateAnotherSequence(char[] ds, int seqno,
232    int width, int changePercentage)
233    {
234  1548 int length = ds.length;
235  1548 char[] seq = new char[length];
236  1548 Arrays.fill(seq, ZERO);
237  1548 int gapsWanted = length - width;
238  1548 int gapsAdded = 0;
239   
240    /*
241    * First 'randomly' mimic gaps in model sequence.
242    */
243  198368 for (int pos = 0; pos < length; pos++)
244    {
245  196820 if (ds[pos] == GAP)
246    {
247    /*
248    * Add a gap at the same position with changePercentage likelihood
249    */
250  7858 seq[pos] = randomCharacter(GAP, changePercentage);
251  7858 if (seq[pos] == GAP)
252    {
253  7483 gapsAdded++;
254    }
255    }
256    }
257   
258    /*
259    * Next scatter any remaining gaps (if any) at random. This gives an even
260    * distribution.
261    */
262  1923 while (gapsAdded < gapsWanted)
263    {
264  375 boolean added = false;
265  760 while (!added)
266    {
267  385 int pos = random.nextInt(length);
268  385 if (seq[pos] != GAP)
269    {
270  375 seq[pos] = GAP;
271  375 added = true;
272  375 gapsAdded++;
273    }
274    }
275    }
276   
277    /*
278    * Finally fill in the rest with randomly mutated bases.
279    */
280  198368 for (int pos = 0; pos < length; pos++)
281    {
282  196820 if (seq[pos] == ZERO)
283    {
284  188594 char c = randomCharacter(ds[pos], changePercentage);
285  188594 seq[pos] = c;
286    }
287    }
288  1548 final String seqName = "SEQ" + seqno;
289  1548 final String seqString = new String(seq);
290  1548 SequenceI sq = new Sequence(seqName, seqString);
291  1548 sq.createDatasetSequence();
292  1548 return sq;
293    }
294   
295    /**
296    * Returns a random character that is changePercentage% likely to match the
297    * given type (as base or gap).
298    *
299    * @param changePercentage
300    *
301    * @param c
302    * @return
303    */
 
304  196452 toggle private char randomCharacter(char c, int changePercentage)
305    {
306  196452 final boolean mutation = random.nextInt(100) < changePercentage;
307   
308  196452 if (!mutation)
309    {
310  186951 return c;
311    }
312   
313  9501 char newchar = c;
314  19518 while (newchar == c)
315    {
316  10017 newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];
317    }
318  9501 return newchar;
319    }
320    }