1. Project Clover database Wed Nov 13 2024 18:27:33 GMT
  2. Package jalview.analysis

File AlignmentGenerator.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

38
84
8
1
322
178
28
0.33
10.5
8
3.5

Classes

Class
Line #
Actions
AlignmentGenerator 50 84 28
0.730769273.1%
 

Contributing tests

This file is covered by 17 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.util.Locale;
24   
25    import jalview.datamodel.Alignment;
26    import jalview.datamodel.AlignmentI;
27    import jalview.datamodel.Sequence;
28    import jalview.datamodel.SequenceI;
29    import jalview.gui.JvOptionPane;
30    import jalview.io.FastaFile;
31   
32    import java.io.File;
33    import java.io.FileNotFoundException;
34    import java.io.PrintStream;
35    import java.util.Arrays;
36    import java.util.Random;
37   
38    import org.testng.annotations.BeforeClass;
39   
40    /**
41    * Generates, and outputs in Fasta format, a random peptide or nucleotide
42    * alignment for given sequence length and count. Will regenerate the same
43    * alignment each time if the same random seed is used (so may be used for
44    * reproducible unit tests). Not guaranteed to reproduce the same results
45    * between versions, as the rules may get tweaked to produce more 'realistic'
46    * results.
47    *
48    * @author gmcarstairs
49    */
 
50    public class AlignmentGenerator
51    {
52    private static final char GAP = '-';
53   
54    private static final char ZERO = '0';
55   
56    private static final char[] NUCS = "GTCA".toCharArray();
57   
58    private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray();
59   
60    private static char[] BASES;
61   
62    private Random random;
63   
64    private PrintStream ps;
65   
66    /**
67    * Outputs a pseudo-randomly generated nucleotide or peptide alignment
68    * Arguments:
69    * <ul>
70    * <li>n (for nucleotide) or p (for peptide)</li>
71    * <li>length (number of bases in each sequence)</li>
72    * <li>height (number of sequences)</li>
73    * <li>a whole number random seed</li>
74    * <li>percentage of gaps to include (0-100)</li>
75    * <li>percentage chance of variation of each position (0-100)</li>
76    * <li>(optional) path to a file to write the alignment to</li>
77    * </ul>
78    *
79    *
80    * @param args
81    * @throws FileNotFoundException
82    */
 
83  0 toggle public static void main(String[] args) throws FileNotFoundException
84    {
85  0 if (args.length != 6 && args.length != 7)
86    {
87  0 usage();
88  0 return;
89    }
90   
91  0 PrintStream ps = System.out;
92  0 if (args.length == 7)
93    {
94  0 ps = new PrintStream(new File(args[6]));
95    }
96   
97  0 boolean nucleotide = args[0].toLowerCase(Locale.ROOT).startsWith("n");
98  0 int width = Integer.parseInt(args[1]);
99  0 int height = Integer.parseInt(args[2]);
100  0 long randomSeed = Long.valueOf(args[3]);
101  0 int gapPercentage = Integer.valueOf(args[4]);
102  0 int changePercentage = Integer.valueOf(args[5]);
103   
104  0 ps.println("; " + height + " sequences of " + width + " bases with "
105    + gapPercentage + "% gaps and " + changePercentage
106    + "% mutations (random seed = " + randomSeed + ")");
107   
108  0 new AlignmentGenerator(nucleotide, ps).generate(width, height,
109    randomSeed, gapPercentage, changePercentage);
110   
111  0 if (ps != System.out)
112    {
113  0 ps.close();
114    }
115    }
116   
117    /**
118    * Prints parameter help
119    */
 
120  0 toggle private static void usage()
121    {
122  0 System.out.println("Usage:");
123  0 System.out.println("arg0: n (for nucleotide) or p (for peptide)");
124  0 System.out.println("arg1: number of (non-gap) bases per sequence");
125  0 System.out.println("arg2: number of sequences");
126  0 System.out.println(
127    "arg3: an integer as random seed (same seed = same results)");
128  0 System.out.println("arg4: percentage of gaps to (randomly) generate");
129  0 System.out.println(
130    "arg5: percentage of 'mutations' to (randomly) generate");
131  0 System.out.println(
132    "arg6: (optional) path to output file (default is sysout)");
133  0 System.out.println("Example: AlignmentGenerator n 12 15 387 10 5");
134  0 System.out.println(
135    "- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
136   
137    }
138   
139    /**
140    * Constructor that sets nucleotide or peptide symbol set, and also writes the
141    * generated alignment to sysout
142    */
 
143  20 toggle public AlignmentGenerator(boolean nuc)
144    {
145  20 this(nuc, System.out);
146    }
147   
148    /**
149    * Constructor that sets nucleotide or peptide symbol set, and also writes the
150    * generated alignment to the specified output stream (if not null). This can
151    * be used to write the alignment to a file or sysout.
152    */
 
153  20 toggle public AlignmentGenerator(boolean nucleotide, PrintStream printStream)
154    {
155  20 BASES = nucleotide ? NUCS : PEPS;
156  20 ps = printStream;
157    }
158   
159    /**
160    * Outputs an 'alignment' of given width and height, where each position is a
161    * random choice from the symbol alphabet, or - for gap
162    *
163    * @param width
164    * @param height
165    * @param randomSeed
166    * @param changePercentage
167    * @param gapPercentage
168    */
 
169  36 toggle public AlignmentI generate(int width, int height, long randomSeed,
170    int gapPercentage, int changePercentage)
171    {
172  36 SequenceI[] seqs = new SequenceI[height];
173  36 random = new Random(randomSeed);
174  36 seqs[0] = generateSequence(1, width, gapPercentage);
175  1688 for (int seqno = 1; seqno < height; seqno++)
176    {
177  1652 seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
178    seqno + 1, width, changePercentage);
179    }
180  36 AlignmentI al = new Alignment(seqs);
181   
182  36 if (ps != null)
183    {
184  36 ps.println(new FastaFile().print(al.getSequencesArray(), true));
185    }
186   
187  36 return al;
188    }
189   
190    /**
191    * Outputs a DNA 'sequence' of given length, with some random gaps included.
192    *
193    * @param seqno
194    * @param length
195    * @param gapPercentage
196    */
 
197  36 toggle private SequenceI generateSequence(int seqno, int length,
198    int gapPercentage)
199    {
200  36 StringBuilder seq = new StringBuilder(length);
201   
202    /*
203    * Loop till we've added 'length' bases (excluding gaps)
204    */
205  1699 for (int count = 0; count < length;)
206    {
207  1663 boolean addGap = random.nextInt(100) < gapPercentage;
208  1663 char c = addGap ? GAP
209    : BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];
210  1663 seq.append(c);
211  1663 if (!addGap)
212    {
213  1618 count++;
214    }
215    }
216  36 final String seqName = "SEQ" + seqno;
217  36 final String seqString = seq.toString();
218  36 SequenceI sq = new Sequence(seqName, seqString);
219  36 sq.createDatasetSequence();
220  36 return sq;
221    }
222   
223    /**
224    * Generate a sequence approximately aligned to the first one.
225    *
226    * @param ds
227    * @param seqno
228    * @param width
229    * number of bases
230    * @param changePercentage
231    * @return
232    */
 
233  1652 toggle private SequenceI generateAnotherSequence(char[] ds, int seqno, int width,
234    int changePercentage)
235    {
236  1652 int length = ds.length;
237  1652 char[] seq = new char[length];
238  1652 Arrays.fill(seq, ZERO);
239  1652 int gapsWanted = length - width;
240  1652 int gapsAdded = 0;
241   
242    /*
243    * First 'randomly' mimic gaps in model sequence.
244    */
245  200600 for (int pos = 0; pos < length; pos++)
246    {
247  198948 if (ds[pos] == GAP)
248    {
249    /*
250    * Add a gap at the same position with changePercentage likelihood
251    */
252  7945 seq[pos] = randomCharacter(GAP, changePercentage);
253  7945 if (seq[pos] == GAP)
254    {
255  7564 gapsAdded++;
256    }
257    }
258    }
259   
260    /*
261    * Next scatter any remaining gaps (if any) at random. This gives an even
262    * distribution.
263    */
264  2033 while (gapsAdded < gapsWanted)
265    {
266  381 boolean added = false;
267  772 while (!added)
268    {
269  391 int pos = random.nextInt(length);
270  391 if (seq[pos] != GAP)
271    {
272  381 seq[pos] = GAP;
273  381 added = true;
274  381 gapsAdded++;
275    }
276    }
277    }
278   
279    /*
280    * Finally fill in the rest with randomly mutated bases.
281    */
282  200600 for (int pos = 0; pos < length; pos++)
283    {
284  198948 if (seq[pos] == ZERO)
285    {
286  190632 char c = randomCharacter(ds[pos], changePercentage);
287  190632 seq[pos] = c;
288    }
289    }
290  1652 final String seqName = "SEQ" + seqno;
291  1652 final String seqString = new String(seq);
292  1652 SequenceI sq = new Sequence(seqName, seqString);
293  1652 sq.createDatasetSequence();
294  1652 return sq;
295    }
296   
297    /**
298    * Returns a random character that is changePercentage% likely to match the
299    * given type (as base or gap).
300    *
301    * @param changePercentage
302    *
303    * @param c
304    * @return
305    */
 
306  198577 toggle private char randomCharacter(char c, int changePercentage)
307    {
308  198577 final boolean mutation = random.nextInt(100) < changePercentage;
309   
310  198577 if (!mutation)
311    {
312  188969 return c;
313    }
314   
315  9608 char newchar = c;
316  19733 while (newchar == c)
317    {
318  10125 newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];
319    }
320  9608 return newchar;
321    }
322    }