Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
AlignmentGenerator | 50 | 84 | 28 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import java.util.Locale; | |
24 | ||
25 | import jalview.datamodel.Alignment; | |
26 | import jalview.datamodel.AlignmentI; | |
27 | import jalview.datamodel.Sequence; | |
28 | import jalview.datamodel.SequenceI; | |
29 | import jalview.gui.JvOptionPane; | |
30 | import jalview.io.FastaFile; | |
31 | ||
32 | import java.io.File; | |
33 | import java.io.FileNotFoundException; | |
34 | import java.io.PrintStream; | |
35 | import java.util.Arrays; | |
36 | import java.util.Random; | |
37 | ||
38 | import org.testng.annotations.BeforeClass; | |
39 | ||
40 | /** | |
41 | * Generates, and outputs in Fasta format, a random peptide or nucleotide | |
42 | * alignment for given sequence length and count. Will regenerate the same | |
43 | * alignment each time if the same random seed is used (so may be used for | |
44 | * reproducible unit tests). Not guaranteed to reproduce the same results | |
45 | * between versions, as the rules may get tweaked to produce more 'realistic' | |
46 | * results. | |
47 | * | |
48 | * @author gmcarstairs | |
49 | */ | |
50 | public class AlignmentGenerator | |
51 | { | |
52 | private static final char GAP = '-'; | |
53 | ||
54 | private static final char ZERO = '0'; | |
55 | ||
56 | private static final char[] NUCS = "GTCA".toCharArray(); | |
57 | ||
58 | private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray(); | |
59 | ||
60 | private static char[] BASES; | |
61 | ||
62 | private Random random; | |
63 | ||
64 | private PrintStream ps; | |
65 | ||
66 | /** | |
67 | * Outputs a pseudo-randomly generated nucleotide or peptide alignment | |
68 | * Arguments: | |
69 | * <ul> | |
70 | * <li>n (for nucleotide) or p (for peptide)</li> | |
71 | * <li>length (number of bases in each sequence)</li> | |
72 | * <li>height (number of sequences)</li> | |
73 | * <li>a whole number random seed</li> | |
74 | * <li>percentage of gaps to include (0-100)</li> | |
75 | * <li>percentage chance of variation of each position (0-100)</li> | |
76 | * <li>(optional) path to a file to write the alignment to</li> | |
77 | * </ul> | |
78 | * | |
79 | * | |
80 | * @param args | |
81 | * @throws FileNotFoundException | |
82 | */ | |
83 | 0 | public static void main(String[] args) throws FileNotFoundException |
84 | { | |
85 | 0 | if (args.length != 6 && args.length != 7) |
86 | { | |
87 | 0 | usage(); |
88 | 0 | return; |
89 | } | |
90 | ||
91 | 0 | PrintStream ps = System.out; |
92 | 0 | if (args.length == 7) |
93 | { | |
94 | 0 | ps = new PrintStream(new File(args[6])); |
95 | } | |
96 | ||
97 | 0 | boolean nucleotide = args[0].toLowerCase(Locale.ROOT).startsWith("n"); |
98 | 0 | int width = Integer.parseInt(args[1]); |
99 | 0 | int height = Integer.parseInt(args[2]); |
100 | 0 | long randomSeed = Long.valueOf(args[3]); |
101 | 0 | int gapPercentage = Integer.valueOf(args[4]); |
102 | 0 | int changePercentage = Integer.valueOf(args[5]); |
103 | ||
104 | 0 | ps.println("; " + height + " sequences of " + width + " bases with " |
105 | + gapPercentage + "% gaps and " + changePercentage | |
106 | + "% mutations (random seed = " + randomSeed + ")"); | |
107 | ||
108 | 0 | new AlignmentGenerator(nucleotide, ps).generate(width, height, |
109 | randomSeed, gapPercentage, changePercentage); | |
110 | ||
111 | 0 | if (ps != System.out) |
112 | { | |
113 | 0 | ps.close(); |
114 | } | |
115 | } | |
116 | ||
117 | /** | |
118 | * Prints parameter help | |
119 | */ | |
120 | 0 | private static void usage() |
121 | { | |
122 | 0 | System.out.println("Usage:"); |
123 | 0 | System.out.println("arg0: n (for nucleotide) or p (for peptide)"); |
124 | 0 | System.out.println("arg1: number of (non-gap) bases per sequence"); |
125 | 0 | System.out.println("arg2: number of sequences"); |
126 | 0 | System.out.println( |
127 | "arg3: an integer as random seed (same seed = same results)"); | |
128 | 0 | System.out.println("arg4: percentage of gaps to (randomly) generate"); |
129 | 0 | System.out.println( |
130 | "arg5: percentage of 'mutations' to (randomly) generate"); | |
131 | 0 | System.out.println( |
132 | "arg6: (optional) path to output file (default is sysout)"); | |
133 | 0 | System.out.println("Example: AlignmentGenerator n 12 15 387 10 5"); |
134 | 0 | System.out.println( |
135 | "- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387"); | |
136 | ||
137 | } | |
138 | ||
139 | /** | |
140 | * Constructor that sets nucleotide or peptide symbol set, and also writes the | |
141 | * generated alignment to sysout | |
142 | */ | |
143 | 20 | public AlignmentGenerator(boolean nuc) |
144 | { | |
145 | 20 | this(nuc, System.out); |
146 | } | |
147 | ||
148 | /** | |
149 | * Constructor that sets nucleotide or peptide symbol set, and also writes the | |
150 | * generated alignment to the specified output stream (if not null). This can | |
151 | * be used to write the alignment to a file or sysout. | |
152 | */ | |
153 | 20 | public AlignmentGenerator(boolean nucleotide, PrintStream printStream) |
154 | { | |
155 | 20 | BASES = nucleotide ? NUCS : PEPS; |
156 | 20 | ps = printStream; |
157 | } | |
158 | ||
159 | /** | |
160 | * Outputs an 'alignment' of given width and height, where each position is a | |
161 | * random choice from the symbol alphabet, or - for gap | |
162 | * | |
163 | * @param width | |
164 | * @param height | |
165 | * @param randomSeed | |
166 | * @param changePercentage | |
167 | * @param gapPercentage | |
168 | */ | |
169 | 36 | public AlignmentI generate(int width, int height, long randomSeed, |
170 | int gapPercentage, int changePercentage) | |
171 | { | |
172 | 36 | SequenceI[] seqs = new SequenceI[height]; |
173 | 36 | random = new Random(randomSeed); |
174 | 36 | seqs[0] = generateSequence(1, width, gapPercentage); |
175 | 1688 | for (int seqno = 1; seqno < height; seqno++) |
176 | { | |
177 | 1652 | seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(), |
178 | seqno + 1, width, changePercentage); | |
179 | } | |
180 | 36 | AlignmentI al = new Alignment(seqs); |
181 | ||
182 | 36 | if (ps != null) |
183 | { | |
184 | 36 | ps.println(new FastaFile().print(al.getSequencesArray(), true)); |
185 | } | |
186 | ||
187 | 36 | return al; |
188 | } | |
189 | ||
190 | /** | |
191 | * Outputs a DNA 'sequence' of given length, with some random gaps included. | |
192 | * | |
193 | * @param seqno | |
194 | * @param length | |
195 | * @param gapPercentage | |
196 | */ | |
197 | 36 | private SequenceI generateSequence(int seqno, int length, |
198 | int gapPercentage) | |
199 | { | |
200 | 36 | StringBuilder seq = new StringBuilder(length); |
201 | ||
202 | /* | |
203 | * Loop till we've added 'length' bases (excluding gaps) | |
204 | */ | |
205 | 1699 | for (int count = 0; count < length;) |
206 | { | |
207 | 1663 | boolean addGap = random.nextInt(100) < gapPercentage; |
208 | 1663 | char c = addGap ? GAP |
209 | : BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length]; | |
210 | 1663 | seq.append(c); |
211 | 1663 | if (!addGap) |
212 | { | |
213 | 1618 | count++; |
214 | } | |
215 | } | |
216 | 36 | final String seqName = "SEQ" + seqno; |
217 | 36 | final String seqString = seq.toString(); |
218 | 36 | SequenceI sq = new Sequence(seqName, seqString); |
219 | 36 | sq.createDatasetSequence(); |
220 | 36 | return sq; |
221 | } | |
222 | ||
223 | /** | |
224 | * Generate a sequence approximately aligned to the first one. | |
225 | * | |
226 | * @param ds | |
227 | * @param seqno | |
228 | * @param width | |
229 | * number of bases | |
230 | * @param changePercentage | |
231 | * @return | |
232 | */ | |
233 | 1652 | private SequenceI generateAnotherSequence(char[] ds, int seqno, int width, |
234 | int changePercentage) | |
235 | { | |
236 | 1652 | int length = ds.length; |
237 | 1652 | char[] seq = new char[length]; |
238 | 1652 | Arrays.fill(seq, ZERO); |
239 | 1652 | int gapsWanted = length - width; |
240 | 1652 | int gapsAdded = 0; |
241 | ||
242 | /* | |
243 | * First 'randomly' mimic gaps in model sequence. | |
244 | */ | |
245 | 200600 | for (int pos = 0; pos < length; pos++) |
246 | { | |
247 | 198948 | if (ds[pos] == GAP) |
248 | { | |
249 | /* | |
250 | * Add a gap at the same position with changePercentage likelihood | |
251 | */ | |
252 | 7945 | seq[pos] = randomCharacter(GAP, changePercentage); |
253 | 7945 | if (seq[pos] == GAP) |
254 | { | |
255 | 7564 | gapsAdded++; |
256 | } | |
257 | } | |
258 | } | |
259 | ||
260 | /* | |
261 | * Next scatter any remaining gaps (if any) at random. This gives an even | |
262 | * distribution. | |
263 | */ | |
264 | 2033 | while (gapsAdded < gapsWanted) |
265 | { | |
266 | 381 | boolean added = false; |
267 | 772 | while (!added) |
268 | { | |
269 | 391 | int pos = random.nextInt(length); |
270 | 391 | if (seq[pos] != GAP) |
271 | { | |
272 | 381 | seq[pos] = GAP; |
273 | 381 | added = true; |
274 | 381 | gapsAdded++; |
275 | } | |
276 | } | |
277 | } | |
278 | ||
279 | /* | |
280 | * Finally fill in the rest with randomly mutated bases. | |
281 | */ | |
282 | 200600 | for (int pos = 0; pos < length; pos++) |
283 | { | |
284 | 198948 | if (seq[pos] == ZERO) |
285 | { | |
286 | 190632 | char c = randomCharacter(ds[pos], changePercentage); |
287 | 190632 | seq[pos] = c; |
288 | } | |
289 | } | |
290 | 1652 | final String seqName = "SEQ" + seqno; |
291 | 1652 | final String seqString = new String(seq); |
292 | 1652 | SequenceI sq = new Sequence(seqName, seqString); |
293 | 1652 | sq.createDatasetSequence(); |
294 | 1652 | return sq; |
295 | } | |
296 | ||
297 | /** | |
298 | * Returns a random character that is changePercentage% likely to match the | |
299 | * given type (as base or gap). | |
300 | * | |
301 | * @param changePercentage | |
302 | * | |
303 | * @param c | |
304 | * @return | |
305 | */ | |
306 | 198577 | private char randomCharacter(char c, int changePercentage) |
307 | { | |
308 | 198577 | final boolean mutation = random.nextInt(100) < changePercentage; |
309 | ||
310 | 198577 | if (!mutation) |
311 | { | |
312 | 188969 | return c; |
313 | } | |
314 | ||
315 | 9608 | char newchar = c; |
316 | 19733 | while (newchar == c) |
317 | { | |
318 | 10125 | newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length]; |
319 | } | |
320 | 9608 | return newchar; |
321 | } | |
322 | } |