Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
HtsContigDb | 52 | 54 | 28 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.ext.htsjdk; | |
22 | ||
23 | import jalview.datamodel.Sequence; | |
24 | import jalview.datamodel.SequenceI; | |
25 | ||
26 | import java.io.File; | |
27 | import java.io.IOException; | |
28 | import java.math.BigInteger; | |
29 | import java.nio.file.Path; | |
30 | import java.security.MessageDigest; | |
31 | import java.security.NoSuchAlgorithmException; | |
32 | import java.util.ArrayList; | |
33 | import java.util.HashSet; | |
34 | import java.util.List; | |
35 | import java.util.Set; | |
36 | ||
37 | import htsjdk.samtools.SAMException; | |
38 | import htsjdk.samtools.SAMSequenceDictionary; | |
39 | import htsjdk.samtools.SAMSequenceRecord; | |
40 | import htsjdk.samtools.reference.FastaSequenceIndexCreator; | |
41 | import htsjdk.samtools.reference.ReferenceSequence; | |
42 | import htsjdk.samtools.reference.ReferenceSequenceFile; | |
43 | import htsjdk.samtools.reference.ReferenceSequenceFileFactory; | |
44 | import htsjdk.samtools.util.StringUtil; | |
45 | ||
46 | /** | |
47 | * a source of sequence data accessed via the HTSJDK | |
48 | * | |
49 | * @author jprocter | |
50 | * | |
51 | */ | |
52 | public class HtsContigDb | |
53 | { | |
54 | private String name; | |
55 | ||
56 | private File dbLocation; | |
57 | ||
58 | private htsjdk.samtools.reference.ReferenceSequenceFile refFile = null; | |
59 | ||
60 | 2 | public static void createFastaSequenceIndex(Path path, boolean overwrite) |
61 | throws IOException | |
62 | { | |
63 | 2 | try |
64 | { | |
65 | 2 | FastaSequenceIndexCreator.create(path, overwrite); |
66 | } catch (SAMException e) | |
67 | { | |
68 | 1 | throw new IOException(e.getMessage()); |
69 | } | |
70 | } | |
71 | ||
72 | 7 | public HtsContigDb(String name, File descriptor) |
73 | { | |
74 | 7 | if (descriptor.isFile()) |
75 | { | |
76 | 7 | this.name = name; |
77 | 7 | dbLocation = descriptor; |
78 | } | |
79 | 7 | initSource(); |
80 | } | |
81 | ||
82 | 5 | public void close() |
83 | { | |
84 | 5 | if (refFile != null) |
85 | { | |
86 | 5 | try |
87 | { | |
88 | 5 | refFile.close(); |
89 | } catch (IOException e) | |
90 | { | |
91 | // ignore | |
92 | } | |
93 | } | |
94 | } | |
95 | ||
96 | 7 | private void initSource() |
97 | { | |
98 | 7 | if (refFile != null) |
99 | { | |
100 | 0 | return; |
101 | } | |
102 | ||
103 | 7 | refFile = ReferenceSequenceFileFactory |
104 | .getReferenceSequenceFile(dbLocation, true); | |
105 | 6 | if (refFile == null || refFile.getSequenceDictionary() == null) |
106 | { | |
107 | // refFile = initSequenceDictionaryFor(dbLocation); | |
108 | } | |
109 | ||
110 | } | |
111 | ||
112 | SAMSequenceDictionary rrefDict = null; | |
113 | ||
114 | 0 | private ReferenceSequenceFile initSequenceDictionaryFor(File dbLocation2) |
115 | throws Exception | |
116 | { | |
117 | 0 | rrefDict = getDictionary(dbLocation2, true); |
118 | 0 | if (rrefDict != null) |
119 | { | |
120 | 0 | ReferenceSequenceFile rrefFile = ReferenceSequenceFileFactory |
121 | .getReferenceSequenceFile(dbLocation2, true); | |
122 | 0 | return rrefFile; |
123 | } | |
124 | 0 | return null; |
125 | } | |
126 | ||
127 | /** | |
128 | * code below hacked out from picard ---- | |
129 | * | |
130 | * picard/src/java/picard/sam/CreateSequenceDictionary.java | |
131 | * https://github.com/ | |
132 | * broadinstitute/picard/commit/270580d3e28123496576f0b91b3433179bb5d876 | |
133 | */ | |
134 | ||
135 | /* | |
136 | * The MIT License | |
137 | * | |
138 | * Copyright (c) 2009 The Broad Institute | |
139 | * | |
140 | * Permission is hereby granted, free of charge, to any person obtaining a | |
141 | * copy of this software and associated documentation files (the "Software"), | |
142 | * to deal in the Software without restriction, including without limitation | |
143 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
144 | * and/or sell copies of the Software, and to permit persons to whom the | |
145 | * Software is furnished to do so, subject to the following conditions: | |
146 | * | |
147 | * The above copyright notice and this permission notice shall be included in | |
148 | * all copies or substantial portions of the Software. | |
149 | * | |
150 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
151 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
152 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
153 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
154 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
155 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
156 | * DEALINGS IN THE SOFTWARE. | |
157 | */ | |
158 | /** | |
159 | * | |
160 | * @param f | |
161 | * @param truncate | |
162 | * @return | |
163 | * @throws Exception | |
164 | */ | |
165 | 0 | SAMSequenceDictionary getDictionary(File f, boolean truncate) |
166 | throws Exception | |
167 | { | |
168 | 0 | if (md5 == null) |
169 | { | |
170 | 0 | initCreateSequenceDictionary(); |
171 | } | |
172 | 0 | final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory |
173 | .getReferenceSequenceFile(f, truncate); | |
174 | 0 | ReferenceSequence refSeq; |
175 | 0 | List<SAMSequenceRecord> ret = new ArrayList<>(); |
176 | 0 | Set<String> sequenceNames = new HashSet<>(); |
177 | 0 | for (int numSequences = 0; (refSeq = refSeqFile |
178 | .nextSequence()) != null; ++numSequences) | |
179 | { | |
180 | 0 | if (sequenceNames.contains(refSeq.getName())) |
181 | { | |
182 | 0 | throw new Exception( |
183 | "Sequence name appears more than once in reference: " | |
184 | + refSeq.getName()); | |
185 | } | |
186 | 0 | sequenceNames.add(refSeq.getName()); |
187 | 0 | ret.add(makeSequenceRecord(refSeq)); |
188 | } | |
189 | 0 | return new SAMSequenceDictionary(ret); |
190 | } | |
191 | ||
192 | 8 | public boolean isValid() |
193 | { | |
194 | 8 | return dbLocation != null && refFile != null; |
195 | } | |
196 | ||
197 | /** | |
198 | * Create one SAMSequenceRecord from a single fasta sequence | |
199 | */ | |
200 | 0 | private SAMSequenceRecord makeSequenceRecord( |
201 | final ReferenceSequence refSeq) | |
202 | { | |
203 | ||
204 | 0 | final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), |
205 | refSeq.length()); | |
206 | ||
207 | // Compute MD5 of upcased bases | |
208 | 0 | final byte[] bases = refSeq.getBases(); |
209 | 0 | for (int i = 0; i < bases.length; ++i) |
210 | { | |
211 | 0 | bases[i] = StringUtil.toUpperCase(bases[i]); |
212 | } | |
213 | ||
214 | 0 | ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); |
215 | // if (GENOME_ASSEMBLY != null) { | |
216 | // ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); | |
217 | // } | |
218 | // ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); | |
219 | // if (SPECIES != null) { | |
220 | // ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); | |
221 | // } | |
222 | 0 | return ret; |
223 | } | |
224 | ||
225 | private MessageDigest md5; | |
226 | ||
227 | 0 | public void initCreateSequenceDictionary() throws Exception |
228 | { | |
229 | 0 | try |
230 | { | |
231 | 0 | md5 = MessageDigest.getInstance("MD5"); |
232 | } catch (NoSuchAlgorithmException e) | |
233 | { | |
234 | 0 | throw new Exception("MD5 algorithm not found", e); |
235 | } | |
236 | } | |
237 | ||
238 | 0 | private String md5Hash(final byte[] bytes) |
239 | { | |
240 | 0 | md5.reset(); |
241 | 0 | md5.update(bytes); |
242 | 0 | String s = new BigInteger(1, md5.digest()).toString(16); |
243 | 0 | if (s.length() != 32) |
244 | { | |
245 | 0 | final String zeros = "00000000000000000000000000000000"; |
246 | 0 | s = zeros.substring(0, 32 - s.length()) + s; |
247 | } | |
248 | 0 | return s; |
249 | } | |
250 | ||
251 | // ///// end of hts bits. | |
252 | ||
253 | /** | |
254 | * Reads the contig with the given id and returns as a Jalview SequenceI | |
255 | * object. Note the database must be indexed for this operation to succeed. | |
256 | * | |
257 | * @param id | |
258 | * @return | |
259 | */ | |
260 | 5 | public SequenceI getSequenceProxy(String id) |
261 | { | |
262 | 5 | if (!isValid() || !refFile.isIndexed()) |
263 | { | |
264 | 0 | jalview.bin.Console.errPrintln( |
265 | "Cannot read contig as file is invalid or not indexed"); | |
266 | 0 | return null; |
267 | } | |
268 | ||
269 | 5 | ReferenceSequence sseq = refFile.getSequence(id); |
270 | 5 | return new Sequence(sseq.getName(), new String(sseq.getBases())); |
271 | } | |
272 | ||
273 | 3 | public boolean isIndexed() |
274 | { | |
275 | 3 | return refFile != null && refFile.isIndexed(); |
276 | } | |
277 | ||
278 | } |