Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
SequenceIdMatcher | 40 | 70 | 33 | ||
SequenceIdMatcher.SeqIdName | 286 | 15 | 14 |
1 | /* | |
2 | * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) | |
3 | * Copyright (C) $$Year-Rel$$ The Jalview Authors | |
4 | * | |
5 | * This file is part of Jalview. | |
6 | * | |
7 | * Jalview is free software: you can redistribute it and/or | |
8 | * modify it under the terms of the GNU General Public License | |
9 | * as published by the Free Software Foundation, either version 3 | |
10 | * of the License, or (at your option) any later version. | |
11 | * | |
12 | * Jalview is distributed in the hope that it will be useful, but | |
13 | * WITHOUT ANY WARRANTY; without even the implied warranty | |
14 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR | |
15 | * PURPOSE. See the GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with Jalview. If not, see <http://www.gnu.org/licenses/>. | |
19 | * The Jalview Authors are detailed in the 'AUTHORS' file. | |
20 | */ | |
21 | package jalview.analysis; | |
22 | ||
23 | import java.util.Locale; | |
24 | ||
25 | import jalview.datamodel.DBRefEntry; | |
26 | import jalview.datamodel.SequenceI; | |
27 | ||
28 | import java.util.ArrayList; | |
29 | import java.util.Arrays; | |
30 | import java.util.HashMap; | |
31 | import java.util.List; | |
32 | import java.util.Vector; | |
33 | ||
34 | /** | |
35 | * Routines for approximate Sequence Id resolution by name using string | |
36 | * containment (on word boundaries) rather than equivalence. It also attempts to | |
37 | * resolve ties where no exact match is available by picking the id closest | |
38 | * to the query. | |
39 | */ | |
40 | public class SequenceIdMatcher | |
41 | { | |
42 | private HashMap<SeqIdName, SequenceI> names; | |
43 | ||
44 | 46 | public SequenceIdMatcher(List<SequenceI> seqs) |
45 | { | |
46 | 46 | names = new HashMap<SeqIdName, SequenceI>(); |
47 | 46 | addAll(seqs); |
48 | } | |
49 | ||
50 | /** | |
51 | * Adds sequences to this matcher | |
52 | * | |
53 | * @param seqs | |
54 | */ | |
55 | 58 | public void addAll(List<SequenceI> seqs) |
56 | { | |
57 | 58 | for (SequenceI seq : seqs) |
58 | { | |
59 | 326 | add(seq); |
60 | } | |
61 | } | |
62 | ||
63 | /** | |
64 | * Adds one sequence to this matcher | |
65 | * | |
66 | * @param seq | |
67 | */ | |
68 | 332 | public void add(SequenceI seq) |
69 | { | |
70 | // TODO: deal with ID collisions - SequenceI should be appended to list | |
71 | // associated with this key. | |
72 | 332 | names.put(new SeqIdName(seq.getDisplayId(true)), seq); |
73 | 332 | SequenceI dbseq = seq; |
74 | 518 | while (dbseq.getDatasetSequence() != null) |
75 | { | |
76 | 186 | dbseq = dbseq.getDatasetSequence(); |
77 | } | |
78 | // add in any interesting identifiers | |
79 | 332 | List<DBRefEntry> dbr = dbseq.getDBRefs(); |
80 | 332 | if (dbr != null) |
81 | { | |
82 | 143 | SeqIdName sid = null; |
83 | 1936 | for (int r = 0, nr = dbr.size(); r < nr; r++) |
84 | { | |
85 | 1793 | sid = new SeqIdName(dbr.get(r).getAccessionId()); |
86 | 1793 | if (!names.containsKey(sid)) |
87 | { | |
88 | 571 | names.put(sid, seq); |
89 | } | |
90 | } | |
91 | } | |
92 | } | |
93 | ||
94 | /** | |
95 | * convenience method to make a matcher from concrete array | |
96 | * | |
97 | * @param sequences | |
98 | */ | |
99 | 35 | public SequenceIdMatcher(SequenceI[] sequences) |
100 | { | |
101 | 35 | this(Arrays.asList(sequences)); |
102 | } | |
103 | ||
104 | /** | |
105 | * Returns the SequenceI in matches whose name is closest to candName (or null | |
106 | * if there are no matches) and puts all the matches back into the names hash. | |
107 | * | |
108 | * @param candName | |
109 | * SeqIdName | |
110 | * @param matches | |
111 | * List of SequenceI objects | |
112 | * @return SequenceI closest SequenceI to SeqIdName | |
113 | */ | |
114 | 315 | private SequenceI pickbestMatch(SeqIdName candName, |
115 | List<SequenceI> matches) | |
116 | { | |
117 | 315 | List<SequenceI> st = pickbestMatches(candName, matches); |
118 | 315 | return st == null || st.size() == 0 ? null : st.get(0); |
119 | } | |
120 | ||
121 | /** | |
122 | * Finds the SequenceI in matches whose name is closest to candName, together | |
123 | * with any ties, and puts all the matches back into the names hash. | |
124 | * | |
125 | * @param candName | |
126 | * SeqIdName | |
127 | * @param matches | |
128 | * List of SequenceI objects (emptied by this call) | |
129 | * @return List holding the closest SequenceI to SeqIdName followed by any | |
130 | * ties, or null if candName is null or matches is null or empty | |
131 | */ | |
132 | 315 | private List<SequenceI> pickbestMatches(SeqIdName candName, |
133 | List<SequenceI> matches) | |
134 | { | |
135 | 315 | ArrayList<SequenceI> best = new ArrayList<SequenceI>(); |
136 | 315 | if (candName == null || matches == null || matches.size() == 0) |
137 | { | |
138 | 73 | return null; |
139 | } | |
140 | 242 | SequenceI match = matches.remove(0); |
141 | 242 | best.add(match); |
142 | 242 | names.put(new SeqIdName(match.getName()), match); |
143 | 242 | int matchlen = match.getName().length(); |
144 | 242 | int namlen = candName.id.length(); |
145 | 242 | while (matches.size() > 0) |
146 | { | |
147 | // look through for a better one. | |
148 | 0 | SequenceI cand = matches.remove(0); |
149 | 0 | names.put(new SeqIdName(cand.getName()), cand); |
150 | 0 | int q, w, candlen = cand.getName().length(); |
151 | // keep the one with an id 'closer' to the given seqnam string | |
152 | 0 | if ((q = Math.abs(matchlen - namlen)) > (w = Math |
153 | .abs(candlen - namlen)) && candlen > matchlen) | |
154 | { | |
155 | 0 | best.clear(); |
156 | 0 | match = cand; |
157 | 0 | matchlen = candlen; |
158 | 0 | best.add(match); |
159 | } | |
160 | 0 | if (q == w && candlen == matchlen) |
161 | { | |
162 | // record any ties | |
163 | 0 | best.add(cand); |
164 | } | |
165 | } | |
166 | 242 | if (best.size() == 0) |
167 | { | |
168 | 0 | return null; |
169 | } | |
170 | 242 | ; |
171 | 242 | return best; |
172 | } | |
173 | ||
174 | /** | |
175 | * get SequenceI with closest SequenceI.getName() to seq.getName() | |
176 | * | |
177 | * @param seq | |
178 | * SequenceI | |
179 | * @return SequenceI | |
180 | */ | |
181 | 8 | public SequenceI findIdMatch(SequenceI seq) |
182 | { | |
183 | 8 | SeqIdName nam = new SeqIdName(seq.getName()); |
184 | 8 | return findIdMatch(nam); |
185 | } | |
186 | ||
187 | 269 | public SequenceI findIdMatch(String seqnam) |
188 | { | |
189 | 269 | SeqIdName nam = new SeqIdName(seqnam); |
190 | 269 | return findIdMatch(nam); |
191 | } | |
192 | ||
193 | /** | |
194 | * Find all matches for a given sequence name. | |
195 | * | |
196 | * @param seqnam | |
197 | * string to query Matcher with. | |
198 | * @return a new array or (possibly) null | |
199 | */ | |
200 | 0 | public SequenceI[] findAllIdMatches(String seqnam) |
201 | { | |
202 | ||
203 | 0 | SeqIdName nam = new SeqIdName(seqnam); |
204 | 0 | List<SequenceI> m = findAllIdMatches(nam); |
205 | 0 | if (m != null) |
206 | { | |
207 | 0 | return m.toArray(new SequenceI[m.size()]); |
208 | } | |
209 | 0 | return null; |
210 | } | |
211 | ||
212 | /** | |
213 | * findIdMatch | |
214 | * | |
215 | * Return pointers to sequences (or sequence object containers) which have | |
216 | * same Id as a given set of different sequence objects | |
217 | * | |
218 | * @param seqs | |
219 | * SequenceI[] | |
220 | * @return SequenceI[] | |
221 | */ | |
222 | 6 | public SequenceI[] findIdMatch(SequenceI[] seqs) |
223 | { | |
224 | 6 | SequenceI[] namedseqs = null; |
225 | 6 | int i = 0; |
226 | 6 | SeqIdName nam; |
227 | ||
228 | 6 | if (seqs.length > 0) |
229 | { | |
230 | 6 | namedseqs = new SequenceI[seqs.length]; |
231 | 6 | do |
232 | { | |
233 | 38 | nam = new SeqIdName(seqs[i].getName()); |
234 | ||
235 | 38 | if (names.containsKey(nam)) |
236 | { | |
237 | 38 | namedseqs[i] = findIdMatch(nam); |
238 | } | |
239 | else | |
240 | { | |
241 | 0 | namedseqs[i] = null; |
242 | } | |
243 | 38 | } while (++i < seqs.length); |
244 | } | |
245 | ||
246 | 6 | return namedseqs; |
247 | } | |
248 | ||
249 | /** | |
250 | * core findIdMatch search method | |
251 | * | |
252 | * @param nam | |
253 | * SeqIdName | |
254 | * @return SequenceI | |
255 | */ | |
256 | 315 | private SequenceI findIdMatch( |
257 | jalview.analysis.SequenceIdMatcher.SeqIdName nam) | |
258 | { | |
259 | 315 | Vector matches = new Vector(); |
260 | 557 | while (names.containsKey(nam)) |
261 | { | |
262 | 242 | matches.addElement(names.remove(nam)); |
263 | } | |
264 | 315 | return pickbestMatch(nam, matches); |
265 | } | |
266 | ||
267 | /** | |
268 | * core findIdMatch search method for finding all equivalent matches | |
269 | * | |
270 | * @param nam | |
271 | * SeqIdName | |
272 | * @return List of matching SequenceI, or null if there are no matches | |
273 | */ | |
274 | 0 | private List<SequenceI> findAllIdMatches( |
275 | jalview.analysis.SequenceIdMatcher.SeqIdName nam) | |
276 | { | |
277 | 0 | ArrayList<SequenceI> matches = new ArrayList<SequenceI>(); |
278 | 0 | while (names.containsKey(nam)) |
279 | { | |
280 | 0 | matches.add(names.remove(nam)); |
281 | } | |
282 | 0 | List<SequenceI> r = pickbestMatches(nam, matches); |
283 | 0 | return r; |
284 | } | |
285 | ||
286 | class SeqIdName | |
287 | { | |
288 | String id; | |
289 | ||
290 | 2688 | SeqIdName(String s) |
291 | { | |
292 | 2688 | if (s != null) |
293 | { | |
294 | 2688 | id = s.toLowerCase(Locale.ROOT); |
295 | } | |
296 | else | |
297 | { | |
298 | 0 | id = ""; |
299 | } | |
300 | } | |
301 | ||
302 | 3775 | @Override |
303 | public int hashCode() | |
304 | { | |
305 | 3775 | return ((id.length() >= 4) ? id.substring(0, 4).hashCode() |
306 | : id.hashCode()); | |
307 | } | |
308 | ||
309 | 17963 | @Override |
310 | public boolean equals(Object s) | |
311 | { | |
312 | 17963 | if (s == null) |
313 | { | |
314 | 1 | return false; |
315 | } | |
316 | 17962 | if (s instanceof SeqIdName) |
317 | { | |
318 | 17921 | return this.stringequals(((SeqIdName) s).id); |
319 | } | |
320 | else | |
321 | { | |
322 | 41 | if (s instanceof String) |
323 | { | |
324 | 41 | return this.stringequals(((String) s).toLowerCase(Locale.ROOT)); |
325 | } | |
326 | } | |
327 | ||
328 | 0 | return false; |
329 | } | |
330 | ||
331 | /** | |
332 | * Characters that define the end of a unique sequence ID at the beginning | |
333 | * of an arbitrary ID string. JBPNote: This is a heuristic that will fail for | |
334 | * arbitrarily extended sequence ids (like portions of an aligned set of | |
335 | * repeats from one sequence) | |
336 | */ | |
337 | private String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4) | |
338 | + "$%^*)}[@',?_"; | |
339 | ||
340 | /** | |
341 | * matches if the IDs are equal, or if the longer ID starts with the shorter one followed by a WORD_SEP character. | |
342 | * TODO: (JBPNote) These are not efficient. should use char[] for speed | |
343 | * TODO: (JBPNote) Set separator characters appropriately | |
344 | * | |
345 | * @param s | |
346 | * @return boolean | |
347 | */ | |
348 | 17962 | private boolean stringequals(String s) |
349 | { | |
350 | 17962 | if (id.length() > s.length()) |
351 | { | |
352 | 3871 | return id.startsWith(s) |
353 | ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1) | |
354 | : false; | |
355 | } | |
356 | else | |
357 | { | |
358 | 14091 | return s.startsWith(id) |
359 | 1772 | ? (s.equals(id) ? true |
360 | : (WORD_SEP.indexOf(s.charAt(id.length())) > -1)) | |
361 | : false; | |
362 | } | |
363 | } | |
364 | ||
365 | /** | |
366 | * toString method returns the wrapped sequence id. For debugging purposes | |
367 | * only, behaviour not guaranteed not to change. | |
368 | */ | |
369 | 0 | @Override |
370 | public String toString() | |
371 | { | |
372 | 0 | return id; |
373 | } | |
374 | } | |
375 | } |