Clover icon

jalviewX

  1. Project Clover database Wed Oct 31 2018 15:13:58 GMT
  2. Package jalview.analysis

File SequenceIdMatcher.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
19% of files have more coverage

Code metrics

50
85
17
2
373
222
47
0.55
5
8.5
2.76

Classes

Class Line # Actions
SequenceIdMatcher 38 70 33 36
0.6842105468.4%
SequenceIdMatcher.SeqIdName 284 15 14 7
0.8157894681.6%
 

Contributing tests

This file is covered by 24 tests. .

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import jalview.datamodel.DBRefEntry;
24    import jalview.datamodel.SequenceI;
25   
26    import java.util.ArrayList;
27    import java.util.Arrays;
28    import java.util.HashMap;
29    import java.util.List;
30    import java.util.Vector;
31   
32    /**
33    * Routines for approximate Sequence Id resolution by name using string
34    * containment (on word boundaries) rather than equivalence. It also attempts to
35    * resolve ties where no exact match is available by picking the the id closest
36    * to the query.
37    */
 
38    public class SequenceIdMatcher
39    {
40    private HashMap<SeqIdName, SequenceI> names;
41   
 
42  49 toggle public SequenceIdMatcher(List<SequenceI> seqs)
43    {
44  49 names = new HashMap<SeqIdName, SequenceI>();
45  49 addAll(seqs);
46    }
47   
48    /**
49    * Adds sequences to this matcher
50    *
51    * @param seqs
52    */
 
53  61 toggle public void addAll(List<SequenceI> seqs)
54    {
55  61 for (SequenceI seq : seqs)
56    {
57  213 add(seq);
58    }
59    }
60   
61    /**
62    * Adds one sequence to this matcher
63    *
64    * @param seq
65    */
 
66  225 toggle public void add(SequenceI seq)
67    {
68    // TODO: deal with ID collisions - SequenceI should be appended to list
69    // associated with this key.
70  225 names.put(new SeqIdName(seq.getDisplayId(true)), seq);
71  225 SequenceI dbseq = seq;
72  361 while (dbseq.getDatasetSequence() != null)
73    {
74  136 dbseq = dbseq.getDatasetSequence();
75    }
76    // add in any interesting identifiers
77  225 if (dbseq.getDBRefs() != null)
78    {
79  124 DBRefEntry dbr[] = dbseq.getDBRefs();
80  124 SeqIdName sid = null;
81  282 for (int r = 0; r < dbr.length; r++)
82    {
83  158 sid = new SeqIdName(dbr[r].getAccessionId());
84  158 if (!names.containsKey(sid))
85    {
86  133 names.put(sid, seq);
87    }
88    }
89    }
90    }
91   
92    /**
93    * convenience method to make a matcher from concrete array
94    *
95    * @param sequences
96    */
 
97  37 toggle public SequenceIdMatcher(SequenceI[] sequences)
98    {
99  37 this(Arrays.asList(sequences));
100    }
101   
102    /**
103    * returns the closest SequenceI in matches to SeqIdName and returns all the
104    * matches to the names hash.
105    *
106    * @param candName
107    * SeqIdName
108    * @param matches
109    * List of SequenceI objects
110    * @return SequenceI closest SequenceI to SeqIdName
111    */
 
112  256 toggle private SequenceI pickbestMatch(SeqIdName candName,
113    List<SequenceI> matches)
114    {
115  256 List<SequenceI> st = pickbestMatches(candName, matches);
116  256 return st == null || st.size() == 0 ? null : st.get(0);
117    }
118   
119    /**
120    * returns the closest SequenceI in matches to SeqIdName and returns all the
121    * matches to the names hash.
122    *
123    * @param candName
124    * SeqIdName
125    * @param matches
126    * Vector of SequenceI objects
127    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
128    * ties }
129    */
 
130  261 toggle private List<SequenceI> pickbestMatches(SeqIdName candName,
131    List<SequenceI> matches)
132    {
133  261 ArrayList<SequenceI> best = new ArrayList<SequenceI>();
134  261 if (candName == null || matches == null || matches.size() == 0)
135    {
136  63 return null;
137    }
138  198 SequenceI match = matches.remove(0);
139  198 best.add(match);
140  198 names.put(new SeqIdName(match.getName()), match);
141  198 int matchlen = match.getName().length();
142  198 int namlen = candName.id.length();
143  198 while (matches.size() > 0)
144    {
145    // look through for a better one.
146  0 SequenceI cand = matches.remove(0);
147  0 names.put(new SeqIdName(cand.getName()), cand);
148  0 int q, w, candlen = cand.getName().length();
149    // keep the one with an id 'closer' to the given seqnam string
150  0 if ((q = Math.abs(matchlen - namlen)) > (w = Math
151    .abs(candlen - namlen)) && candlen > matchlen)
152    {
153  0 best.clear();
154  0 match = cand;
155  0 matchlen = candlen;
156  0 best.add(match);
157    }
158  0 if (q == w && candlen == matchlen)
159    {
160    // record any ties
161  0 best.add(cand);
162    }
163    }
164  198 if (best.size() == 0)
165    {
166  0 return null;
167    }
168  198 ;
169  198 return best;
170    }
171   
172    /**
173    * get SequenceI with closest SequenceI.getName() to seq.getName()
174    *
175    * @param seq
176    * SequenceI
177    * @return SequenceI
178    */
 
179  8 toggle public SequenceI findIdMatch(SequenceI seq)
180    {
181  8 SeqIdName nam = new SeqIdName(seq.getName());
182  8 return findIdMatch(nam);
183    }
184   
 
185  210 toggle public SequenceI findIdMatch(String seqnam)
186    {
187  210 SeqIdName nam = new SeqIdName(seqnam);
188  210 return findIdMatch(nam);
189    }
190   
191    /**
192    * Find all matches for a given sequence name.
193    *
194    * @param seqnam
195    * string to query Matcher with.
196    * @return a new array or (possibly) null
197    */
 
198  0 toggle public SequenceI[] findAllIdMatches(String seqnam)
199    {
200   
201  0 SeqIdName nam = new SeqIdName(seqnam);
202  0 List<SequenceI> m = findAllIdMatches(nam);
203  0 if (m != null)
204    {
205  0 return m.toArray(new SequenceI[m.size()]);
206    }
207  0 return null;
208    }
209   
210    /**
211    * findIdMatch
212    *
213    * Return pointers to sequences (or sequence object containers) which have
214    * same Id as a given set of different sequence objects
215    *
216    * @param seqs
217    * SequenceI[]
218    * @return SequenceI[]
219    */
 
220  6 toggle public SequenceI[] findIdMatch(SequenceI[] seqs)
221    {
222  6 SequenceI[] namedseqs = null;
223  6 int i = 0;
224  6 SeqIdName nam;
225   
226  6 if (seqs.length > 0)
227    {
228  6 namedseqs = new SequenceI[seqs.length];
229  6 do
230    {
231  38 nam = new SeqIdName(seqs[i].getName());
232   
233  38 if (names.containsKey(nam))
234    {
235  38 namedseqs[i] = findIdMatch(nam);
236    }
237    else
238    {
239  0 namedseqs[i] = null;
240    }
241  38 } while (++i < seqs.length);
242    }
243   
244  6 return namedseqs;
245    }
246   
247    /**
248    * core findIdMatch search method
249    *
250    * @param nam
251    * SeqIdName
252    * @return SequenceI
253    */
 
254  256 toggle private SequenceI findIdMatch(
255    jalview.analysis.SequenceIdMatcher.SeqIdName nam)
256    {
257  256 Vector matches = new Vector();
258  450 while (names.containsKey(nam))
259    {
260  194 matches.addElement(names.remove(nam));
261    }
262  256 return pickbestMatch(nam, matches);
263    }
264   
265    /**
266    * core findIdMatch search method for finding all equivalent matches
267    *
268    * @param nam
269    * SeqIdName
270    * @return SequenceI[]
271    */
 
272  0 toggle private List<SequenceI> findAllIdMatches(
273    jalview.analysis.SequenceIdMatcher.SeqIdName nam)
274    {
275  0 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
276  0 while (names.containsKey(nam))
277    {
278  0 matches.add(names.remove(nam));
279    }
280  0 List<SequenceI> r = pickbestMatches(nam, matches);
281  0 return r;
282    }
283   
 
284    class SeqIdName
285    {
286    String id;
287   
 
288  848 toggle SeqIdName(String s)
289    {
290  848 if (s != null)
291    {
292  848 id = s.toLowerCase();
293    }
294    else
295    {
296  0 id = "";
297    }
298    }
299   
 
300  1409 toggle @Override
301    public int hashCode()
302    {
303  1409 return ((id.length() >= 4) ? id.substring(0, 4).hashCode()
304    : id.hashCode());
305    }
306   
 
307  1746 toggle @Override
308    public boolean equals(Object s)
309    {
310  1746 if (s == null)
311    {
312  1 return false;
313    }
314  1745 if (s instanceof SeqIdName)
315    {
316  1704 return this.stringequals(((SeqIdName) s).id);
317    }
318    else
319    {
320  41 if (s instanceof String)
321    {
322  41 return this.stringequals(((String) s).toLowerCase());
323    }
324    }
325   
326  0 return false;
327    }
328   
329    /**
330    * Characters that define the end of a unique sequence ID at the beginning
331    * of an arbitrary ID string JBPNote: This is a heuristic that will fail for
332    * arbritrarily extended sequence id's (like portions of an aligned set of
333    * repeats from one sequence)
334    */
335    private String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
336    + "$%^*)}[@',?_";
337   
338    /**
339    * matches if one ID properly contains another at a whitespace boundary.
340    * TODO: (JBPNote) These are not efficient. should use char[] for speed
341    * todo: (JBPNote) Set separator characters appropriately
342    *
343    * @param s
344    * @return boolean
345    */
 
346  1745 toggle private boolean stringequals(String s)
347    {
348  1745 if (id.length() > s.length())
349    {
350  218 return id.startsWith(s)
351    ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
352    : false;
353    }
354    else
355    {
356  1527 return s.startsWith(id)
357  487 ? (s.equals(id) ? true
358    : (WORD_SEP.indexOf(s.charAt(id.length())) > -1))
359    : false;
360    }
361    }
362   
363    /**
364    * toString method returns the wrapped sequence id. For debugging purposes
365    * only, behaviour not guaranteed not to change.
366    */
 
367  0 toggle @Override
368    public String toString()
369    {
370  0 return id;
371    }
372    }
373    }