Clover icon

Coverage Report

  1. Project Clover database Mon Nov 11 2024 16:01:40 GMT
  2. Package jalview.analysis

File SequenceIdMatcher.java

 

Coverage histogram

../../img/srcFileCovDistChart8.png
20% of files have more coverage

Code metrics

50
85
17
2
375
223
47
0.55
5
8.5
2.76

Classes

Class Line # Actions
SequenceIdMatcher 40 70 33
0.6842105468.4%
SequenceIdMatcher.SeqIdName 286 15 14
0.8421052784.2%
 

Contributing tests

This file is covered by 24 tests.

Source view

1    /*
2    * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3    * Copyright (C) $$Year-Rel$$ The Jalview Authors
4    *
5    * This file is part of Jalview.
6    *
7    * Jalview is free software: you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation, either version 3
10    * of the License, or (at your option) any later version.
11    *
12    * Jalview is distributed in the hope that it will be useful, but
13    * WITHOUT ANY WARRANTY; without even the implied warranty
14    * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15    * PURPOSE. See the GNU General Public License for more details.
16    *
17    * You should have received a copy of the GNU General Public License
18    * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19    * The Jalview Authors are detailed in the 'AUTHORS' file.
20    */
21    package jalview.analysis;
22   
23    import java.util.Locale;
24   
25    import jalview.datamodel.DBRefEntry;
26    import jalview.datamodel.SequenceI;
27   
28    import java.util.ArrayList;
29    import java.util.Arrays;
30    import java.util.HashMap;
31    import java.util.List;
32    import java.util.Vector;
33   
34    /**
35    * Routines for approximate Sequence Id resolution by name using string
36    * containment (on word boundaries) rather than equivalence. It also attempts to
37    * resolve ties where no exact match is available by picking the id closest
38    * to the query.
39    */
 
40    public class SequenceIdMatcher
41    {
42    private HashMap<SeqIdName, SequenceI> names;
43   
 
/**
 * Creates a matcher and registers every sequence in the given list with it.
 *
 * @param seqs
 *          sequences to register with the new matcher
 */
public SequenceIdMatcher(List<SequenceI> seqs)
{
  this.names = new HashMap<SeqIdName, SequenceI>();
  this.addAll(seqs);
}
49   
50    /**
51    * Adds sequences to this matcher
52    *
53    * @param seqs
54    */
 
55  58 toggle public void addAll(List<SequenceI> seqs)
56    {
57  58 for (SequenceI seq : seqs)
58    {
59  326 add(seq);
60    }
61    }
62   
63    /**
64    * Adds one sequence to this matcher
65    *
66    * @param seq
67    */
 
68  332 toggle public void add(SequenceI seq)
69    {
70    // TODO: deal with ID collisions - SequenceI should be appended to list
71    // associated with this key.
72  332 names.put(new SeqIdName(seq.getDisplayId(true)), seq);
73  332 SequenceI dbseq = seq;
74  518 while (dbseq.getDatasetSequence() != null)
75    {
76  186 dbseq = dbseq.getDatasetSequence();
77    }
78    // add in any interesting identifiers
79  332 List<DBRefEntry> dbr = dbseq.getDBRefs();
80  332 if (dbr != null)
81    {
82  143 SeqIdName sid = null;
83  1936 for (int r = 0, nr = dbr.size(); r < nr; r++)
84    {
85  1793 sid = new SeqIdName(dbr.get(r).getAccessionId());
86  1793 if (!names.containsKey(sid))
87    {
88  571 names.put(sid, seq);
89    }
90    }
91    }
92    }
93   
/**
 * Convenience constructor that builds a matcher from an array of sequences
 * by wrapping the array as a list and delegating to the list-based
 * constructor.
 *
 * @param sequences
 *          sequences to register with the new matcher
 */
public SequenceIdMatcher(SequenceI[] sequences)
{
  this(Arrays.asList(sequences));
}
103   
104    /**
105    * returns the closest SequenceI in matches to SeqIdName and returns all the
106    * matches to the names hash.
107    *
108    * @param candName
109    * SeqIdName
110    * @param matches
111    * List of SequenceI objects
112    * @return SequenceI closest SequenceI to SeqIdName
113    */
 
114  315 toggle private SequenceI pickbestMatch(SeqIdName candName,
115    List<SequenceI> matches)
116    {
117  315 List<SequenceI> st = pickbestMatches(candName, matches);
118  315 return st == null || st.size() == 0 ? null : st.get(0);
119    }
120   
121    /**
122    * returns the closest SequenceI in matches to SeqIdName and returns all the
123    * matches to the names hash.
124    *
125    * @param candName
126    * SeqIdName
127    * @param matches
128    * Vector of SequenceI objects
129    * @return Object[] { SequenceI closest SequenceI to SeqIdName, SequenceI[]
130    * ties }
131    */
 
132  315 toggle private List<SequenceI> pickbestMatches(SeqIdName candName,
133    List<SequenceI> matches)
134    {
135  315 ArrayList<SequenceI> best = new ArrayList<SequenceI>();
136  315 if (candName == null || matches == null || matches.size() == 0)
137    {
138  73 return null;
139    }
140  242 SequenceI match = matches.remove(0);
141  242 best.add(match);
142  242 names.put(new SeqIdName(match.getName()), match);
143  242 int matchlen = match.getName().length();
144  242 int namlen = candName.id.length();
145  242 while (matches.size() > 0)
146    {
147    // look through for a better one.
148  0 SequenceI cand = matches.remove(0);
149  0 names.put(new SeqIdName(cand.getName()), cand);
150  0 int q, w, candlen = cand.getName().length();
151    // keep the one with an id 'closer' to the given seqnam string
152  0 if ((q = Math.abs(matchlen - namlen)) > (w = Math
153    .abs(candlen - namlen)) && candlen > matchlen)
154    {
155  0 best.clear();
156  0 match = cand;
157  0 matchlen = candlen;
158  0 best.add(match);
159    }
160  0 if (q == w && candlen == matchlen)
161    {
162    // record any ties
163  0 best.add(cand);
164    }
165    }
166  242 if (best.size() == 0)
167    {
168  0 return null;
169    }
170  242 ;
171  242 return best;
172    }
173   
174    /**
175    * get SequenceI with closest SequenceI.getName() to seq.getName()
176    *
177    * @param seq
178    * SequenceI
179    * @return SequenceI
180    */
 
181  8 toggle public SequenceI findIdMatch(SequenceI seq)
182    {
183  8 SeqIdName nam = new SeqIdName(seq.getName());
184  8 return findIdMatch(nam);
185    }
186   
 
187  269 toggle public SequenceI findIdMatch(String seqnam)
188    {
189  269 SeqIdName nam = new SeqIdName(seqnam);
190  269 return findIdMatch(nam);
191    }
192   
193    /**
194    * Find all matches for a given sequence name.
195    *
196    * @param seqnam
197    * string to query Matcher with.
198    * @return a new array or (possibly) null
199    */
 
200  0 toggle public SequenceI[] findAllIdMatches(String seqnam)
201    {
202   
203  0 SeqIdName nam = new SeqIdName(seqnam);
204  0 List<SequenceI> m = findAllIdMatches(nam);
205  0 if (m != null)
206    {
207  0 return m.toArray(new SequenceI[m.size()]);
208    }
209  0 return null;
210    }
211   
212    /**
213    * findIdMatch
214    *
215    * Return pointers to sequences (or sequence object containers) which have
216    * same Id as a given set of different sequence objects
217    *
218    * @param seqs
219    * SequenceI[]
220    * @return SequenceI[]
221    */
 
222  6 toggle public SequenceI[] findIdMatch(SequenceI[] seqs)
223    {
224  6 SequenceI[] namedseqs = null;
225  6 int i = 0;
226  6 SeqIdName nam;
227   
228  6 if (seqs.length > 0)
229    {
230  6 namedseqs = new SequenceI[seqs.length];
231  6 do
232    {
233  38 nam = new SeqIdName(seqs[i].getName());
234   
235  38 if (names.containsKey(nam))
236    {
237  38 namedseqs[i] = findIdMatch(nam);
238    }
239    else
240    {
241  0 namedseqs[i] = null;
242    }
243  38 } while (++i < seqs.length);
244    }
245   
246  6 return namedseqs;
247    }
248   
249    /**
250    * core findIdMatch search method
251    *
252    * @param nam
253    * SeqIdName
254    * @return SequenceI
255    */
 
256  315 toggle private SequenceI findIdMatch(
257    jalview.analysis.SequenceIdMatcher.SeqIdName nam)
258    {
259  315 Vector matches = new Vector();
260  557 while (names.containsKey(nam))
261    {
262  242 matches.addElement(names.remove(nam));
263    }
264  315 return pickbestMatch(nam, matches);
265    }
266   
267    /**
268    * core findIdMatch search method for finding all equivalent matches
269    *
270    * @param nam
271    * SeqIdName
272    * @return SequenceI[]
273    */
 
274  0 toggle private List<SequenceI> findAllIdMatches(
275    jalview.analysis.SequenceIdMatcher.SeqIdName nam)
276    {
277  0 ArrayList<SequenceI> matches = new ArrayList<SequenceI>();
278  0 while (names.containsKey(nam))
279    {
280  0 matches.add(names.remove(nam));
281    }
282  0 List<SequenceI> r = pickbestMatches(nam, matches);
283  0 return r;
284    }
285   
 
286    class SeqIdName
287    {
288    String id;
289   
 
290  2688 toggle SeqIdName(String s)
291    {
292  2688 if (s != null)
293    {
294  2688 id = s.toLowerCase(Locale.ROOT);
295    }
296    else
297    {
298  0 id = "";
299    }
300    }
301   
 
302  3775 toggle @Override
303    public int hashCode()
304    {
305  3775 return ((id.length() >= 4) ? id.substring(0, 4).hashCode()
306    : id.hashCode());
307    }
308   
 
309  18005 toggle @Override
310    public boolean equals(Object s)
311    {
312  18005 if (s == null)
313    {
314  1 return false;
315    }
316  18004 if (s instanceof SeqIdName)
317    {
318  17963 return this.stringequals(((SeqIdName) s).id);
319    }
320    else
321    {
322  41 if (s instanceof String)
323    {
324  41 return this.stringequals(((String) s).toLowerCase(Locale.ROOT));
325    }
326    }
327   
328  0 return false;
329    }
330   
331    /**
332    * Characters that define the end of a unique sequence ID at the beginning
333    * of an arbitrary ID string JBPNote: This is a heuristic that will fail for
334    * arbritrarily extended sequence id's (like portions of an aligned set of
335    * repeats from one sequence)
336    */
337    private String WORD_SEP = "~. |#\\/<>!\"" + ((char) 0x00A4)
338    + "$%^*)}[@',?_";
339   
340    /**
341    * matches if one ID properly contains another at a whitespace boundary.
342    * TODO: (JBPNote) These are not efficient. should use char[] for speed
343    * todo: (JBPNote) Set separator characters appropriately
344    *
345    * @param s
346    * @return boolean
347    */
 
348  18004 toggle private boolean stringequals(String s)
349    {
350  18004 if (id.length() > s.length())
351    {
352  3796 return id.startsWith(s)
353    ? (WORD_SEP.indexOf(id.charAt(s.length())) > -1)
354    : false;
355    }
356    else
357    {
358  14208 return s.startsWith(id)
359  1772 ? (s.equals(id) ? true
360    : (WORD_SEP.indexOf(s.charAt(id.length())) > -1))
361    : false;
362    }
363    }
364   
365    /**
366    * toString method returns the wrapped sequence id. For debugging purposes
367    * only, behaviour not guaranteed not to change.
368    */
 
369  0 toggle @Override
370    public String toString()
371    {
372  0 return id;
373    }
374    }
375    }