Class |
Line # |
Actions |
|||
---|---|---|---|---|---|
SkipBMH | 26 | 91 | 46 |
1 | // | |
2 | // This software is now distributed according to | |
3 | // the Lesser Gnu Public License. Please see | |
4 | // http://www.gnu.org/copyleft/lesser.txt for | |
5 | // the details. | |
6 | // -- Happy Computing! | |
7 | // | |
8 | package com.stevesoft.pat; | |
9 | ||
10 | import com.stevesoft.pat.wrap.StringWrap; | |
11 | ||
12 | /** | |
13 | * Like Skip, but implements a | |
14 | * <a href="http://www.dcc.uchile.cl/~rbaeza/handbook/algs/7/713b.srch.p.html"> | |
15 | * Boyer-Moore-Horspool</a> type search method that has been modified to be more | |
16 | * like a "T-search" (see the Michael Tamm''s article in <i>C'T, magazin fuer | |
17 | * computer und technic</i>, August 97 p 292). Yet another important source of | |
18 | * information for me was the | |
19 | * <a href="http://www.go2net.com/people/paulp/deep/1997/05/14/"> Deep Magic</a> | |
20 | * article on string searching. As of this writing, I can beat String's indexOf | |
21 | * method in many cases. | |
22 | * | |
23 | * @see com.stevesoft.pat.Skip | |
24 | * @see com.stevesoft.pat.Skip2 | |
25 | */ | |
26 | public class SkipBMH extends Skip | |
27 | { | |
28 | // This number could be 256, but I think it's | |
29 | // big enough. Note, it must be a power of 2. | |
30 | final int MAX_CHAR = 64; | |
31 | ||
32 | final char[] skip = new char[MAX_CHAR]; | |
33 | ||
34 | int sm1; | |
35 | ||
36 | int jump_ahead = 0; | |
37 | ||
38 | char uc, lc, tc, x; | |
39 | ||
40 | 0 | final boolean exact(char c) |
41 | { | |
42 | 0 | return (ign && anyc(c)) || c == x; |
43 | } | |
44 | ||
45 | 0 | final boolean anyc(char c) |
46 | { | |
47 | 0 | return c == uc || c == lc || c == tc; |
48 | } | |
49 | ||
50 | 0 | public SkipBMH(String pt, boolean ign) |
51 | { | |
52 | 0 | this(pt, ign, 0); |
53 | } | |
54 | ||
55 | 0 | public SkipBMH(String pt) |
56 | { | |
57 | 0 | this(pt, false, 0); |
58 | } | |
59 | ||
60 | 0 | public SkipBMH(String pt, boolean ign, int offset) |
61 | { | |
62 | 0 | super(pt, ign, offset); |
63 | 0 | for (int k = 0; k < MAX_CHAR; k++) |
64 | { | |
65 | 0 | skip[k] = (char) src.length(); |
66 | } | |
67 | ||
68 | 0 | sm1 = src.length() - 1; |
69 | 0 | x = src.charAt(sm1); |
70 | 0 | uc = CaseMgr.toUpperCase(x); |
71 | 0 | lc = CaseMgr.toLowerCase(x); |
72 | 0 | tc = CaseMgr.toTitleCase(x); |
73 | ||
74 | // We don't really want 65536 long arrays in skip[], | |
75 | // so we mask of the higher bits. This can be combined | |
76 | // with ignore case, so accounting for upper | |
77 | // case costs us nothing extra. | |
78 | 0 | for (int k = 0; k < src.length() - 1; k++) |
79 | { | |
80 | 0 | char x_ = src.charAt(k); |
81 | 0 | if (ign) |
82 | { | |
83 | 0 | char uc_ = CaseMgr.toUpperCase(x_); |
84 | 0 | char lc_ = CaseMgr.toLowerCase(x_); |
85 | 0 | char tc_ = CaseMgr.toTitleCase(x_); |
86 | 0 | skip[uc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); |
87 | 0 | skip[lc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); |
88 | 0 | skip[tc_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); |
89 | } | |
90 | else | |
91 | { | |
92 | 0 | skip[x_ & (MAX_CHAR - 1)] = (char) (src.length() - k - 1); |
93 | } | |
94 | } | |
95 | ||
96 | // This trick can be found in the July issue of | |
97 | // C-T magazine. This makes the method a type of | |
98 | // "T-search." | |
99 | 0 | jump_ahead = src.length() - 1; |
100 | 0 | for (int k = 0; k < src.length() - 1; k++) |
101 | { | |
102 | 0 | char y = src.charAt(sm1 - k - 1); |
103 | 0 | if (exact(y)) |
104 | { | |
105 | 0 | jump_ahead = k; |
106 | 0 | break; |
107 | } | |
108 | } | |
109 | } | |
110 | ||
111 | /** | |
112 | * Set to true if you only want to compare two of the characters in the | |
113 | * String. | |
114 | */ | |
115 | 0 | final public int searchRegion(String s, int start, int end) |
116 | { | |
117 | 0 | return find(s, start, end); |
118 | } | |
119 | ||
120 | 0 | final public int searchFrom(String s, int start) |
121 | { | |
122 | 0 | return find(s, start, s.length()); |
123 | } | |
124 | ||
125 | 0 | final public int search(String s) |
126 | { | |
127 | 0 | return find(s, 0, s.length()); |
128 | } | |
129 | ||
130 | 0 | public int find(String s, int start, int end) |
131 | { | |
132 | 0 | start += offset + sm1; |
133 | 0 | int vend = min(s.length() - 1, end + sm1 + offset), k; |
134 | 0 | int vend1 = vend - jump_ahead; |
135 | 0 | if (ign) |
136 | { | |
137 | 0 | for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
138 | { | |
139 | // table look-up is expensive, avoid it if possible | |
140 | 0 | if (anyc(s.charAt(k))) |
141 | { | |
142 | 0 | if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) |
143 | { | |
144 | 0 | return k - sm1 - offset; |
145 | } | |
146 | 0 | k += jump_ahead; |
147 | } | |
148 | } | |
149 | 0 | for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
150 | { | |
151 | // table look-up is expensive, avoid it if possible | |
152 | 0 | if (anyc(s.charAt(k))) |
153 | { | |
154 | 0 | if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) |
155 | { | |
156 | 0 | return k - sm1 - offset; |
157 | } | |
158 | 0 | k += jump_ahead; |
159 | 0 | if (k > vend) |
160 | { | |
161 | 0 | return -1; |
162 | } | |
163 | } | |
164 | } | |
165 | } | |
166 | else | |
167 | { | |
168 | 0 | for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
169 | { | |
170 | // table look-up is expensive, avoid it if possible | |
171 | 0 | if (x == s.charAt(k)) |
172 | { | |
173 | // if(src.regionMatches(0,s,k-sm1,sm1)) | |
174 | 0 | if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) |
175 | { | |
176 | 0 | return k - sm1 - offset; |
177 | } | |
178 | 0 | k += jump_ahead; |
179 | } | |
180 | } | |
181 | 0 | for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
182 | { | |
183 | // table look-up is expensive, avoid it if possible | |
184 | 0 | if (x == s.charAt(k)) |
185 | { | |
186 | // if(src.regionMatches(0,s,k-sm1,sm1)) | |
187 | 0 | if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) |
188 | { | |
189 | 0 | return k - sm1 - offset; |
190 | } | |
191 | 0 | k += jump_ahead; |
192 | 0 | if (k > vend) |
193 | { | |
194 | 0 | return -1; |
195 | } | |
196 | } | |
197 | } | |
198 | } | |
199 | ||
200 | 0 | return -1; |
201 | } | |
202 | ||
203 | 0 | public int find(StringLike s, int start, int end) |
204 | { | |
205 | 0 | if (s instanceof StringWrap) |
206 | { | |
207 | 0 | return find(s.toString(), start, end); |
208 | } | |
209 | 0 | start += offset + sm1; |
210 | 0 | int vend = min(s.length() - 1, end + sm1 + offset), k; |
211 | 0 | int vend1 = vend - jump_ahead; |
212 | 0 | if (ign) |
213 | { | |
214 | 0 | for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
215 | { | |
216 | // table look-up is expensive, avoid it if possible | |
217 | 0 | if (anyc(s.charAt(k))) |
218 | { | |
219 | 0 | if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) |
220 | { | |
221 | 0 | return k - sm1 - offset; |
222 | } | |
223 | 0 | k += jump_ahead; |
224 | } | |
225 | } | |
226 | 0 | for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
227 | { | |
228 | // table look-up is expensive, avoid it if possible | |
229 | 0 | if (anyc(s.charAt(k))) |
230 | { | |
231 | 0 | if (CaseMgr.regionMatches(src, ign, 0, s, k - sm1, sm1)) |
232 | { | |
233 | 0 | return k - sm1 - offset; |
234 | } | |
235 | 0 | k += jump_ahead; |
236 | 0 | if (k > vend) |
237 | { | |
238 | 0 | return -1; |
239 | } | |
240 | } | |
241 | } | |
242 | } | |
243 | else | |
244 | { | |
245 | 0 | for (k = start; k <= vend1; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
246 | { | |
247 | // table look-up is expensive, avoid it if possible | |
248 | 0 | if (x == s.charAt(k)) |
249 | { | |
250 | // if(src.regionMatches(0,s,k-sm1,sm1)) | |
251 | 0 | if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) |
252 | { | |
253 | 0 | return k - sm1 - offset; |
254 | } | |
255 | 0 | k += jump_ahead; |
256 | } | |
257 | } | |
258 | 0 | for (; k <= vend; k += skip[s.charAt(k) & (MAX_CHAR - 1)]) |
259 | { | |
260 | // table look-up is expensive, avoid it if possible | |
261 | 0 | if (x == s.charAt(k)) |
262 | { | |
263 | // if(src.regionMatches(0,s,k-sm1,sm1)) | |
264 | 0 | if (CaseMgr.regionMatches(src, false, 0, s, k - sm1, sm1)) |
265 | { | |
266 | 0 | return k - sm1 - offset; |
267 | } | |
268 | 0 | k += jump_ahead; |
269 | 0 | if (k > vend) |
270 | { | |
271 | 0 | return -1; |
272 | } | |
273 | } | |
274 | } | |
275 | } | |
276 | ||
277 | 0 | return -1; |
278 | } | |
279 | } |