Merge branch 'master' into polymorphic_extern_classes
[nit.git] / lib / standard / string_search.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
4 #
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
11 # another product.
12
13 # Basic string search, match and replace.
14 module string_search
15
16 import string
17
18 # Patterns are abstract string motifs (include `String` and `Char`).
19 interface Pattern
20 # Search `self` into `s` from a certain position.
21 # Return the position of the first character of the matching section.
22 # Return -1 if not found.
23 #
24 # assert 'l'.search_index_in("hello world", 0) == 2
25 # assert 'l'.search_index_in("hello world", 3) == 3
26 # assert 'z'.search_index_in("hello world", 0) == -1
27 #
28 # This method is usually faster than `search_in` if what is
29 # required is only the index.
30 # Note: in most implementation, `search_in` is implemented with this method.
31 protected fun search_index_in(s: Text, from: Int): Int is abstract
32
33 # Search `self` into `s` from a certain position.
34 # Return null if not found.
35 #
36 # assert 'l'.search_in("hello world", 0).from == 2
37 # assert 'l'.search_in("hello world", 3).from == 3
38 # assert 'z'.search_in("hello world", 0) == null
39 #
40 # If only the index of the first character if required, see `search_index_in`.
41 #
42 # Note: Is used by `String::search`, `String::search_from`, and others.
43 protected fun search_in(s: Text, from: Int): nullable Match is abstract
44
45 # Search all `self` occurrences into `s`.
46 #
47 # assert 'l'.search_all_in("hello world").length == 3
48 # assert 'z'.search_all_in("hello world").length == 0
49 #
50 # Note: Is used by `String::search_all`.
51 protected fun search_all_in(s: Text): Array[Match]
52 do
53 var res = new Array[Match] # Result
54 var match = search_in(s, 0)
55 while match != null do
56 res.add(match)
57 match = search_in(s, match.after)
58 end
59 return res
60 end
61
62 # Split `s` using `self` is separator.
63 #
64 # Returns an array of matches that are between each occurence of `self`.
65 # If self is not present, an array with a single match on `s` is retunred.
66 #
67 # assert 'l'.split_in("hello world").join("|") == "he||o wor|d"
68 # assert 'z'.split_in("hello world").join("|") == "hello world"
69 #
70 # Note: is used by `String::split`
71 protected fun split_in(s: Text): Array[Match]
72 do
73 var res = new Array[Match] # Result
74 var i = 0 # Cursor
75 var match = search_in(s, 0)
76 while match != null do
77 # Compute the splited part length
78 var len = match.from - i
79 res.add(new Match(s.to_s, i, len))
80 i = match.after
81 match = search_in(s, i)
82 end
83 # Add the last part
84 res.add(new Match(s.to_s, i, s.length - i))
85 return res
86 end
87
88 protected fun is_in(s: Text): Bool do return search_index_in(s, 0) != -1
89 end
90
91 # BM_Pattern are pre-compiled string motif for the Boyer-Moore algorithm.
92 # (cf. A Fast String Searching Algorithm, with R.S. Boyer. Communications
93 # of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
94 # http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
95 #
96 # var pat = new BM_Pattern("hello")
97 # assert "I say hello to the world!".search(pat).from == 6
98 # assert "I say goodbye to the world!".search(pat) == null
99 class BM_Pattern
100 super Pattern
101
102 redef fun to_s do return _motif
103
104 # boyer-moore search gives the position of the first occurrence of a pattern starting at position `from`
105 redef fun search_index_in(s, from)
106 do
107 assert from >= 0
108 var n = s.length
109 var m = _length
110
111 var j = from
112 while j < n - m + 1 do
113 var i = m - 1 # Cursor in the pattern
114 while i >= 0 and _motif.chars[i] == s.chars[i + j] do i -= 1
115 if i < 0 then
116 return j
117 else
118 var gs = _gs[i] # Good shift
119 var bc = bc(s.chars[i+j]) - m + 1 + i # Bad char
120 # Both are true, do move to the best
121 if gs > bc then
122 j += gs
123 else
124 j += bc
125 end
126 end
127 end
128 return -1 # found nothing
129 end
130
131 # boyer-moore search. Return null if not found
132 redef fun search_in(s, from)
133 do
134 var to = search_index_in(s, from)
135 if to < 0 then
136 return null
137 else
138 return new Match(s.to_s, to, _length)
139 end
140 end
141
142 # Compile a new motif
143 init(motif: String)
144 do
145 _motif = motif
146 _length = _motif.length
147 _gs = new Array[Int].with_capacity(_length)
148 _bc_table = new ArrayMap[Char, Int]
149 compute_gs
150 compute_bc
151 end
152
153 # searched motif
154 private var motif: String
155
156 # length of the motif
157 private var length: Int
158
159 private fun bc(e: Char): Int
160 do
161 if _bc_table.has_key(e) then
162 return _bc_table[e]
163 else
164 return _length
165 end
166 end
167
168 # good shifts
169 private var gs: Array[Int]
170
171 # bad characters
172 private var bc_table: Map[Char, Int]
173
174 private fun compute_bc
175 do
176 var x = _motif
177 var m = _length
178 var i = 0
179 while i < m - 1 do
180 _bc_table[x.chars[i]] = m - i - 1
181 i += 1
182 end
183 end
184
185 private fun suffixes: Array[Int]
186 do
187 var x = _motif
188 var m = _length
189 var suff = new Array[Int].filled_with(m, m)
190
191 var f = 0
192 var g = m - 1
193 var i = m - 2
194 while i >= 0 do
195 if i > g and suff[i+m-1-f] < i - g then
196 suff[i] = suff[i+m-1-f]
197 else
198 if i < g then g = i
199 f = i
200 while g >= 0 and x.chars[g] == x.chars[g + m - 1 - f] do g -= 1
201 suff[i] = f - g
202 end
203 i -= 1
204 end
205 return suff
206 end
207
208 private fun compute_gs
209 do
210 var x = _motif
211 var m = _length
212 var suff = suffixes
213 var i = 0
214 while i < m do
215 _gs[i] = m
216 i += 1
217 end
218 var j = 0
219 i = m - 1
220 while i >= -1 do
221 if i == -1 or suff[i] == i + 1 then
222 while j < m - 1 - i do
223 if _gs[j] == m then _gs[j] = m - 1 - i
224 j += 1
225 end
226 end
227 i -= 1
228 end
229 i = 0
230 while i < m - 1 do
231 _gs[m - 1 - suff[i]] = m - 1 - i
232 i += 1
233 end
234 end
235
236 redef fun hash do return _motif.hash
237 redef fun ==(o) do return o isa BM_Pattern and o._motif == _motif
238 end
239
240 # Matches are a part of a `Text` found by a `Pattern`.
241 class Match
242 # The base string matched
243 var string: String
244
245 # The starting position in the string
246 var from: Int
247
248 # The length of the matching part
249 var length: Int
250
251 # The position of the first character just after the matching part.
252 # May be out of the base string
253 fun after: Int do return from + length
254
255 # The contents of the matching part
256 redef fun to_s do return string.substring(from,length)
257
258 # Matches `len` characters of `s` from `f`.
259 init(s: String, f: Int, len: Int)
260 do
261 assert positive_length: len >= 0
262 assert valid_from: f >= 0
263 assert valid_after: f + len <= s.length
264 string = s
265 from = f
266 length = len
267 end
268 end
269
270 redef class Char
271 super Pattern
272
273 redef fun search_index_in(s, from)
274 do
275 var stop = s.length
276 while from < stop do
277 if s.chars[from] == self then return from
278 from += 1
279 end
280 return -1
281 end
282
283 redef fun search_in(s, from)
284 do
285 var pos = search_index_in(s, from)
286 if pos < 0 then
287 return null
288 else
289 return new Match(s.to_s, pos, 1)
290 end
291 end
292 end
293
294 redef class Text
295 super Pattern
296
297 redef fun search_index_in(s, from)
298 do
299 assert from >= 0
300 var stop = s.length - length + 1
301 while from < stop do
302 var i = length - 1
303 while i >= 0 and self.chars[i] == s.chars[i + from] do i -= 1
304 # Test if we found
305 if i < 0 then return from
306 # Not found so try next one
307 from += 1
308 end
309 return -1
310 end
311
312 redef fun search_in(s, from)
313 do
314 var pos = search_index_in(s, from)
315 if pos < 0 then
316 return null
317 else
318 return new Match(s.to_s, pos, length)
319 end
320 end
321
322 # Search the first occurence of the pattern `p`.
323 # Return null if not found.
324 #
325 # assert "I say hello to the world!".search("hello").from == 6
326 # assert "I say goodbye to the world!".search("hello") == null
327 fun search(p: Pattern): nullable Match do return p.search_in(self, 0)
328
329 # Search the first occurence of the pattern `p` after `from`.
330 # The search starts at `from`.
331 # Return null if not found.
332 #
333 # assert "I say hello to the world!".search_from("hello",4).from == 6
334 # assert "I say hello to the world!".search_from("hello",7) == null
335 fun search_from(p: Pattern, from: Int): nullable Match do return p.search_in(self, from)
336
337 # Search all occurrences of p into self.
338 #
339 # var a = new Array[Int]
340 # for i in "hello world".search_all('o') do
341 # a.add(i.from)
342 # end
343 # assert a == [4, 7]
344 fun search_all(p: Pattern): Array[Match] do return p.search_all_in(self)
345
346 # Split `self` using `p` as separator.
347 #
348 # assert "hello world".split('o') == ["hell", " w", "rld"]
349 fun split(p: Pattern): Array[SELFTYPE]
350 do
351 var matches = p.split_in(self)
352 var res = new Array[SELFTYPE].with_capacity(matches.length)
353 for m in matches do res.add(m.to_s)
354 return res
355 end
356
357 # @deprecated alias for `split`
358 fun split_with(p: Pattern): Array[SELFTYPE] do return self.split(p)
359
360 # Split `self` on the first `=`
361 #
362 # assert "hello".split_once_on('l') == ["he", "lo"]
363 # assert "a, b, c, d, e".split_once_on(", ") == ["a", "b, c, d, e"]
364 fun split_once_on(p: Pattern): Array[SELFTYPE]
365 do
366 var m = p.search_in(self, 0)
367 if m == null then return [self]
368 return new Array[SELFTYPE].with_items(substring(0, m.from), substring_from(m.after))
369 end
370
371 # Replace all occurences of a pattern with a string
372 #
373 # assert "hlelo".replace("le", "el") == "hello"
374 # assert "hello".replace('l', "") == "heo"
375 fun replace(p: Pattern, string: SELFTYPE): SELFTYPE
376 do
377 return self.split_with(p).join(string)
378 end
379
380 # Does `self` contains at least one instance of `pattern`?
381 #
382 # assert "hello".has('l')
383 # assert "hello".has("ll")
384 # assert not "hello".has("lll")
385 fun has(pattern: Pattern): Bool do return pattern.is_in(self)
386 end