core/text: fix input type in `replace`
[nit.git] / lib / core / text / string_search.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
4 #
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
11 # another product.
12
13 # Basic string search, match and replace.
14 module string_search
15
16 import abstract_text
17
18 # Patterns are abstract string motifs (include `String` and `Char`).
19 interface Pattern
20 # Search `self` into `s` from a certain position.
21 # Return the position of the first character of the matching section.
22 # Return -1 if not found.
23 #
24 # assert 'l'.search_index_in("hello world", 0) == 2
25 # assert 'l'.search_index_in("hello world", 3) == 3
26 # assert 'z'.search_index_in("hello world", 0) == -1
27 #
28 # This method is usually faster than `search_in` if what is
29 # required is only the index.
30 # Note: in most implementation, `search_in` is implemented with this method.
31 protected fun search_index_in(s: Text, from: Int): Int is abstract
32
33 # Search `self` into `s` from a certain position.
34 # Return null if not found.
35 #
36 # assert 'l'.search_in("hello world", 0).from == 2
37 # assert 'l'.search_in("hello world", 3).from == 3
38 # assert 'z'.search_in("hello world", 0) == null
39 #
40 # If only the index of the first character if required, see `search_index_in`.
41 #
42 # Note: Is used by `String::search`, `String::search_from`, and others.
43 protected fun search_in(s: Text, from: Int): nullable Match is abstract
44
45 # Search all `self` occurrences into `s`.
46 #
47 # assert 'l'.search_all_in("hello world").length == 3
48 # assert 'z'.search_all_in("hello world").length == 0
49 #
50 # Note: Is used by `String::search_all`.
51 protected fun search_all_in(s: Text): Array[Match]
52 do
53 var res = new Array[Match] # Result
54 var match = search_in(s, 0)
55 while match != null do
56 res.add(match)
57 match = search_in(s, match.after)
58 end
59 return res
60 end
61
62 # Split `s` using `self` is separator.
63 #
64 # Returns an array of matches that are between each occurence of `self`.
65 # If self is not present, an array with a single match on `s` is retunred.
66 #
67 # assert 'l'.split_in("hello world").join("|") == "he||o wor|d"
68 # assert 'z'.split_in("hello world").join("|") == "hello world"
69 #
70 # Note: is used by `String::split`
71 protected fun split_in(s: Text): Array[Match]
72 do
73 var res = new Array[Match] # Result
74 var i = 0 # Cursor
75 var match = search_in(s, 0)
76 while match != null do
77 # Compute the splited part length
78 var len = match.from - i
79 res.add(new Match(s.to_s, i, len))
80 i = match.after
81 match = search_in(s, i)
82 end
83 # Add the last part
84 res.add(new Match(s.to_s, i, s.length - i))
85 return res
86 end
87
88 # Is `self` in `s`?
89 protected fun is_in(s: Text): Bool do return search_index_in(s, 0) != -1
90 end
91
92 # BM_Pattern are pre-compiled string motif for the Boyer-Moore algorithm.
93 # (cf. A Fast String Searching Algorithm, with R.S. Boyer. Communications
94 # of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
95 # http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
96 #
97 # var pat = new BM_Pattern("hello")
98 # assert "I say hello to the world!".search(pat).from == 6
99 # assert "I say goodbye to the world!".search(pat) == null
100 class BM_Pattern
101 super Pattern
102
103 redef fun to_s do return _motif
104
105 # boyer-moore search gives the position of the first occurrence of a pattern starting at position `from`
106 redef fun search_index_in(s, from)
107 do
108 assert from >= 0
109 var n = s.length
110 var m = _length
111
112 var j = from
113 while j < n - m + 1 do
114 var i = m - 1 # Cursor in the pattern
115 while i >= 0 and _motif[i] == s[i + j] do i -= 1
116 if i < 0 then
117 return j
118 else
119 var gs = _gs[i] # Good shift
120 var bc = bc(s[i+j]) - m + 1 + i # Bad char
121 # Both are true, do move to the best
122 if gs > bc then
123 j += gs
124 else
125 j += bc
126 end
127 end
128 end
129 return -1 # found nothing
130 end
131
132 # boyer-moore search. Return null if not found
133 redef fun search_in(s, from)
134 do
135 var to = search_index_in(s, from)
136 if to < 0 then
137 return null
138 else
139 return new Match(s.to_s, to, _length)
140 end
141 end
142
143 init
144 do
145 _length = _motif.length
146 _gs = new Array[Int].with_capacity(_length)
147 compute_gs
148 compute_bc
149 end
150
151 # searched motif
152 private var motif: String
153
154 # length of the motif
155 private var length: Int is noinit
156
157 private fun bc(e: Char): Int
158 do
159 if _bc_table.has_key(e) then
160 return _bc_table[e]
161 else
162 return _length
163 end
164 end
165
166 # good shifts
167 private var gs: Array[Int] is noinit
168
169 # bad characters
170 private var bc_table = new ArrayMap[Char, Int]
171
172 private fun compute_bc
173 do
174 var x = _motif
175 var m = _length
176 var i = 0
177 while i < m - 1 do
178 _bc_table[x.chars[i]] = m - i - 1
179 i += 1
180 end
181 end
182
183 private fun suffixes: Array[Int]
184 do
185 var x = _motif
186 var m = _length
187 var suff = new Array[Int].filled_with(m, m)
188
189 var f = 0
190 var g = m - 1
191 var i = m - 2
192 while i >= 0 do
193 if i > g and suff[i+m-1-f] < i - g then
194 suff[i] = suff[i+m-1-f]
195 else
196 if i < g then g = i
197 f = i
198 while g >= 0 and x.chars[g] == x.chars[g + m - 1 - f] do g -= 1
199 suff[i] = f - g
200 end
201 i -= 1
202 end
203 return suff
204 end
205
206 private fun compute_gs
207 do
208 var m = _length
209 var suff = suffixes
210 var i = 0
211 while i < m do
212 _gs[i] = m
213 i += 1
214 end
215 var j = 0
216 i = m - 1
217 while i >= -1 do
218 if i == -1 or suff[i] == i + 1 then
219 while j < m - 1 - i do
220 if _gs[j] == m then _gs[j] = m - 1 - i
221 j += 1
222 end
223 end
224 i -= 1
225 end
226 i = 0
227 while i < m - 1 do
228 _gs[m - 1 - suff[i]] = m - 1 - i
229 i += 1
230 end
231 end
232
233 redef fun hash do return _motif.hash
234 redef fun ==(o) do return o isa BM_Pattern and o._motif == _motif
235 end
236
237 # Matches are a part of a `Text` found by a `Pattern`.
238 class Match
239 # The base string matched
240 #
241 # ~~~
242 # var m = "hello world".search("lo")
243 # assert m.string == "hello world"
244 # ~~~
245 var string: String
246
247 # The starting position in the string
248 #
249 # ~~~
250 # var m = "hello world".search("lo")
251 # assert m.from == 3
252 # ~~~
253 var from: Int
254
255 # The length of the matching part
256 #
257 # ~~~
258 # var m = "hello world".search("lo")
259 # assert m.length == 2
260 # ~~~
261 var length: Int
262
263 # The position of the first character just after the matching part.
264 # May be out of the base string
265 #
266 # ~~~
267 # var m = "hello world".search("lo")
268 # assert m.after == 5
269 # ~~~
270 fun after: Int do return from + length
271
272 # The contents of the matching part
273 #
274 # ~~~
275 # var m = "hello world".search("lo")
276 # assert m.to_s == "lo"
277 # ~~~
278 redef fun to_s do return string.substring(from,length)
279
280 # The content of `string` before the match
281 #
282 # ~~~
283 # var m = "hello world".search("lo")
284 # assert m.text_before == "hel"
285 # ~~~
286 fun text_before: String do return string.substring(0, from)
287
288 # The content of `string` after the match
289 #
290 # ~~~
291 # var m = "hello world".search("lo")
292 # assert m.text_after == " world"
293 # ~~~
294 fun text_after: String do return string.substring_from(after)
295
296 init
297 do
298 assert positive_length: length >= 0
299 assert valid_from: from >= 0
300 assert valid_after: from + length <= string.length
301 end
302 end
303
304 redef class Char
305 super Pattern
306
307 redef fun search_index_in(s, from)
308 do
309 var stop = s.length
310 while from < stop do
311 if s[from] == self then return from
312 from += 1
313 end
314 return -1
315 end
316
317 redef fun search_in(s, from)
318 do
319 var pos = search_index_in(s, from)
320 if pos < 0 then
321 return null
322 else
323 return new Match(s.to_s, pos, 1)
324 end
325 end
326 end
327
328 redef class Text
329 super Pattern
330
331 redef fun search_index_in(s, from)
332 do
333 assert from >= 0
334 var stop = s.length - length + 1
335 while from < stop do
336 var i = length - 1
337 while i >= 0 and self[i] == s[i + from] do i -= 1
338 # Test if we found
339 if i < 0 then return from
340 # Not found so try next one
341 from += 1
342 end
343 return -1
344 end
345
346 redef fun search_in(s, from)
347 do
348 var pos = search_index_in(s, from)
349 if pos < 0 then
350 return null
351 else
352 return new Match(s.to_s, pos, length)
353 end
354 end
355
356 # Search the first occurence of `pattern`.
357 # Return null if not found.
358 #
359 # assert "I say hello to the world!".search("hello").from == 6
360 # assert "I say goodbye to the world!".search("hello") == null
361 fun search(pattern: Pattern): nullable Match do return pattern.search_in(self, 0)
362
363 # Search the first occurence of `pattern` after `from`.
364 # The search starts at `from`.
365 # Return null if not found.
366 #
367 # assert "I say hello to the world!".search_from("hello",4).from == 6
368 # assert "I say hello to the world!".search_from("hello",7) == null
369 fun search_from(pattern: Pattern, from: Int): nullable Match do return pattern.search_in(self, from)
370
371 # Search the last occurence of the text `t`.
372 #
373 # assert "bob".search_last("b").from == 2
374 # assert "bob".search_last("bo").from == 0
375 # assert "bob".search_last("ob").from == 1
376 # assert "bobob".search_last("ob").from == 3
377 # assert "bobbob".search_last("bb").from == 2
378 # assert "bobbob".search_last("bob").from == 3
379 # assert "bob".search_last("z") == null
380 # assert "".search_last("b") == null
381 fun search_last(t: Text): nullable Match do
382 return search_last_up_to(t, length)
383 end
384
385 # Search the last occurence of the text `t` before `up_to`.
386 #
387 # assert "bobbob".search_last_up_to("b", 3).from == 2
388 # assert "bobbob".search_last_up_to("b", 6).from == 5
389 # assert "bobbob".search_last_up_to("b", 0) == null
390 fun search_last_up_to(t: Text, up_to: Int): nullable Match do
391 var i = up_to - t.length
392
393 while i >= 0 do
394 if substring(i, t.length) == t then
395 return new Match(self.to_s, i, t.length)
396 end
397 i -= 1
398 end
399 return null
400 end
401
402 # Extract a given prefix, if any.
403 #
404 # ~~~
405 # var p = "hello world".prefix("hello")
406 # assert p != null
407 # assert p.text_after == " world"
408 # ~~~
409 fun prefix(t: Text): nullable Match do
410 var len = t.length
411 if substring(0, len) == t then
412 return new Match(self.to_s, 0, len)
413 end
414 return null
415 end
416
417 # Extract a given suffix, if any.
418 #
419 # ~~~
420 # var p = "hello world".suffix("world")
421 # assert p != null
422 # assert p.text_before == "hello "
423 # ~~~
424 fun suffix(t: Text): nullable Match do
425 var len = t.length
426 var from = length - len
427 if substring(from, len) == t then
428 return new Match(self.to_s, from, len)
429 end
430 return null
431 end
432
433 # Search all occurrences of `pattern` into self.
434 #
435 # var a = new Array[Int]
436 # for i in "hello world".search_all('o') do
437 # a.add(i.from)
438 # end
439 # assert a == [4, 7]
440 fun search_all(pattern: Pattern): Array[Match] do return pattern.search_all_in(self)
441
442 # Split `self` using `pattern` as separator.
443 #
444 # assert "hello world".split('o') == ["hell", " w", "rld"]
445 fun split(pattern: Pattern): Array[String]
446 do
447 var matches = pattern.split_in(self)
448 var res = new Array[String].with_capacity(matches.length)
449 for m in matches do res.add(m.to_s)
450 return res
451 end
452
453 # @deprecated alias for `split`
454 fun split_with(pattern: Pattern): Array[String] do return self.split(pattern)
455
456 # Split `self` on the first occurence of `pattern`
457 #
458 # assert "hello".split_once_on('l') == ["he", "lo"]
459 # assert "a, b, c, d, e".split_once_on(", ") == ["a", "b, c, d, e"]
460 fun split_once_on(pattern: Pattern): Array[SELFTYPE]
461 do
462 var m = pattern.search_in(self, 0)
463 var res = new Array[SELFTYPE]
464 if m == null then
465 res.add self
466 else
467 res.add substring(0, m.from)
468 res.add substring_from(m.after)
469 end
470 return res
471 end
472
473 # Replace all occurrences of `pattern` with `string`
474 #
475 # assert "hlelo".replace("le", "el") == "hello"
476 # assert "hello".replace('l', "") == "heo"
477 #
478 # var t: Text = "hello"
479 # assert t.replace("hello", new FlatBuffer) == ""
480 fun replace(pattern: Pattern, string: Text): String
481 do
482 return self.split_with(pattern).join(string)
483 end
484
485 # Does `self` contains at least one instance of `pattern`?
486 #
487 # assert "hello".has('l')
488 # assert "hello".has("ll")
489 # assert not "hello".has("lll")
490 fun has(pattern: Pattern): Bool do return pattern.is_in(self)
491
492 # Returns a copy of `self` minus all occurences of `pattern`
493 #
494 # assert "__init__".remove_all('_') == "init"
495 # assert "abcd".remove_all("bc") == "ad"
496 # assert "abcd".remove_all("[ad]".to_re) == "bc"
497 fun remove_all(pattern: Pattern): String do return split(pattern).join
498 end