nitgs: opt_direct_call_monomorph call `before_send` to deal with special cases
[nit.git] / lib / standard / string_search.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
4 #
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
11 # another product.
12
13 # Basic string search, match and replace.
14 module string_search
15
16 import string
17
18 # Patterns are abstract string motifs (include `String` and `Char`).
19 interface Pattern
20 # Search `self` into `s` from a certain position.
21 # Return the position of the first character of the matching section.
22 # Return -1 if not found.
23 #
24 # assert 'l'.search_index_in("hello world", 0) == 2
25 # assert 'l'.search_index_in("hello world", 3) == 3
26 # assert 'z'.search_index_in("hello world", 0) == -1
27 #
28 # This method is usually faster than `search_in` if what is
29 # required is only the index.
30 # Note: in most implementation, `search_in` is implemented with this method.
31 protected fun search_index_in(s: String, from: Int): Int is abstract
32
33 # Search `self` into `s` from a certain position.
34 # Return null if not found.
35 #
36 # assert 'l'.search_in("hello world", 0).from == 2
37 # assert 'l'.search_in("hello world", 3).from == 3
38 # assert 'z'.search_in("hello world", 0) == null
39 #
40 # If only the index of the first character if required, see `search_index_in`.
41 #
42 # Note: Is used by `String::search`, `String::search_from`, and others.
43 protected fun search_in(s: String, from: Int): nullable Match is abstract
44
45 # Search all `self` occurrences into `s`.
46 #
47 # assert 'l'.search_all_in("hello world").length == 3
48 # assert 'z'.search_all_in("hello world"),length == 0
49 #
50 # Note: Is used by `String::search_all`.
51 fun search_all_in(s: String): Array[Match]
52 do
53 var res = new Array[Match] # Result
54 var match = search_in(s, 0)
55 while match != null do
56 res.add(match)
57 match = search_in(s, match.after)
58 end
59 return res
60 end
61
62 # Split `s` using `self` is separator.
63 #
64 # Returns an array of matches that are between each occurence of `self`.
65 # If self is not present, an array with a single match on `s` is retunred.
66 #
67 # assert 'l'.split_in("hello world").join("|") == "he||o wor|d"
68 # assert 'z'.split_in("hello world").join("|") == "hello world"
69 #
70 # Note: is used by `String::split`
71 fun split_in(s: String): Array[Match]
72 do
73 var res = new Array[Match] # Result
74 var i = 0 # Cursor
75 var match = search_in(s, 0)
76 while match != null do
77 # Compute the splited part length
78 var len = match.from - i
79 res.add(new Match(s, i, len))
80 i = match.after
81 match = search_in(s, i)
82 end
83 # Add the last part
84 res.add(new Match(s, i, s.length - i))
85 return res
86 end
87 end
88
89 # BM_Pattern are pre-compiled string motif for the Boyer-Moore algorithm.
90 # (cf. A Fast String Searching Algorithm, with R.S. Boyer. Communications
91 # of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
92 # http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
93 #
94 # var pat = new BM_Pattern("hello")
95 # assert "I say hello to the world!".search(pat).from == 6
96 # assert "I say goodbye to the world!".search(pat) == null
97 class BM_Pattern
98 super Pattern
99
100 redef fun to_s do return _motif
101
102 # boyer-moore search gives the position of the first occurrence of a pattern starting at position `from`
103 redef fun search_index_in(s, from)
104 do
105 assert from >= 0
106 var n = s.length
107 var m = _length
108
109 var j = from
110 while j < n - m + 1 do
111 var i = m - 1 # Cursor in the pattern
112 while i >= 0 and _motif.chars[i] == s.chars[i + j] do i -= 1
113 if i < 0 then
114 return j
115 else
116 var gs = _gs[i] # Good shift
117 var bc = bc(s.chars[i+j]) - m + 1 + i # Bad char
118 # Both are true, do move to the best
119 if gs > bc then
120 j += gs
121 else
122 j += bc
123 end
124 end
125 end
126 return -1 # found nothing
127 end
128
129 # boyer-moore search. Return null if not found
130 redef fun search_in(s, from)
131 do
132 var to = search_index_in(s, from)
133 if to < 0 then
134 return null
135 else
136 return new Match(s, to, _length)
137 end
138 end
139
140 # Compile a new motif
141 init(motif: String)
142 do
143 _motif = motif
144 _length = motif.length
145 _gs = new Array[Int].with_capacity(_length)
146 _bc_table = new ArrayMap[Char, Int]
147 compute_gs
148 compute_bc
149 end
150
151 # searched motif
152 var _motif: String
153
154 # length of the motif
155 var _length: Int
156
157 private fun bc(e: Char): Int
158 do
159 if _bc_table.has_key(e) then
160 return _bc_table[e]
161 else
162 return _length
163 end
164 end
165
166 # good shifts
167 var _gs: Array[Int]
168
169 # bad characters
170 var _bc_table: Map[Char, Int]
171
172 private fun compute_bc
173 do
174 var x = _motif
175 var m = _length
176 var i = 0
177 while i < m - 1 do
178 _bc_table[x.chars[i]] = m - i - 1
179 i += 1
180 end
181 end
182
183 private fun suffixes: Array[Int]
184 do
185 var x = _motif
186 var m = _length
187 var suff = new Array[Int].filled_with(m, m)
188
189 var f = 0
190 var g = m - 1
191 var i = m - 2
192 while i >= 0 do
193 if i > g and suff[i+m-1-f] < i - g then
194 suff[i] = suff[i+m-1-f]
195 else
196 if i < g then g = i
197 f = i
198 while g >= 0 and x.chars[g] == x.chars[g + m - 1 - f] do g -= 1
199 suff[i] = f - g
200 end
201 i -= 1
202 end
203 return suff
204 end
205
206 private fun compute_gs
207 do
208 var x = _motif
209 var m = _length
210 var suff = suffixes
211 var i = 0
212 while i < m do
213 _gs[i] = m
214 i += 1
215 end
216 var j = 0
217 i = m - 1
218 while i >= -1 do
219 if i == -1 or suff[i] == i + 1 then
220 while j < m - 1 - i do
221 if _gs[j] == m then _gs[j] = m - 1 - i
222 j += 1
223 end
224 end
225 i -= 1
226 end
227 i = 0
228 while i < m - 1 do
229 _gs[m - 1 - suff[i]] = m - 1 - i
230 i += 1
231 end
232 end
233
234 redef fun hash do return _motif.hash
235 redef fun ==(o) do return o isa BM_Pattern and o._motif == _motif
236 end
237
238 # Matches are a part of a `String` found ba a `Pattern`.
239 class Match
240 # The base string matched
241 readable var _string: String
242
243 # The starting position in the string
244 readable var _from: Int
245
246 # The length of the matching part
247 readable var _length: Int
248
249 # The position of the first character just after the matching part.
250 # May be out of the base string
251 fun after: Int do return _from + _length
252
253 # The contents of the matching part
254 redef fun to_s do return _string.substring(_from, _length)
255
256 # Matches `len` characters of `s` from `f`.
257 init(s: String, f: Int, len: Int)
258 do
259 assert positive_length: len >= 0
260 assert valid_from: f >= 0
261 assert valid_after: f + len <= s.length
262 _string = s
263 _from = f
264 _length = len
265 end
266 end
267
268 redef class Char
269 super Pattern
270
271 redef fun search_index_in(s, from)
272 do
273 var stop = s.length
274 while from < stop do
275 if s.chars[from] == self then return from
276 from += 1
277 end
278 return -1
279 end
280
281 redef fun search_in(s, from)
282 do
283 var pos = search_index_in(s, from)
284 if pos < 0 then
285 return null
286 else
287 return new Match(s, pos, 1)
288 end
289 end
290 end
291
292 redef class String
293 super Pattern
294
295 redef fun search_index_in(s, from)
296 do
297 assert from >= 0
298 var stop = s.length - length + 1
299 while from < stop do
300 var i = length - 1
301 while i >= 0 and self.chars[i] == s.chars[i + from] do i -= 1
302 # Test if we found
303 if i < 0 then return from
304 # Not found so try next one
305 from += 1
306 end
307 return -1
308 end
309
310 redef fun search_in(s, from)
311 do
312 var pos = search_index_in(s, from)
313 if pos < 0 then
314 return null
315 else
316 return new Match(s, pos, length)
317 end
318 end
319
320 # Search the first occurence of the pattern `p`.
321 # Return null if not found.
322 #
323 # assert "I say hello to the world!".search("hello").from == 6
324 # assert "I say goodbye to the world!".search("hello") == null
325 fun search(p: Pattern): nullable Match do return p.search_in(self, 0)
326
327 # Search the first occurence of the pattern `p` after `from`.
328 # The search starts at `from`.
329 # Return null if not found.
330 #
331 # assert "I say hello to the world!".search_from("hello",4).from == 6
332 # assert "I say hello to the world!".search_from("hello",7) == null
333 fun search_from(p: Pattern, from: Int): nullable Match do return p.search_in(self, from)
334
335 # Search all occurrences of p into self.
336 #
337 # var a = new Array[Int]
338 # for i in "hello world".search_all('o') do
339 # a.add(i.from)
340 # end
341 # assert a == [4, 7]
342 fun search_all(p: Pattern): Array[Match] do return p.search_all_in(self)
343
344 # Split `self` using `p` as separator.
345 #
346 # assert "hello world".split('o') == ["hell", " w", "rld"]
347 fun split(p: Pattern): Array[String]
348 do
349 var matches = p.split_in(self)
350 var res = new Array[String].with_capacity(matches.length)
351 for m in matches do res.add(m.to_s)
352 return res
353 end
354
355 # @deprecated alias for `split`
356 fun split_with(p: Pattern): Array[String] do return self.split(p)
357
358 # Replace all occurences of a pattern with a string
359 #
360 # assert "hlelo".replace("le", "el") == "hello"
361 # assert "hello".replace('l', "") == "heo"
362 fun replace(p: Pattern, string: String): String
363 do
364 return self.split_with(p).join(string)
365 end
366
367 # Escape the four characters `<`, `>`, `&`, and `"` with their html counterpart
368 #
369 # assert "a&b->\"x\"".html_escape == "a&amp;b-&gt;&quot;x&quot;"
370 fun html_escape: String
371 do
372 var ret = self
373 if ret.chars.has('&') then ret = ret.replace('&', "&amp;")
374 if ret.chars.has('<') then ret = ret.replace('<', "&lt;")
375 if ret.chars.has('>') then ret = ret.replace('>', "&gt;")
376 if ret.chars.has('"') then ret = ret.replace('"', "&quot;")
377 return ret
378 end
379 end