6aff1cd2c2baa2b1298790b4fd7909a2549644b9
[nit.git] / lib / standard / string_search.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
4 #
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
11 # another product.
12
13 # Basic string search, match and replace.
14 package string_search
15
16 import string
17
18 # Patterns are abstract string motifs (include `String` and `Char`).
19 interface Pattern
20 # Search `self` into `s` from a certain position.
21 # Return the position of the first character of the matching section.
22 # Return -1 if not found.
23 fun search_index_in(s: String, from: Int): Int is abstract
24
25 # Search `self` into `s` from a certain position.
26 # Return null if not found.
27 fun search_in(s: String, from: Int): nullable Match is abstract
28
29 # Search all `self` occurrences into `s`.
30 fun search_all_in(s: String): Array[Match]
31 do
32 var res = new Array[Match] # Result
33 var match = search_in(s, 0)
34 while match != null do
35 res.add(match)
36 match = search_in(s, match.after)
37 end
38 return res
39 end
40
41 # Split `s` using `self` is separator.
42 fun split_in(s: String): Array[Match]
43 do
44 var res = new Array[Match] # Result
45 var i = 0 # Cursor
46 var match = search_in(s, 0)
47 while match != null do
48 # Compute the splited part length
49 var len = match.from - i
50 res.add(new Match(s, i, len))
51 i = match.after
52 match = search_in(s, i)
53 end
54 # Add the last part
55 res.add(new Match(s, i, s.length - i))
56 return res
57 end
58 end
59
60 # BM_Pattern are pre-compiled string motif for the Boyer-Moore algorithm.
61 # (cf. A Fast String Searching Algorithm, with R.S. Boyer. Communications
62 # of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
63 # http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
64 class BM_Pattern
65 super Pattern
66
67 redef fun to_s do return _motif
68
69 # boyer-moore search gives the position of the first occurrence of a pattern starting at position `from`
70 redef fun search_index_in(s, from)
71 do
72 assert from >= 0
73 var n = s.length
74 var m = _length
75
76 var j = from
77 while j < n - m + 1 do
78 var i = m - 1 # Cursor in the pattern
79 while i >= 0 and _motif[i] == s[i + j] do i -= 1
80 if i < 0 then
81 return j
82 else
83 var gs = _gs[i] # Good shift
84 var bc = bc(s[i+j]) - m + 1 + i # Bad char
85 # Both are true, do move to the best
86 if gs > bc then
87 j += gs
88 else
89 j += bc
90 end
91 end
92 end
93 return -1 # found nothing
94 end
95
96 # boyer-moore search. Return null if not found
97 redef fun search_in(s, from)
98 do
99 var to = search_index_in(s, from)
100 if to < 0 then
101 return null
102 else
103 return new Match(s, to, _length)
104 end
105 end
106
107 # Compile a new motif
108 init(motif: String)
109 do
110 _motif = motif
111 _length = motif.length
112 _gs = new Array[Int].with_capacity(_length)
113 _bc_table = new ArrayMap[Char, Int]
114 compute_gs
115 compute_bc
116 end
117
118 # searched motif
119 var _motif: String
120
121 # length of the motif
122 var _length: Int
123
124 private fun bc(e: Char): Int
125 do
126 if _bc_table.has_key(e) then
127 return _bc_table[e]
128 else
129 return _length
130 end
131 end
132
133 # good shifts
134 var _gs: Array[Int]
135
136 # bad characters
137 var _bc_table: Map[Char, Int]
138
139 private fun compute_bc
140 do
141 var x = _motif
142 var m = _length
143 var i = 0
144 while i < m - 1 do
145 _bc_table[x[i]] = m - i - 1
146 i += 1
147 end
148 end
149
150 private fun suffixes: Array[Int]
151 do
152 var x = _motif
153 var m = _length
154 var suff = new Array[Int].filled_with(m, m)
155
156 var f = 0
157 var g = m - 1
158 var i = m - 2
159 while i >= 0 do
160 if i > g and suff[i+m-1-f] < i - g then
161 suff[i] = suff[i+m-1-f]
162 else
163 if i < g then g = i
164 f = i
165 while g >= 0 and x[g] == x[g + m - 1 - f] do g -= 1
166 suff[i] = f - g
167 end
168 i -= 1
169 end
170 return suff
171 end
172
173 private fun compute_gs
174 do
175 var x = _motif
176 var m = _length
177 var suff = suffixes
178 var i = 0
179 while i < m do
180 _gs[i] = m
181 i += 1
182 end
183 var j = 0
184 i = m - 1
185 while i >= -1 do
186 if i == -1 or suff[i] == i + 1 then
187 while j < m - 1 - i do
188 if _gs[j] == m then _gs[j] = m - 1 - i
189 j += 1
190 end
191 end
192 i -= 1
193 end
194 i = 0
195 while i < m - 1 do
196 _gs[m - 1 - suff[i]] = m - 1 - i
197 i += 1
198 end
199 end
200
201 redef fun hash do return _motif.hash
202 redef fun ==(o) do return o isa BM_Pattern and o._motif == _motif
203 end
204
205 # Matches are a part of a string.
206 class Match
207 # The base string matched
208 readable var _string: String
209
210 # The starting position in the string
211 readable var _from: Int
212
213 # The length of the matching part
214 readable var _length: Int
215
216 # The position of the first character just after the matching part.
217 # May be out of the base string
218 fun after: Int do return _from + _length
219
220 # The contents of the matching part
221 redef fun to_s do return _string.substring(_from, _length)
222
223 # Matches `len` characters of `s` from `f`.
224 init(s: String, f: Int, len: Int)
225 do
226 assert positive_length: len >= 0
227 assert valid_from: f >= 0
228 assert valid_after: f + len <= s.length
229 _string = s
230 _from = f
231 _length = len
232 end
233 end
234
235 redef class Char
236 super Pattern
237
238 redef fun search_index_in(s, from)
239 do
240 var stop = s.length
241 while from < stop do
242 if s[from] == self then return from
243 from += 1
244 end
245 return -1
246 end
247
248 redef fun search_in(s, from)
249 do
250 var pos = search_index_in(s, from)
251 if pos < 0 then
252 return null
253 else
254 return new Match(s, pos, 1)
255 end
256 end
257 end
258
259 redef class String
260 super Pattern
261
262 redef fun search_index_in(s, from)
263 do
264 assert from >= 0
265 var stop = s.length - length + 1
266 while from < stop do
267 var i = length - 1
268 while i >= 0 and self[i] == s[i + from] do i -= 1
269 # Test if we found
270 if i < 0 then return from
271 # Not found so try next one
272 from += 1
273 end
274 return -1
275 end
276
277 redef fun search_in(s, from)
278 do
279 var pos = search_index_in(s, from)
280 if pos < 0 then
281 return null
282 else
283 return new Match(s, pos, length)
284 end
285 end
286
287 # Like `search_from` but from the first character.
288 fun search(p: Pattern): nullable Match do return p.search_in(self, 0)
289
290 # Search the given pattern into self from a.
291 # The search starts at `from`.
292 # Return null if not found.
293 fun search_from(p: Pattern, from: Int): nullable Match do return p.search_in(self, from)
294
295 # Search all occurrences of p into self.
296 #
297 # var a = new Array[Int]
298 # for i in "hello world".search_all('o') do
299 # a.add(i.from)
300 # end
301 # assert a == [4, 7]
302 fun search_all(p: Pattern): Array[Match] do return p.search_all_in(self)
303
304 # Split `self` using `p` as separator.
305 #
306 # assert "hello world".split('o') == ["hell", " w", "rld"]
307 fun split(p: Pattern): Array[String]
308 do
309 var matches = p.split_in(self)
310 var res = new Array[String].with_capacity(matches.length)
311 for m in matches do res.add(m.to_s)
312 return res
313 end
314
315 # @deprecated alias for `split`
316 fun split_with(p: Pattern): Array[String] do return self.split(p)
317
318 # Replace all occurences of a pattern with a string
319 #
320 # assert "hlelo".replace("le", "el") == "hello"
321 # assert "hello".replace('l', "") == "heo"
322 fun replace(p: Pattern, string: String): String
323 do
324 return self.split_with(p).join(string)
325 end
326
327 # Escape the four characters < > & and " with their html counterpart
328 #
329 # assert "a&b->\"x\"".html_escape == "a&amp;b-&gt;&quot;x&quot;"
330 fun html_escape: String
331 do
332 var ret = self
333 if ret.has('&') then ret = ret.replace('&', "&amp;")
334 if ret.has('<') then ret = ret.replace('<', "&lt;")
335 if ret.has('>') then ret = ret.replace('>', "&gt;")
336 if ret.has('"') then ret = ret.replace('"', "&quot;")
337 return ret
338 end
339 end