syntax: 'meth' -> 'fun', 'attr' -> 'var'
[nit.git] / lib / standard / string_search.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
4 #
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
11 # another product.
12
13 # This module is about string search and matching.
14 # It includes some good features
15 package string_search
16
17 import string
18
19 # Patterns are string motifs.
20 class Pattern
21 # Search `self' into `s' from a certain position.
22 # Return the position of the first character of the matching section.
23 # Return -1 if not found.
24 fun search_index_in(s: String, from: Int): Int is abstract
25
26 # Search `self' into `s' from a certain position.
27 # Return null if not found.
28 fun search_in(s: String, from: Int): nullable Match is abstract
29
30 # Search all `self' occucences into `s'.
31 fun search_all_in(s: String): Array[Match]
32 do
33 var res = new Array[Match] # Result
34 var match = search_in(s, 0)
35 while match != null do
36 res.add(match)
37 match = search_in(s, match.after)
38 end
39 return res
40 end
41
42 # Split `s' using `self' is separator.
43 fun split_in(s: String): Array[Match]
44 do
45 var res = new Array[Match] # Result
46 var i = 0 # Cursor
47 var match = search_in(s, 0)
48 while match != null do
49 # Compute the splited part length
50 var len = match.from - i
51 res.add(new Match(s, i, len))
52 i = match.after
53 match = search_in(s, i)
54 end
55 # Add the last part
56 res.add(new Match(s, i, s.length - i))
57 return res
58 end
59 end
60
61 # BM_Pattern are precompiled string motif for the Boyer-Moore Fast String Searching Algorithm
62 # (cf. A Fast String Searching Algorithm, with R.S. Boyer.
63 # Communications of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
64 # see also http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
65 class BM_Pattern
66 special Pattern
67 redef fun to_s do return _motif
68
69 # boyer-moore search gives the position of the first occurence of a pattern starting at position `from'
70 redef fun search_index_in(s, from)
71 do
72 assert from >= 0
73 var n = s.length
74 var m = _length
75
76 var j = from
77 while j < n - m + 1 do
78 var i = m - 1 # Cursor in the pattern
79 while i >= 0 and _motif[i] == s[i + j] do i -= 1
80 if (i < 0) then
81 return j
82 else
83 var gs = _gs[i] # Good shift
84 var bc = bc(s[i+j]) - m + 1 + i # Bad char
85 # Both are true, do move to the best
86 if gs > bc then
87 j += gs
88 else
89 j += bc
90 end
91 end
92 end
93 return -1 # found nothing
94 end
95
96 # boyer-moore search. Return null if not found
97 redef fun search_in(s, from)
98 do
99 var to = search_index_in(s, from)
100 if to < 0 then
101 return null
102 else
103 return new Match(s, to, _length)
104 end
105 end
106
107 # Compile a new motif
108 init(motif: String)
109 do
110 _motif = motif
111 _length = motif.length
112 _gs = new Array[Int].with_capacity(_length)
113 _bc_table = new ArrayMap[Char, Int]
114 compute_gs
115 compute_bc
116 end
117
118 # searched motif
119 var _motif: String
120
121 # length of the motif
122 var _length: Int
123
124 private fun bc(e: Char): Int
125 do
126 if _bc_table.has_key(e) then
127 return _bc_table[e]
128 else
129 return _length
130 end
131 end
132
133 # good shifts
134 var _gs: Array[Int]
135
136 # bad characters
137 var _bc_table: Map[Char, Int]
138
139 private fun compute_bc
140 do
141 var x = _motif
142 var m = _length
143 var i = 0
144 while i < m - 1 do
145 _bc_table[x[i]] = m - i - 1
146 i += 1
147 end
148 end
149
150 private fun suffixes: Array[Int]
151 do
152 var x = _motif
153 var m = _length
154 var suff = new Array[Int].filled_with(m, m)
155
156 var f = 0
157 var g = m - 1
158 var i = m - 2
159 while i >= 0 do
160 if i > g and suff[i+m-1-f] < i - g then
161 suff[i] = suff[i+m-1-f]
162 else
163 if i < g then g = i
164 f = i
165 while g >= 0 and x[g] == x[g + m - 1 - f] do g -= 1
166 suff[i] = f - g
167 end
168 i -= 1
169 end
170 return suff
171 end
172
173 private fun compute_gs
174 do
175 var x = _motif
176 var m = _length
177 var suff = suffixes
178 var i = 0
179 while i < m do
180 _gs[i] = m
181 i += 1
182 end
183 var j = 0
184 i = m - 1
185 while i >= -1 do
186 if i == -1 or suff[i] == i + 1 then
187 while j < m - 1 - i do
188 if _gs[j] == m then _gs[j] = m - 1 - i
189 j += 1
190 end
191 end
192 i -= 1
193 end
194 i = 0
195 while i < m - 1 do
196 _gs[m - 1 - suff[i]] = m - 1 - i
197 i += 1
198 end
199 end
200 end
201
202 # Matches are a part of a string.
203 class Match
204 # The base string matched
205 readable var _string: String
206
207 # The starting position in the string
208 readable var _from: Int
209
210 # The length of the mathching part
211 readable var _length: Int
212
213 # The position of the first character just after the matching part.
214 # May be out of the base string
215 fun after: Int do return _from + _length
216
217 # The contents of the mathing part
218 redef fun to_s do return _string.substring(_from, _length)
219
220 # Matches `len' characters of `s' from `f'.
221 init(s: String, f: Int, len: Int)
222 do
223 assert positive_length: len >= 0
224 assert valid_from: f >= 0
225 assert valid_after: f + len <= s.length
226 _string = s
227 _from = f
228 _length = len
229 end
230 end
231
232 redef class Char
233 special Pattern
234 redef fun search_index_in(s, from)
235 do
236 var stop = s.length
237 while from < stop do
238 if s[from] == self then return from
239 from += 1
240 end
241 return -1
242 end
243
244 redef fun search_in(s, from)
245 do
246 var pos = search_index_in(s, from)
247 if pos < 0 then
248 return null
249 else
250 return new Match(s, pos, 1)
251 end
252 end
253 end
254
255 redef class String
256 special Pattern
257 redef fun search_index_in(s, from)
258 do
259 assert from >= 0
260 var stop = s.length - length + 1
261 while from < stop do
262 var i = length - 1
263 while i >= 0 and self[i] == s[i + from] do i -= 1
264 # Test if we found
265 if i < 0 then return from
266 # Not found so try next one
267 from += 1
268 end
269 return -1
270 end
271
272 redef fun search_in(s, from)
273 do
274 var pos = search_index_in(s, from)
275 if pos < 0 then
276 return null
277 else
278 return new Match(s, pos, length)
279 end
280 end
281
282 # Like `search_from' but from the first chararter.
283 fun search(p: Pattern): nullable Match do return p.search_in(self, 0)
284
285 # Search the given pattern into self from a.
286 # The search starts at `from'.
287 # Return null if not found.
288 fun search_from(p: Pattern, from: Int): nullable Match do return p.search_in(self, from)
289
290 # Search all occurences of p into self.
291 #
292 # var a = new Array[Int]
293 # for i in "hello world".searches('o') do
294 # a.add(i.from)
295 # end
296 # a # -> [4, 7]
297 fun search_all(p: Pattern): Array[Match] do return p.search_all_in(self)
298
299 # Split self using p is separator.
300 # "hello world".split('o') # -> ["hell", " w", "rld"]
301 fun split_with(p: Pattern): Array[String]
302 do
303 var matches = p.split_in(self)
304 var res = new Array[String].with_capacity(matches.length)
305 for m in matches do res.add(m.to_s)
306 return res
307 end
308
309 # Split self using '\n' is separator.
310 # "hello\nworld".split # -> ["hello","world"]
311 fun split: Array[String] do return split_with('\n')
312 end