code: update references to `standard`
[nit.git] / lib / core / re.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Regular expression support for all services based on `Pattern`
18 #
19 # Implemented using libc regular expressions.
20 #
21 # The main entities are `Text::to_re` and `Regex`.
22 module re
23
24 import text
25 import gc
26 import error
27
28 in "C Header" `{
29 #include <sys/types.h>
30 #include <regex.h>
31 `}
32
33 # Main extern class to wrap libc regular expression support
34 #
35 # It is recommanded to use the higher level API offered by the class `Regex`,
36 # but it can still be used for advanced purpose or in optimized code.
37 #
38 # To use this class and other `private` entities of this module, use `intrude import core::re`
39 private extern class NativeRegex `{ regex_t* `}
40 # Allocate a new `NativeRegex`, it must then be compiled using `regcomp` before calling `regexec`
41 new malloc `{ return malloc(sizeof(regex_t)); `}
42
43 # Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches
44 fun regcomp(regex: NativeString, cflags: Int): Int `{
45 return regcomp(self, regex, cflags);
46 `}
47
48 # Match `string` against the precompiled pattern buffer of `self`, locating matches
49 #
50 # `nmatch` and `pmatch` are used to provide information regarding the location of any matches.
51 # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
52 fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
53 return regexec(self, string, nmatch, pmatch, eflags);
54 `}
55
56 # Match `string` against the precompiled pattern buffer of `self`, do not locate matches
57 #
58 # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
59 fun regexec_match_only(string: NativeString, eflags: Int): Int `{
60 return regexec(self, string, 0, NULL, eflags);
61 `}
62
63 # Free the memory allocated to the pattern buffer by the compiling process
64 #
65 # Does not free the memory holding `self`, use `free` for this purpose.
66 fun regfree `{ regfree(self); `}
67
68 # Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings
69 fun regerror(errcode: Int): NativeString `{
70 size_t len = regerror(errcode, self, NULL, 0);
71 char *message = malloc(len);
72 regerror(errcode, self, message, len);
73
74 return message;
75 `}
76
77 # Number of parenthetical subexpressions in this compiled regular expression
78 fun re_nsub: Int `{ return self->re_nsub; `}
79 end
80
81 # Flags for `NativeRegex::regcomp`
82
83 private fun flag_extended: Int `{ return REG_EXTENDED; `}
84 private fun flag_icase: Int `{ return REG_ICASE; `}
85 private fun flag_nosub: Int `{ return REG_NOSUB; `}
86 private fun flag_newline: Int `{ return REG_NEWLINE; `}
87
88 # Flags for `NativeRegex::regexec`
89
90 private fun flag_notbol: Int `{ return REG_NOTBOL; `}
91 private fun flag_noteol: Int `{ return REG_NOTEOL; `}
92
93 # Errors of `NativeRegex::regexec`
94
95 private fun error_nomatch: Int `{ return REG_NOMATCH; `}
96 private fun error_espace: Int `{ return REG_ESPACE; `}
97
98 redef universal Int
99 private fun is_nomatch: Bool `{ return self == REG_NOMATCH; `}
100 end
101
102 # An array of `regmatch_t` or a pointer to one
103 private extern class NativeMatchArray `{ regmatch_t* `}
104 # Allocate a new array of `length` `regmatch_t`
105 new malloc(length: Int) `{ return malloc(length * sizeof(regmatch_t)); `}
106
107 # The offset in string of the beginning of a substring
108 fun rm_so: Int `{ return self->rm_so; `}
109
110 # The offset in string of the end of the substring
111 fun rm_eo: Int `{ return self->rm_eo; `}
112
113 # Get a pointer to the element at `index`, can also be used as a subarray
114 fun [](index: Int): NativeMatchArray `{ return self + index; `}
115 end
116
117 redef extern class NativeString
118 private fun substring_from(index: Int): NativeString `{ return self + index; `}
119 end
120
121 redef class Text
122 # Get a `Regex` instance from `self`
123 fun to_re: Regex do return new Regex(self.to_s)
124 end
125
126 # A regular expression pattern
127 #
128 # Used as a `Pattern` on intances of `Text` to call `has`, `search_all`, `replace`, etc.
129 #
130 # Example:
131 #
132 # var re = "ab+a".to_re
133 # assert "aabbbbaaaaba".search_all(re).join(", ") == "abbbba, aba"
134 # assert "aabbbbaaaaba".has(re)
135 # assert "aabbbbaaaaba".replace(re, "+") == "a+aa+"
136 # assert "aabbbbaaaaba".split(re) == ["a", "aa", ""]
137 class Regex
138 super Finalizable
139 super Pattern
140
141 # The `String` source of this regular expression
142 var string: String is writable
143
144 # Treat the pattern as a POSIX extended regular expression (the default)
145 #
146 # If `false`, it is treated as a POSIX basic regular expression (BRE).
147 #
148 # The extended syntax supports `?`, `+` and `|`. Also, `\` causes the following
149 # character to be used as literal.
150 var extended = true is writable
151
152 # Ignore case when matching letters
153 var ignore_case = false is writable
154
155 # Optimize `self` for `is_in` and `String::has`, but do not support searches
156 #
157 # If `true`, `self` cannont be used with `String::search_all`, `String::replace`
158 # or `String::split`.
159 var optimize_is_in = false is writable
160
161 # Treat a newline in string as dividing string into multiple lines
162 #
163 # So that `$` can match before the newline and `^` can match after.
164 # Also, don’t permit `.` to match a newline, and don’t permit `[^…]` to match a newline.
165 #
166 # Otherwise, newline acts like any other ordinary character.
167 var newline = false is writable
168
169 # Do not regard the beginning of the specified string as the beginning of a line
170 #
171 # More generally, don’t make any assumptions about what text might precede it.
172 var not_bol = false is writable
173
174 # Do not regard the end of the specified string as the end of a line
175 #
176 # More generally, don’t make any assumptions about what text might follow it.
177 var not_eol = false is writable
178
179 # Cache of the last used compiled regular expression
180 private var native: nullable NativeRegex = null
181
182 # Cache of a single `regmatch_t` to prevent many calls to `malloc`
183 private var native_match: NativeMatchArray is lazy do
184 native_match_is_init = true
185 return new NativeMatchArray.malloc(native.re_nsub+1)
186 end
187
188 private var native_match_is_init = false
189
190 # `cflags` of the last successful `compile`
191 private var cflags_cache = 0
192
193 # `string` of the last successful `compile`
194 private var string_cache: nullable String = null
195
196 # Compile the regular expression, if needed
197 #
198 # Return `null` on success and an `Error` otherwise.
199 #
200 # This method is always called by `get_match` and `has_match`, but the user
201 # should call it to check for errors.
202 #
203 # assert "ab".to_re.compile == null
204 # assert "[ab".to_re.compile.message == "Unmatched [ or [^"
205 fun compile: nullable Error
206 do
207 var cflags = 0
208 if extended then cflags |= flag_extended
209 if ignore_case then cflags |= flag_icase
210 if optimize_is_in then cflags |= flag_nosub
211 if newline then cflags |= flag_newline
212
213 var native = self.native
214 var need_compilation = native == null or cflags != cflags_cache or string != string_cache
215
216 if need_compilation then
217
218 # Initial allocation
219 if native == null then
220 native = new NativeRegex.malloc
221 self.native = native
222 end
223
224 var res = native.regcomp(string.to_cstring, cflags)
225
226 # All is good
227 if res == 0 then
228 # Update the cache
229 self.native = native
230
231 # We store these to know if we need to recompile or not
232 self.cflags_cache = cflags
233 self.string_cache = string
234
235 return null
236 end
237
238 var error_cstr = native.regerror(res)
239
240 # We leave it to the lib to decide how to allocate the string that we keep
241 var error_str = error_cstr.to_s_with_copy
242 error_cstr.free
243
244 return new Error(error_str)
245 end
246
247 return null
248 end
249
250 redef fun finalize
251 do
252 var native = self.native
253 if native != null then
254 native.regfree
255 native.free
256 self.native = null
257
258 if native_match_is_init then
259 self.native_match.free
260 end
261 end
262 end
263
264 private fun gather_eflags: Int
265 do
266 var eflags = 0
267 if not_bol then eflags |= flag_notbol
268 if not_eol then eflags |= flag_noteol
269 return eflags
270 end
271
272 private fun get_error(errcode: Int): String
273 do
274 # Error, should be out of memory but we cover any possible error anyway
275 var error_cstr = native.regerror(errcode)
276
277 # We leave it to the lib to decide how to allocate the string that we keep
278 var error_str = error_cstr.to_s_with_copy
279 error_cstr.free
280
281 return error_str
282 end
283
284 # assert "ab".to_re.is_in("abcd")
285 # assert "ab".to_re.is_in("cdab")
286 # assert not "ab".to_re.is_in("acdb")
287 # assert "ab".to_re.is_in("ab")
288 redef fun is_in(text)
289 do
290 var comp_res = compile
291 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
292
293 # Actually execute
294 var eflags = gather_eflags
295 var res = native.regexec_match_only(text.to_cstring, eflags)
296
297 # Got a match?
298 if res == 0 then return true
299
300 # Got no match, not an error?
301 if res.is_nomatch then return false
302
303 # Error, should be out of memory but we cover any possible error anyway
304 var error_str = get_error(res)
305 "Regex search failed with: {error_str}\n".output
306 abort
307 end
308
309 # require: not optimize_is_in
310 #
311 # assert "l".to_re.search_index_in("hello world", 0) == 2
312 # assert "el+o".to_re.search_index_in("hello world", 0) == 1
313 # assert "l+".to_re.search_index_in("hello world", 3) == 3
314 # assert "z".to_re.search_index_in("hello world", 0) == -1
315 redef fun search_index_in(text, from)
316 do
317 assert not optimize_is_in
318
319 var comp_res = compile
320 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
321
322 # Actually execute
323 text = text.to_s
324 var cstr = text.substring_from(from).to_cstring
325 var eflags = gather_eflags
326 var match = self.native_match
327
328 var res = native.regexec(cstr, 1, match, eflags)
329
330 # Found one?
331 if res == 0 then return match.rm_so + from
332
333 # No more match?
334 if res.is_nomatch then return -1
335
336 # Error, should be out of memory but we cover any possible error anyway
337 var error_str = get_error(res)
338 "Regex search failed with: {error_str}\n".output
339 abort
340 end
341
342 # require: not optimize_is_in
343 #
344 # assert "l".to_re.search_in("hello world", 0).from == 2
345 # assert "el+o".to_re.search_in("hello world", 0).from == 1
346 # assert "l+".to_re.search_in("hello world", 3).from == 3
347 # assert "z".to_re.search_in("hello world", 0) == null
348 redef fun search_in(text, from)
349 do
350 assert not optimize_is_in
351
352 var comp_res = compile
353 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
354
355 # Actually execute
356 text = text.to_s
357 var cstr = text.substring_from(from).to_cstring
358 var eflags = gather_eflags
359 var native_match = self.native_match
360
361 var nsub = native.re_nsub
362 var res = native.regexec(cstr, nsub+1, native_match, eflags)
363
364 # Found one?
365 if res == 0 then
366 var match = new Match(text,
367 from + native_match.rm_so,
368 native_match.rm_eo - native_match.rm_so)
369
370 # Add sub expressions
371 for i in [1..nsub] do
372 match.subs.add new Match( text,
373 native_match[i].rm_so,
374 native_match[i].rm_eo - native_match[i].rm_so)
375 end
376
377 return match
378 end
379
380 # No more match?
381 if res.is_nomatch then return null
382
383 # Error, should be out of memory but we cover any possible error anyway
384 var error_str = get_error(res)
385 "Regex search failed with: {error_str}\n".output
386 abort
387 end
388
389 # require: not optimize_is_in
390 #
391 # assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab"
392 # assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b"
393 redef fun search_all_in(text)
394 do
395 assert not optimize_is_in
396
397 var comp_res = compile
398 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
399
400 # Actually execute
401 text = text.to_s
402 var cstr = text.to_cstring
403 var eflags = gather_eflags
404 var eflags_or_notbol = eflags | flag_notbol
405 var native_match = self.native_match
406 var matches = new Array[Match]
407
408 var nsub = native.re_nsub
409 var res = native.regexec(cstr, nsub+1, native_match, eflags)
410 var d = 0
411 while res == 0 do
412 var match = new Match(text,
413 d + native_match.rm_so,
414 native_match.rm_eo - native_match.rm_so)
415 matches.add match
416
417 # Add sub expressions
418 for i in [1..nsub] do
419 match.subs.add new Match( text,
420 d + native_match[i].rm_so,
421 native_match[i].rm_eo - native_match[i].rm_so)
422 end
423
424 if d == native_match.rm_eo then
425 d += 1
426 else d = d + native_match.rm_eo
427 cstr = cstr.substring_from(native_match.rm_eo)
428 res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
429 end
430
431 # No more match?
432 if res.is_nomatch then return matches
433
434 # Error, should be out of memory but we cover any possible error anyway
435 var error_str = get_error(res)
436 "Regex search failed with: {error_str}\n".output
437 abort
438 end
439
440 redef fun to_s do return "/{string}/"
441 end
442
443 redef class Match
444 # Parenthesized subexpressions in this match
445 #
446 # ~~~
447 # var re = "c (d e+) f".to_re
448 # var match = "a b c d eee f g".search(re)
449 # assert match.subs.length == 1
450 # assert match.subs.first.to_s == "d eee"
451 # ~~~
452 var subs = new Array[Match] is lazy
453
454 # Get the `n`th expression in this match
455 #
456 # `n == 0` returns this match, and a greater `n` returns the corresponding
457 # subexpression.
458 #
459 # Require: `n >= 0 and n <= subs.length`
460 #
461 # ~~~
462 # var re = "c (d e+) f".to_re
463 # var match = "a b c d eee f g".search(re)
464 # assert match[0].to_s == "c d eee f"
465 # assert match[1].to_s == "d eee"
466 # ~~~
467 fun [](n: Int): Match do
468 if n == 0 then return self
469 assert n > 0 and n <= subs.length
470 return subs[n-1]
471 end
472 end