1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Regular expression support for all services based on `Pattern`
19 # Implemented using libc regular expressions.
21 # The main entities are `Text::to_re` and `Regex`.
25 intrude import text
::flat
30 #include <sys/types.h>
34 # Main extern class to wrap libc regular expression support
36 # It is recommended to use the higher level API offered by the class `Regex`,
37 # but it can still be used for advanced purpose or in optimized code.
39 # To use this class and other `private` entities of this module, use `intrude import core::re`
40 private extern class NativeRegex `{ regex_t* `}
# Allocate the memory holding an uncompiled `regex_t`
#
# The new instance carries no pattern yet: it must go through `regcomp`
# before any call to `regexec`.
new malloc `{
	return malloc(sizeof(regex_t));
`}
# Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches
#
# `cflags` is the bitwise-or of the compilation flags (`flag_extended`,
# `flag_icase`, `flag_nosub` and `flag_newline`).
#
# Return the status code of the libc `regcomp`, it can be turned into a
# message with `regerror`.
fun regcomp(regex: NativeString, cflags: Int): Int `{
	return regcomp(self, regex, cflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, locating matches
#
# `nmatch` and `pmatch` are used to provide information regarding the location of any matches.
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# Return the status code of the libc `regexec`: 0 on a match, otherwise
# a code to check with `is_nomatch` or to describe with `regerror`.
fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
	return regexec(self, string, nmatch, pmatch, eflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, do not locate matches
#
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# No match array is given to libc (`nmatch` is 0 and `pmatch` is `NULL`),
# so only the returned status code tells whether `string` matched.
fun regexec_match_only(string: NativeString, eflags: Int): Int `{
	return regexec(self, string, 0, NULL, eflags);
`}
# Release the memory that the compiling process attached to the pattern buffer
#
# The structure holding `self` is left allocated, use `free` to release it.
fun regfree `{
	regfree(self);
`}
# Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings
#
# The message is written in a buffer allocated with `malloc`,
# the caller is responsible for freeing the returned `NativeString`.
fun regerror(errcode: Int): NativeString `{
	/* A first call without a buffer only reports the size needed by the message */
	size_t len = regerror(errcode, self, NULL, 0);
	char *message = malloc(len);

	/* The second call fills the buffer */
	regerror(errcode, self, message, len);

	return message;
`}
# Count of the parenthetical subexpressions in this compiled regular expression
fun re_nsub: Int `{
	return self->re_nsub;
`}
82 # Flags for `NativeRegex::regcomp`
# `REG_EXTENDED`, compilation flag set by `Regex::compile` when `Regex::extended` is true
private fun flag_extended: Int `{
	return REG_EXTENDED;
`}
# `REG_ICASE`, compilation flag set by `Regex::compile` when `Regex::ignore_case` is true
private fun flag_icase: Int `{
	return REG_ICASE;
`}
# `REG_NOSUB`, compilation flag set by `Regex::compile` when `Regex::optimize_is_in` is true
private fun flag_nosub: Int `{
	return REG_NOSUB;
`}
# `REG_NEWLINE`, compilation flag set by `Regex::compile` when `Regex::newline` is true
private fun flag_newline: Int `{
	return REG_NEWLINE;
`}
89 # Flags for `NativeRegex::regexec`
# `REG_NOTBOL`, execution flag set when `Regex::not_bol` is true
private fun flag_notbol: Int `{
	return REG_NOTBOL;
`}
# `REG_NOTEOL`, execution flag set when `Regex::not_eol` is true
private fun flag_noteol: Int `{
	return REG_NOTEOL;
`}
94 # Errors of `NativeRegex::regexec`
# `REG_NOMATCH`, the status code also tested by `Int::is_nomatch`
private fun error_nomatch: Int `{
	return REG_NOMATCH;
`}
# `REG_ESPACE` status code
private fun error_espace: Int `{
	return REG_ESPACE;
`}
# Is `self` equal to the `REG_NOMATCH` status code?
private fun is_nomatch: Bool `{
	return self == REG_NOMATCH;
`}
103 # An array of `regmatch_t` or a pointer to one
104 private extern class NativeMatchArray `{ regmatch_t* `}
# Allocate room for `length` contiguous `regmatch_t`
new malloc(length: Int) `{
	return malloc(length * sizeof(regmatch_t));
`}
# Offset in the searched string where the substring begins
fun rm_so: Int `{
	return self->rm_so;
`}
# Offset in the searched string where the substring ends
fun rm_eo: Int `{
	return self->rm_eo;
`}
# Pointer to the element at `index`
#
# As the result is itself a `NativeMatchArray`, it can also serve as the
# subarray starting at `index`.
fun [](index: Int): NativeMatchArray `{
	return &self[index];
`}
118 redef extern class NativeString
# Pointer to the byte at `index`, the tail of `self` as a C string
#
# Nothing is copied, the result shares the memory of `self`.
private fun substring_from(index: Int): NativeString `{
	return &self[index];
`}
# Build a `Regex` whose source is the content of `self`
fun to_re: Regex
do
	var source = self.to_s
	return new Regex(source)
end
127 # A regular expression pattern
129 # Used as a `Pattern` on instances of `Text` to call `has`, `search_all`, `replace`, etc.
133 # var re = "ab+a".to_re
134 # assert "aabbbbaaaaba".search_all(re).join(", ") == "abbbba, aba"
135 # assert "aabbbbaaaaba".has(re)
136 # assert "aabbbbaaaaba".replace(re, "+") == "a+aa+"
137 # assert "aabbbbaaaaba".split(re) == ["a", "aa", ""]
142 # The `String` source of this regular expression
#
# `compile` compares it with `string_cache` to decide whether the pattern
# has to be compiled again.
143 var string: String is writable
# Treat the pattern as a POSIX extended regular expression (the default)
#
# When `false`, the pattern is read as a POSIX basic regular expression (BRE).
#
# The extended syntax supports `?`, `+` and `|`. Also, `\` causes the
# following character to be used as literal.
var extended: Bool = true is writable
# Should the case of letters be ignored when matching?
var ignore_case: Bool = false is writable
# Optimize `self` for `is_in` and `String::has`, but do not support searches
#
# If `true`, `self` cannot be used with `String::search_all`, `String::replace`
# or `String::split`.
var optimize_is_in: Bool = false is writable
# Treat a newline in string as dividing string into multiple lines
#
# So that `$` can match before the newline and `^` can match after.
# Also, don't permit `.` to match a newline, and don't permit `[^…]` to match a newline.
#
# Otherwise, newline acts like any other ordinary character.
var newline: Bool = false is writable
# Do not regard the beginning of the specified string as the beginning of a line
#
# More generally, make no assumption about the text that might precede it.
var not_bol: Bool = false is writable
# Do not regard the end of the specified string as the end of a line
#
# More generally, make no assumption about the text that might follow it.
var not_eol: Bool = false is writable
180 # Cache of the last used compiled regular expression
#
# Starts as `null`; `compile` allocates a `NativeRegex` when it finds none.
181 private var native: nullable NativeRegex = null
# Cache of the `regmatch_t` array given to `regexec`, to prevent many calls to `malloc`
#
# Allocated at the first access, with one entry for the whole match plus
# one per subexpression of the pattern compiled at that time.
# `native_match_is_init` records that the allocation happened.
#
# NOTE(review): the size is fixed at the first access. `string` is writable
# and `compile` recompiles when it changes, so a later pattern with more
# subexpressions would use an array that is too small. Confirm that the
# cache is dropped on recompilation.
private var native_match: NativeMatchArray is lazy do
	native_match_is_init = true
	return new NativeMatchArray.malloc(native.re_nsub+1)
end
# Has `native_match` been allocated? Set by the lazy initializer of `native_match`
private var native_match_is_init: Bool = false
# `cflags` of the last successful `compile`
private var cflags_cache: Int = 0
# `string` of the last successful `compile`
#
# Stays `null` as long as no compilation was recorded.
private var string_cache: nullable String = null
197 # Compile the regular expression, if needed
199 # Return `null` on success and an `Error` otherwise.
201 # This method is always called by `get_match` and `has_match`, but the user
202 # should call it to check for errors.
204 # assert "ab".to_re.compile == null
205 # assert "[ab".to_re.compile.message == "Unmatched [ or [^"
# NOTE(review): several lines of this method are not visible in this excerpt
# (the declaration of `cflags`, the test on `res`, the `end` keywords and
# the success `return`). The comments below only describe the visible lines.
206 fun compile: nullable Error
# Translate the option attributes into the libc compilation flags
209 if extended then cflags |= flag_extended
210 if ignore_case then cflags |= flag_icase
211 if optimize_is_in then cflags |= flag_nosub
212 if newline then cflags |= flag_newline
# Work on a local copy of the cached native pattern
214 var native = self.native
# Compile when nothing is cached yet, or when the flags or the source
# differ from the ones recorded by the last successful compilation
215 var need_compilation = native == null or cflags != cflags_cache or string != string_cache
217 if need_compilation then
# First use: allocate the structure that holds the compiled pattern
# NOTE(review): when `native` is already set, no `regfree` is visible
# before the new `regcomp` -- confirm the previous pattern buffer is released.
220 if native == null then
221 native = new NativeRegex.malloc
# `res` is the status code returned by the libc `regcomp`
225 var res = native.regcomp(string.to_cstring, cflags)
232 # We store these to know if we need to recompile or not
233 self.cflags_cache = cflags
234 self.string_cache = string
# Ask libc for the message associated to the status code
239 var error_cstr = native.regerror(res)
241 # We leave it to the lib to decide how to allocate the string that we keep
# NOTE(review): `regerror` returns a buffer from `malloc`, its release
# is not visible here -- confirm `error_cstr` is freed after the copy.
242 var error_str = error_cstr.to_s_with_copy
# The failure is reported to the caller as an `Error` carrying the message
245 return new Error(error_str)
253 var native = self.native
254 if native != null then
259 if native_match_is_init then
260 self.native_match.free
# Build the `eflags` given to the `regexec` services from `not_bol` and `not_eol`
#
# NOTE(review): the declaration of `eflags` and the returned expression
# are not visible in this excerpt.
265 private fun gather_eflags: Int
# Each enabled option adds its libc bit to `eflags`
268 if not_bol then eflags |= flag_notbol
269 if not_eol then eflags |= flag_noteol
# Get the message that `NativeRegex::regerror` associates to `errcode`
#
# NOTE(review): the `return` is not visible in this excerpt, presumably
# `error_str` is returned -- confirm.
273 private fun get_error(errcode: Int): String
275 # Error, should be out of memory but we cover any possible error anyway
276 var error_cstr = native.regerror(errcode)
278 # We leave it to the lib to decide how to allocate the string that we keep
# NOTE(review): the buffer behind `error_cstr` comes from `malloc`, its
# release is not visible here -- confirm it is freed after the copy.
279 var error_str = error_cstr.to_s_with_copy
285 # assert "ab".to_re.is_in("abcd")
286 # assert "ab".to_re.is_in("cdab")
287 # assert not "ab".to_re.is_in("acdb")
288 # assert "ab".to_re.is_in("ab")
# Does `text` contain a match of this pattern?
#
# Only the status code of the match is looked at, no match location is requested.
289 redef fun is_in(text)
# Make sure the native pattern is up to date; on a compilation error
# the message is printed by the `else` part of the failing assertion
291 var comp_res = compile
292 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
295 var eflags = gather_eflags
296 var res = native.regexec_match_only(text.to_cstring, eflags)
# A status code of 0 means that a match was found
299 if res == 0 then return true
301 # Got no match, not an error?
302 if res.is_nomatch then return false
304 # Error, should be out of memory but we cover any possible error anyway
305 var error_str = get_error(res)
# NOTE(review): what follows this output is not visible in this excerpt
306 "Regex search failed with: {error_str}\n".output
310 # require: not optimize_is_in
312 # assert "l".to_re.search_index_in("hello world", 0) == 2
313 # assert "el+o".to_re.search_index_in("hello world", 0) == 1
314 # assert "l+".to_re.search_index_in("hello world", 3) == 3
315 # assert "z".to_re.search_index_in("hello world", 0) == -1
# Index of the first match in `text` found from the position `from`, or -1 if there is none
316 redef fun search_index_in(text, from)
# Match locations are needed, a pattern compiled for `is_in` only cannot give them
318 assert not optimize_is_in
320 var comp_res = compile
321 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
# The search runs on the C string of the tail of `text` starting at `from`
325 var cstr = text.substring_from(from).to_cstring
326 var eflags = gather_eflags
327 var match = self.native_match
# A single `regmatch_t` is requested: only the whole match is located
329 var res = native.regexec(cstr, 1, match, eflags)
# The offset is relative to the tail, `from` is added to get back to `text`
# NOTE(review): `rm_so` is an offset in the C string while `from` indexes
# `text` -- presumably they differ on multibyte text, confirm.
332 if res == 0 then return match.rm_so + from
335 if res.is_nomatch then return -1
337 # Error, should be out of memory but we cover any possible error anyway
338 var error_str = get_error(res)
# NOTE(review): what follows this output is not visible in this excerpt
339 "Regex search failed with: {error_str}\n".output
343 # require: not optimize_is_in
345 # assert "l".to_re.search_in("hello world", 0).from == 2
346 # assert "el+o".to_re.search_in("hello world", 0).from == 1
347 # assert "l+".to_re.search_in("hello world", 3).from == 3
348 # assert "z".to_re.search_in("hello world", 0) == null
# First `Match` in `text` found from the position `from`, or `null` if there is none
#
# The subexpressions of the pattern are added to `Match::subs` of the result.
#
# NOTE(review): several lines of this method are not visible in this excerpt
# (the test on `res`, the remaining arguments of both `new Match` and the
# `return` of the match). The comments below only describe the visible lines.
349 redef fun search_in(text, from)
# Match locations are needed, a pattern compiled for `is_in` only cannot give them
351 assert not optimize_is_in
353 var comp_res = compile
354 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
# The search runs on the C string of the tail of `text` starting at `from`
358 var sub = text.substring_from(from)
359 var cstr = sub.to_cstring
# `bstr` wraps `cstr` to translate the byte offsets of libc into char indexes
360 var bstr = new FlatString.full(cstr, sub.bytelen, 0, sub.bytelen - 1, text.length - from)
361 var eflags = gather_eflags
362 var native_match = self.native_match
# One entry for the whole match plus one per subexpression
364 var nsub = native.re_nsub
365 var res = native.regexec(cstr, nsub+1, native_match, eflags)
# Char index of the beginning of the match, relative to the tail
369 var bso = bstr.byte_to_char_index(native_match.rm_so)
# NOTE(review): the byte length minus one is converted as if it was an
# offset from the beginning of `bstr`, not from the beginning of the match.
# Presumably wrong when the text before the match and the match itself do
# not have the same bytes-per-char; an empty match gives the index -1. Confirm.
370 var ln = bstr.byte_to_char_index(native_match.rm_eo - native_match.rm_so - 1)
371 var match = new Match(text,
375 # Add sub expressions
376 for i in [1 .. nsub] do
377 bso = bstr.byte_to_char_index(native_match[i].rm_so)
378 ln = bstr.byte_to_char_index(native_match[i].rm_eo - native_match[i].rm_so - 1)
379 match.subs.add new Match( text,
388 if res.is_nomatch then return null
390 # Error, should be out of memory but we cover any possible error anyway
391 var error_str = get_error(res)
# NOTE(review): what follows this output is not visible in this excerpt
392 "Regex search failed with: {error_str}\n".output
396 # require: not optimize_is_in
398 # assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab"
399 # assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b"
# All the matches of this pattern in `text`, each with its subexpressions in `Match::subs`
#
# NOTE(review): several lines of this method are not visible in this excerpt
# (the declaration of `d`, the loop header, the addition to `matches` and
# the body of the `if d == native_match.rm_eo` branch). The comments below
# only describe the visible lines.
400 redef fun search_all_in(text)
# Match locations are needed, a pattern compiled for `is_in` only cannot give them
402 assert not optimize_is_in
404 var comp_res = compile
405 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
409 var cstr = text.to_cstring
410 var eflags = gather_eflags
# Flags for the searches that follow the first one, they use `flag_notbol`
# on top of the flags requested by the user
411 var eflags_or_notbol = eflags | flag_notbol
412 var native_match = self.native_match
413 var matches = new Array[Match]
# One entry for the whole match plus one per subexpression
415 var nsub = native.re_nsub
416 var res = native.regexec(cstr, nsub+1, native_match, eflags)
# Offsets given by libc are relative to `cstr`, `d` is added to them
# NOTE(review): unlike `search_in`, no `byte_to_char_index` conversion is
# visible here -- confirm the positions are correct on multibyte text.
419 var match = new Match(text,
420 d + native_match.rm_so,
421 native_match.rm_eo - native_match.rm_so)
424 # Add sub expressions
425 for i in [1..nsub] do
426 match.subs.add new Match( text,
427 d + native_match[i].rm_so,
428 native_match[i].rm_eo - native_match[i].rm_so)
431 if d == native_match.rm_eo then
433 else d = d + native_match.rm_eo
# Search again in what follows the end of the match
434 cstr = cstr.substring_from(native_match.rm_eo)
435 res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
# No more match: the search ended without error
439 if res.is_nomatch then return matches
441 # Error, should be out of memory but we cover any possible error anyway
442 var error_str = get_error(res)
# NOTE(review): what follows this output is not visible in this excerpt
443 "Regex search failed with: {error_str}\n".output
# The source `string` of this pattern written between two slashes
redef fun to_s
do
	return "/{string}/"
end
# Matches of the parenthesized subexpressions of the pattern
#
# ~~~
# var re = "c (d e+) f".to_re
# var match = "a b c d eee f g".search(re)
# assert match.subs.length == 1
# assert match.subs.first.to_s == "d eee"
# ~~~
var subs: Array[Match] = new Array[Match] is lazy
461 # Get the `n`th expression in this match
463 # `n == 0` returns this match, and a greater `n` returns the corresponding subexpression.
466 # Require: `n >= 0 and n <= subs.length`
469 # var re = "c (d e+) f".to_re
470 # var match = "a b c d eee f g".search(re)
471 # assert match[0].to_s == "c d eee f"
472 # assert match[1].to_s == "d eee"
474 fun [](n: Int): Match do
475 if n == 0 then return self
476 assert n > 0 and n <= subs.length