1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Regular expression support for all services based on `Pattern`
19 # Implemented using libc regular expressions.
21 # The main entities are `Text::to_re` and `Regex`.
29 #include <sys/types.h>
33 # Main extern class to wrap libc regular expression support
35 # It is recommended to use the higher level API offered by the class `Regex`,
36 # but it can still be used for advanced purposes or in optimized code.
38 # To use this class and other `private` entities of this module, use `intrude import standard::re`
39 private extern class NativeRegex `{ regex_t* `}
# Allocate the memory of a new, not yet compiled, `NativeRegex`
#
# The result must be compiled using `regcomp` before calling `regexec`.
new malloc `{
	regex_t *preg = malloc(sizeof(regex_t));
	return preg;
`}
# Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches
#
# `cflags` may be the bitwise-or of any of `flag_extended`, `flag_icase`,
# `flag_nosub` and `flag_newline`.
#
# Returns the result of the C function `regcomp`: 0 on success, otherwise
# an error code that can be given to `regerror`.
fun regcomp(regex: NativeString, cflags: Int): Int `{
	return regcomp(recv, regex, cflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, locating matches
#
# `nmatch` and `pmatch` are used to provide information regarding the location of any matches.
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# Returns the result of the C function `regexec`: 0 when a match is found,
# otherwise an error code (see `Int::is_nomatch` and `regerror`).
fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
	return regexec(recv, string, nmatch, pmatch, eflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, do not locate matches
#
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# Returns the result of the C function `regexec`, called without any
# match array: 0 when `string` matches, otherwise an error code.
fun regexec_match_only(string: NativeString, eflags: Int): Int `{
	return regexec(recv, string, 0, NULL, eflags);
`}
# Release the memory that the compiling process attached to the pattern buffer
#
# The memory holding `self` itself is not released, use `free` for this purpose.
fun regfree `{
	regfree(recv);
`}
# Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings
#
# The returned string is allocated with `malloc`, it is up to the caller
# to `free` it once it is no longer needed.
fun regerror(errcode: Int): NativeString `{
	/* Called with an empty buffer, `regerror` only reports the size needed */
	size_t len = regerror(errcode, recv, NULL, 0);
	char *message = malloc(len);

	/* Second call, to actually write the message */
	regerror(errcode, recv, message, len);

	return message;
`}
# Number of parenthetical subexpressions in the regular expression that was compiled
fun re_nsub: Int `{
	return recv->re_nsub;
`}
# Flags for `NativeRegex::regcomp`

# Value of the C constant `REG_EXTENDED`
private fun flag_extended: Int `{
	return REG_EXTENDED;
`}

# Value of the C constant `REG_ICASE`
private fun flag_icase: Int `{
	return REG_ICASE;
`}

# Value of the C constant `REG_NOSUB`
private fun flag_nosub: Int `{
	return REG_NOSUB;
`}

# Value of the C constant `REG_NEWLINE`
private fun flag_newline: Int `{
	return REG_NEWLINE;
`}
# Flags for `NativeRegex::regexec`

# Value of the C constant `REG_NOTBOL`
private fun flag_notbol: Int `{
	return REG_NOTBOL;
`}

# Value of the C constant `REG_NOTEOL`
private fun flag_noteol: Int `{
	return REG_NOTEOL;
`}
# Errors of `NativeRegex::regexec`

# Value of the C constant `REG_NOMATCH`
private fun error_nomatch: Int `{
	return REG_NOMATCH;
`}

# Value of the C constant `REG_ESPACE`
private fun error_espace: Int `{
	return REG_ESPACE;
`}
# Is `self` equal to the C constant `REG_NOMATCH`?
private fun is_nomatch: Bool `{
	return recv == REG_NOMATCH;
`}
102 # An array of `regmatch_t` or a pointer to one
103 private extern class NativeMatchArray `{ regmatch_t* `}
# Allocate enough memory to hold `length` contiguous `regmatch_t`
new malloc(length: Int) `{
	regmatch_t *matches = malloc(length * sizeof(regmatch_t));
	return matches;
`}
# Offset, in the searched string, of the beginning of the substring
fun rm_so: Int `{
	return recv->rm_so;
`}
# Offset, in the searched string, of the end of the substring
fun rm_eo: Int `{
	return recv->rm_eo;
`}
# Get a pointer to the element at `index`
#
# As it points inside of `self`, the result can also be used as a subarray.
fun [](index: Int): NativeMatchArray `{
	return &recv[index];
`}
117 redef extern class NativeString
# Get a pointer to the character at `index`, it shares the memory of `self`
private fun substring_from(index: Int): NativeString `{
	return &recv[index];
`}
# Build a `Regex` using the content of `self` as pattern
fun to_re: Regex
do
	var source = to_s
	return new Regex(source)
end
126 # A regular expression pattern
128 # Used as a `Pattern` on instances of `Text` to call `has`, `search_all`, `replace`, etc.
132 # var re = "ab+a".to_re
133 # assert "aabbbbaaaaba".search_all(re).join(", ") == "abbbba, aba"
134 # assert "aabbbbaaaaba".has(re)
135 # assert "aabbbbaaaaba".replace(re, "+") == "a+aa+"
136 # assert "aabbbbaaaaba".split(re) == ["a", "aa", ""]
# The `String` source of this regular expression
var string: String is writable

# Should the pattern be read as a POSIX extended regular expression? (the default)
#
# If `false`, it is treated as a POSIX basic regular expression (BRE).
#
# The extended syntax supports `?`, `+` and `|`. Also, `\` causes the following
# character to be used as literal.
var extended: Bool = true is writable

# Should the case of letters be ignored when matching?
var ignore_case: Bool = false is writable

# Optimize `self` for `is_in` and `String::has`, but do not support searches
#
# If `true`, `self` cannot be used with `String::search_all`, `String::replace`
# or `String::split`.
var optimize_is_in: Bool = false is writable

# Should a newline in the string divide it into multiple lines?
#
# If `true`, `$` can match before the newline and `^` can match after.
# Also, don’t permit `.` to match a newline, and don’t permit `[^…]` to match a newline.
#
# Otherwise, newline acts like any other ordinary character.
var newline: Bool = false is writable

# Do not regard the beginning of the specified string as the beginning of a line
#
# More generally, don’t make any assumptions about what text might precede it.
var not_bol: Bool = false is writable

# Do not regard the end of the specified string as the end of a line
#
# More generally, don’t make any assumptions about what text might follow it.
var not_eol: Bool = false is writable
# Compiled regular expression used last, kept as a cache
private var native: nullable NativeRegex = null

# A single cached `regmatch_t`, reused to prevent many calls to `malloc`
private var native_match: NativeMatchArray = new NativeMatchArray.malloc(1) is lazy

# Value of `cflags` at the last successful `compile`
private var cflags_cache: Int = 0

# Value of `string` at the last successful `compile`
private var string_cache: nullable String = null
# Compile the regular expression, if needed
#
# Return `null` on success and an `Error` otherwise.
#
# This method is always called by the matching services of `self`
# (`is_in`, `search_index_in`, `search_in` and `search_all_in`), but the user
# should call it to check for errors.
#
#     assert "ab".to_re.compile == null
#     assert "[ab".to_re.compile.message == "Unmatched [ or [^"
fun compile: nullable Error
do
	# Gather the compilation flags from the options of `self`
	var cflags = 0
	if extended then cflags = cflags.bin_or(flag_extended)
	if ignore_case then cflags = cflags.bin_or(flag_icase)
	if optimize_is_in then cflags = cflags.bin_or(flag_nosub)
	if newline then cflags = cflags.bin_or(flag_newline)

	var native = self.native
	var need_compilation = native == null or cflags != cflags_cache or string != string_cache
	if not need_compilation then return null

	if native == null then
		# Initial allocation
		native = new NativeRegex.malloc
		self.native = native
	else if string_cache != null then
		# `native` still holds the pattern of the last successful compilation,
		# release it before the buffer is reused or it would leak.
		native.regfree
	end

	# Until the compilation succeeds, `native` does not hold a usable pattern
	self.string_cache = null

	var res = native.regcomp(string.to_cstring, cflags)

	# All is good
	if res == 0 then
		# We store these to know if we need to recompile or not
		self.cflags_cache = cflags
		self.string_cache = string

		return null
	end

	var error_cstr = native.regerror(res)

	# Copy the message to a Nit string, then release the buffer allocated by `regerror`
	var error_str = error_cstr.to_s_with_copy
	error_cstr.free

	return new Error(error_str)
end
247 var native = self.native
248 if native != null then
252 self.native_match.free
# Combine the options `not_bol` and `not_eol` into the `eflags` argument of `NativeRegex::regexec`
private fun gather_eflags: Int
do
	var eflags = 0
	if not_bol then eflags = eflags.bin_or(flag_notbol)
	if not_eol then eflags = eflags.bin_or(flag_noteol)
	return eflags
end
# Get the message associated to the error code `errcode`
#
# require: the native regular expression of `self` has been allocated
private fun get_error(errcode: Int): String
do
	var native = self.native
	assert native != null

	# Error, should be out of memory but we cover any possible error anyway
	var error_cstr = native.regerror(errcode)

	# Copy the message to a Nit string, then release the buffer allocated by `regerror`
	var error_str = error_cstr.to_s_with_copy
	error_cstr.free

	return error_str
end
# Is there a match of `self` anywhere in `text`?
#
# Aborts if the compilation or the search fails.
#
#     assert "ab".to_re.is_in("abcd")
#     assert "ab".to_re.is_in("cdab")
#     assert not "ab".to_re.is_in("acdb")
#     assert "ab".to_re.is_in("ab")
redef fun is_in(text)
do
	var comp_res = compile
	assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output

	var native = self.native
	assert native != null

	# Actually execute
	var eflags = gather_eflags
	var res = native.regexec_match_only(text.to_cstring, eflags)

	# Got a match?
	if res == 0 then return true

	# Got no match, not an error?
	if res.is_nomatch then return false

	# Error, should be out of memory but we cover any possible error anyway
	var error_str = get_error(res)
	"Regex search failed with: {error_str}\n".output
	abort
end
# Index in `text` of the first match of `self` found from the position `from`, or -1 if there is none
#
# Aborts if the compilation or the search fails.
#
# require: not optimize_is_in
#
#     assert "l".to_re.search_index_in("hello world", 0) == 2
#     assert "el+o".to_re.search_index_in("hello world", 0) == 1
#     assert "l+".to_re.search_index_in("hello world", 3) == 3
#     assert "z".to_re.search_index_in("hello world", 0) == -1
redef fun search_index_in(text, from)
do
	assert not optimize_is_in

	var comp_res = compile
	assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output

	var native = self.native
	assert native != null

	# Actually execute
	var cstr = text.substring_from(from).to_cstring
	var eflags = gather_eflags
	var match = self.native_match

	var res = native.regexec(cstr, 1, match, eflags)

	# Found one? Its offset is relative to `from`
	if res == 0 then return match.rm_so + from

	# No more match?
	if res.is_nomatch then return -1

	# Error, should be out of memory but we cover any possible error anyway
	var error_str = get_error(res)
	"Regex search failed with: {error_str}\n".output
	abort
end
# First match of `self` in `text` found from the position `from`, or `null` if there is none
#
# Aborts if the compilation or the search fails.
#
# require: not optimize_is_in
#
#     assert "l".to_re.search_in("hello world", 0).from == 2
#     assert "el+o".to_re.search_in("hello world", 0).from == 1
#     assert "l+".to_re.search_in("hello world", 3).from == 3
#     assert "z".to_re.search_in("hello world", 0) == null
redef fun search_in(text, from)
do
	assert not optimize_is_in

	var comp_res = compile
	assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output

	var native = self.native
	assert native != null

	# Actually execute
	var cstr = text.substring_from(from).to_cstring
	var eflags = gather_eflags
	var match = self.native_match

	var res = native.regexec(cstr, 1, match, eflags)

	# Found one? Its offsets are relative to `from`
	if res == 0 then return new Match(text, from + match.rm_so, match.rm_eo - match.rm_so)

	# No more match?
	if res.is_nomatch then return null

	# Error, should be out of memory but we cover any possible error anyway
	var error_str = get_error(res)
	"Regex search failed with: {error_str}\n".output
	abort
end
# All the non-overlapping matches of `self` in `text`, from left to right
#
# Aborts if the compilation or the search fails.
#
# require: not optimize_is_in
#
#     assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab"
#     assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b"
redef fun search_all_in(text)
do
	assert not optimize_is_in

	var comp_res = compile
	assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output

	var native = self.native
	assert native != null

	# Actually execute
	var cstr = text.to_cstring
	var eflags = gather_eflags
	var eflags_or_notbol = eflags.bin_or(flag_notbol)
	var match = self.native_match
	var matches = new Array[Match]

	# Offset of `cstr` from the beginning of `text`
	var d = 0

	var res = native.regexec(cstr, 1, match, eflags)
	while res == 0 do
		var so = match.rm_so
		var eo = match.rm_eo
		matches.add new Match(text, d + so, eo - so)

		# Resume the search right after this match
		var step = eo
		if so == eo then
			# An empty match consumes nothing: skip one character to guarantee
			# some progress, unless the end of the string is already reached.
			if cstr[eo] == '\0' then return matches
			step = eo + 1
		end

		# `d` and `cstr` must always advance by the same distance
		d += step
		cstr = cstr.substring_from(step)
		res = native.regexec(cstr, 1, match, eflags_or_notbol)
	end

	# No more match?
	if res.is_nomatch then return matches

	# Error, should be out of memory but we cover any possible error anyway
	var error_str = get_error(res)
	"Regex search failed with: {error_str}\n".output
	abort
end
# The source of the pattern, enclosed between two `/`
redef fun to_s
do
	return "/{string}/"
end