# The main entities are `Text::to_re` and `Regex`.
module re
-import string_search
+import text
import gc
import error
# Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches
#
# `cflags` is forwarded unchanged to the C `regcomp` function.
# Returns the integer result code of that call.
fun regcomp(regex: NativeString, cflags: Int): Int `{
- return regcomp(recv, regex, cflags);
+ return regcomp(self, regex, cflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, locating matches
#
# `nmatch` and `pmatch` are used to provide information regarding the location of any matches.
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# Returns the integer result code of the C `regexec` call.
fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
- return regexec(recv, string, nmatch, pmatch, eflags);
+ return regexec(self, string, nmatch, pmatch, eflags);
`}
# Match `string` against the precompiled pattern buffer of `self`, do not locate matches
#
# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
#
# Calls the C `regexec` with `nmatch` set to 0 and a `NULL` match array.
# Returns the integer result code of that call.
fun regexec_match_only(string: NativeString, eflags: Int): Int `{
- return regexec(recv, string, 0, NULL, eflags);
+ return regexec(self, string, 0, NULL, eflags);
`}
# Free the memory allocated to the pattern buffer by the compiling process
#
# Does not free the memory holding `self`, use `free` for this purpose.
#
# Thin wrapper over the C `regfree` function.
- fun regfree `{ regfree(recv); `}
+ fun regfree `{ regfree(self); `}
# Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings
#
# A first call to the C `regerror` with a `NULL` buffer gives the needed size,
# then the message is written to a buffer freshly allocated with `malloc`.
# Nothing here frees that buffer, so releasing it is presumably up to the
# caller (TODO confirm).
#
# NOTE(review): the result of `malloc` is not checked before it is written to.
fun regerror(errcode: Int): NativeString `{
- size_t len = regerror(errcode, recv, NULL, 0);
+ size_t len = regerror(errcode, self, NULL, 0);
char *message = malloc(len);
- regerror(errcode, recv, message, len);
+ regerror(errcode, self, message, len);
return message;
`}
- # This field holds the number of parenthetical subexpressions in the regular expression that was compiled.
- fun re_nsub: Int `{ return recv->re_nsub; `}
+ # Number of parenthetical subexpressions in this compiled regular expression
+ fun re_nsub: Int `{ return self->re_nsub; `}
end
# Flags for `NativeRegex::regcomp`

# Value of the C `REG_ESPACE` constant
private fun error_espace: Int `{ return REG_ESPACE; `}
redef universal Int
# Is `self` equal to the C `REG_NOMATCH` constant?
- private fun is_nomatch: Bool `{ return recv == REG_NOMATCH; `}
+ private fun is_nomatch: Bool `{ return self == REG_NOMATCH; `}
end
# An array of `regmatch_t` or a pointer to one

# Allocate, with the C `malloc`, room for `length` elements of type `regmatch_t`
new malloc(length: Int) `{ return malloc(length * sizeof(regmatch_t)); `}
# The offset in string of the beginning of a substring
#
# Read from the element `self` points to, i.e. the first element of the array.
# Use `[]` to reach the other elements.
- fun rm_so: Int `{ return recv->rm_so; `}
+ fun rm_so: Int `{ return self->rm_so; `}
# The offset in string of the end of the substring
#
# Read from the element `self` points to, i.e. the first element of the array.
# Use `[]` to reach the other elements.
- fun rm_eo: Int `{ return recv->rm_eo; `}
+ fun rm_eo: Int `{ return self->rm_eo; `}
# Get a pointer to the element at `index`, can also be used as a subarray
#
# This is plain pointer arithmetic: `index` is not checked against the
# allocated length, and the result shares memory with `self`.
- fun [](index: Int): NativeMatchArray `{ return recv + index; `}
+ fun [](index: Int): NativeMatchArray `{ return self + index; `}
end
redef extern class NativeString
# Pointer to the data starting `index` positions after the start of `self`
#
# No copy is made and `index` is not checked: the result shares memory with `self`.
- private fun substring_from(index: Int): NativeString `{ return recv + index; `}
+ private fun substring_from(index: Int): NativeString `{ return self + index; `}
end
redef class Text
private var native: nullable NativeRegex = null
# Cache of a `regmatch_t` array to prevent many calls to `malloc`
#
# Allocated lazily, on first access, with `native.re_nsub+1` elements.
# The first access also sets `native_match_is_init`, so that code releasing
# the cache can tell whether there is anything to free.
#
# NOTE(review): the size is fixed at first access; confirm the cache is
# re-created if the pattern is later recompiled with more subexpressions.
- private var native_match = new NativeMatchArray.malloc(0) is lazy
+ private var native_match: NativeMatchArray is lazy do
+ native_match_is_init = true
+ return new NativeMatchArray.malloc(native.re_nsub+1)
+ end
+
+ private var native_match_is_init = false
# `cflags` of the last successful `compile`
private var cflags_cache = 0
fun compile: nullable Error
do
var cflags = 0
- if extended then cflags = cflags.bin_or(flag_extended)
- if ignore_case then cflags = cflags.bin_or(flag_icase)
- if optimize_is_in then cflags = cflags.bin_or(flag_nosub)
- if newline then cflags = cflags.bin_or(flag_newline)
+ if extended then cflags |= flag_extended
+ if ignore_case then cflags |= flag_icase
+ if optimize_is_in then cflags |= flag_nosub
+ if newline then cflags |= flag_newline
var native = self.native
var need_compilation = native == null or cflags != cflags_cache or string != string_cache
native.regfree
native.free
self.native = null
- self.native_match.free
+
+ if native_match_is_init then
+ self.native_match.free
+ end
end
end
# Combine `not_bol` and `not_eol` into the `eflags` value given to `regexec`
#
# Starts from 0, then sets `flag_notbol` if `not_bol` and `flag_noteol` if `not_eol`.
private fun gather_eflags: Int
do
var eflags = 0
- if not_bol then eflags = eflags.bin_or(flag_notbol)
- if not_eol then eflags = eflags.bin_or(flag_noteol)
+ if not_bol then eflags |= flag_notbol
+ if not_eol then eflags |= flag_noteol
return eflags
end
text = text.to_s
var cstr = text.substring_from(from).to_cstring
var eflags = gather_eflags
- var match = self.native_match
- var matches = new Array[Match]
+ var native_match = self.native_match
- var res = native.regexec(cstr, 1, match, eflags)
+ var nsub = native.re_nsub
+ var res = native.regexec(cstr, nsub+1, native_match, eflags)
# Found one?
- if res == 0 then return new Match(text, from + match.rm_so, match.rm_eo - match.rm_so)
+ if res == 0 then
+ # Offsets reported by `regexec` are relative to the C string it was
+ # given, which starts at `from` in `text`: shift them back by `from`.
+ var match = new Match(text,
+ from + native_match.rm_so,
+ native_match.rm_eo - native_match.rm_so)
+
+ # Add sub expressions
+ #
+ # Their offsets are relative to `from` too, exactly like the main match.
+ # NOTE(review): POSIX reports -1 offsets for a group that did not take
+ # part in the match; this is not handled here, confirm it is acceptable.
+ for i in [1..nsub] do
+ match.subs.add new Match( text,
+ from + native_match[i].rm_so,
+ native_match[i].rm_eo - native_match[i].rm_so)
+ end
+
+ return match
+ end
# No more match?
if res.is_nomatch then return null
text = text.to_s
var cstr = text.to_cstring
var eflags = gather_eflags
- var eflags_or_notbol = eflags.bin_or(flag_notbol)
- var match = self.native_match
+ var eflags_or_notbol = eflags | flag_notbol
+ var native_match = self.native_match
var matches = new Array[Match]
- var res = native.regexec(cstr, 1, match, eflags)
+ var nsub = native.re_nsub
+ var res = native.regexec(cstr, nsub+1, native_match, eflags)
var d = 0
while res == 0 do
- matches.add new Match(text, d + match.rm_so, match.rm_eo - match.rm_so)
- if d == match.rm_eo then
+ var match = new Match(text,
+ d + native_match.rm_so,
+ native_match.rm_eo - native_match.rm_so)
+ matches.add match
+
+ # Add sub expressions
+ for i in [1..nsub] do
+ match.subs.add new Match( text,
+ d + native_match[i].rm_so,
+ native_match[i].rm_eo - native_match[i].rm_so)
+ end
+
+ if d == native_match.rm_eo then
d += 1
- else d = d + match.rm_eo
- cstr = cstr.substring_from(match.rm_eo)
- res = native.regexec(cstr, 1, match, eflags_or_notbol)
+ else d = d + native_match.rm_eo
+ cstr = cstr.substring_from(native_match.rm_eo)
+ res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
end
# No more match?
redef fun to_s do return "/{string}/"
end
+
+redef class Match
+ # Sub-matches for the parenthesized subexpressions of this match
+ #
+ # ~~~
+ # var re = "c (d e+) f".to_re
+ # var match = "a b c d eee f g".search(re)
+ # assert match.subs.length == 1
+ # assert match.subs.first.to_s == "d eee"
+ # ~~~
+ var subs = new Array[Match] is lazy
+
+ # Access this match or one of its subexpressions by index
+ #
+ # Index 0 is the whole match (`self`), and an index `n > 0` is the
+ # subexpression stored at `subs[n-1]`.
+ #
+ # Require: `n >= 0 and n <= subs.length`
+ #
+ # ~~~
+ # var re = "c (d e+) f".to_re
+ # var match = "a b c d eee f g".search(re)
+ # assert match[0].to_s == "c d eee f"
+ # assert match[1].to_s == "d eee"
+ # ~~~
+ fun [](n: Int): Match do
+ if n != 0 then
+ # Subexpressions are numbered from 1 but stored from index 0
+ assert n > 0 and n <= subs.length
+ return subs[n-1]
+ end
+ return self
+ end
+end