X-Git-Url: http://nitlanguage.org diff --git a/lib/core/re.nit b/lib/core/re.nit index 7d588ca..3a3d196 100644 --- a/lib/core/re.nit +++ b/lib/core/re.nit @@ -22,6 +22,7 @@ module re import text +import text::flat import gc import error @@ -35,13 +36,13 @@ in "C Header" `{ # It is recommanded to use the higher level API offered by the class `Regex`, # but it can still be used for advanced purpose or in optimized code. # -# To use this class and other `private` entities of this module, use `intrude import standard::re` +# To use this class and other `private` entities of this module, use `intrude import core::re` private extern class NativeRegex `{ regex_t* `} # Allocate a new `NativeRegex`, it must then be compiled using `regcomp` before calling `regexec` new malloc `{ return malloc(sizeof(regex_t)); `} # Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches - fun regcomp(regex: NativeString, cflags: Int): Int `{ + fun regcomp(regex: CString, cflags: Int): Int `{ return regcomp(self, regex, cflags); `} @@ -49,14 +50,14 @@ private extern class NativeRegex `{ regex_t* `} # # `nmatch` and `pmatch` are used to provide information regarding the location of any matches. # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`. - fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{ + fun regexec(string: CString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{ return regexec(self, string, nmatch, pmatch, eflags); `} # Match `string` against the precompiled pattern buffer of `self`, do not locate matches # # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`. - fun regexec_match_only(string: NativeString, eflags: Int): Int `{ + fun regexec_match_only(string: CString, eflags: Int): Int `{ return regexec(self, string, 0, NULL, eflags); `} @@ -66,7 +67,7 @@ private extern class NativeRegex `{ regex_t* `} fun regfree `{ regfree(self); `} # Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings - fun regerror(errcode: Int): NativeString `{ + fun regerror(errcode: Int): CString `{ size_t len = regerror(errcode, self, NULL, 0); char *message = malloc(len); regerror(errcode, self, message, len); @@ -114,8 +115,8 @@ private extern class NativeMatchArray `{ regmatch_t* `} fun [](index: Int): NativeMatchArray `{ return self + index; `} end -redef extern class NativeString - private fun substring_from(index: Int): NativeString `{ return self + index; `} +redef extern class CString + private fun substring_from(index: Int): CString `{ return self + index; `} end redef class Text @@ -152,11 +153,11 @@ class Regex # Ignore case when matching letters var ignore_case = false is writable - # Optimize `self` for `is_in` and `String::has`, but do not support searches + # Optimize `self` for `String::has` and `is_in`, but do not support searches # # If `true`, `self` cannont be used with `String::search_all`, `String::replace` # or `String::split`. - var optimize_is_in = false is writable + var optimize_has = false is writable # Treat a newline in string as dividing string into multiple lines # @@ -182,7 +183,7 @@ class Regex # Cache of a single `regmatch_t` to prevent many calls to `malloc` private var native_match: NativeMatchArray is lazy do native_match_is_init = true - return new NativeMatchArray.malloc(native.re_nsub+1) + return new NativeMatchArray.malloc(native.as(not null).re_nsub+1) end private var native_match_is_init = false @@ -207,7 +208,7 @@ class Regex var cflags = 0 if extended then cflags |= flag_extended if ignore_case then cflags |= flag_icase - if optimize_is_in then cflags |= flag_nosub + if optimize_has then cflags |= flag_nosub if newline then cflags |= flag_newline var native = self.native @@ -271,6 +272,9 @@ class Regex private fun get_error(errcode: Int): String do + var native = native + assert native != null + # Error, should be out of memory but we cover any possible error anyway var error_cstr = native.regerror(errcode) @@ -290,6 +294,9 @@ class Regex var comp_res = compile assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output + var native = native + assert native != null + # Actually execute var eflags = gather_eflags var res = native.regexec_match_only(text.to_cstring, eflags) @@ -306,7 +313,7 @@ class Regex abort end - # require: not optimize_is_in + # require: not optimize_has # # assert "l".to_re.search_index_in("hello world", 0) == 2 # assert "el+o".to_re.search_index_in("hello world", 0) == 1 @@ -314,11 +321,14 @@ class Regex # assert "z".to_re.search_index_in("hello world", 0) == -1 redef fun search_index_in(text, from) do - assert not optimize_is_in + assert not optimize_has var comp_res = compile assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output + var native = native + assert native != null + # Actually execute text = text.to_s var cstr = text.substring_from(from).to_cstring @@ -339,39 +349,54 @@ class Regex abort end - # require: not optimize_is_in + # require: not optimize_has # # assert "l".to_re.search_in("hello world", 0).from == 2 # assert "el+o".to_re.search_in("hello world", 0).from == 1 # assert "l+".to_re.search_in("hello world", 3).from == 3 # assert "z".to_re.search_in("hello world", 0) == null - redef fun search_in(text, from) + # assert "cd(e)".to_re.search_in("abcdef", 2)[1].to_s == "e" + redef fun search_in(text, charfrom) do - assert not optimize_is_in + assert not optimize_has var comp_res = compile assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output + var native = native + assert native != null + # Actually execute - text = text.to_s - var cstr = text.substring_from(from).to_cstring + var cstr = text.to_cstring + var rets = cstr.to_s_with_length(text.byte_length) + var bytefrom = cstr.char_to_byte_index_cached(charfrom, 0, 0) + var subcstr = cstr.fast_cstring(bytefrom) var eflags = gather_eflags var native_match = self.native_match var nsub = native.re_nsub - var res = native.regexec(cstr, nsub+1, native_match, eflags) + var res = native.regexec(subcstr, nsub + 1, native_match, eflags) # Found one? if res == 0 then - var match = new Match(text, - from + native_match.rm_so, - native_match.rm_eo - native_match.rm_so) + var bfrom = native_match.rm_so + bytefrom + var bto = native_match.rm_eo - 1 + bytefrom + var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom) + var len = cstr.utf8_length(bfrom, bto - bfrom + 1) + var match = new Match(rets, cpos, len) + var subs = match.subs # Add sub expressions - for i in [1..nsub] do - match.subs.add new Match( text, - native_match[i].rm_so, - native_match[i].rm_eo - native_match[i].rm_so) + for i in [1 .. nsub] do + if native_match[i].rm_so < 0 then + subs.add null + continue + end + var sub_bfrom = native_match[i].rm_so + bytefrom + var sub_bto = native_match[i].rm_eo - 1 + bytefrom + var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom) + var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1) + subs.add(new Match(rets, sub_cpos, sub_len)) end return match @@ -386,46 +411,59 @@ class Regex abort end - # require: not optimize_is_in + # require: not optimize_has # # assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab" # assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b" redef fun search_all_in(text) do - assert not optimize_is_in + assert not optimize_has var comp_res = compile assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output + var native = native + assert native != null + # Actually execute - text = text.to_s var cstr = text.to_cstring + var subcstr = cstr + var rets = cstr.to_s_with_length(text.byte_length) var eflags = gather_eflags var eflags_or_notbol = eflags | flag_notbol var native_match = self.native_match var matches = new Array[Match] var nsub = native.re_nsub - var res = native.regexec(cstr, nsub+1, native_match, eflags) - var d = 0 + var res = native.regexec(subcstr, nsub + 1, native_match, eflags) + var bytesub = 0 + var charsub = 0 while res == 0 do - var match = new Match(text, - d + native_match.rm_so, - native_match.rm_eo - native_match.rm_so) + var bfrom = native_match.rm_so + bytesub + var bto = native_match.rm_eo - 1 + bytesub + var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub) + var len = cstr.utf8_length(bfrom, bto - bfrom + 1) + var match = new Match(rets, cstart, len) matches.add match + var subs = match.subs # Add sub expressions - for i in [1..nsub] do - match.subs.add new Match( text, - d + native_match[i].rm_so, - native_match[i].rm_eo - native_match[i].rm_so) + for i in [1 .. nsub] do + if native_match[i].rm_so < 0 then + subs.add null + continue + end + var sub_bfrom = native_match[i].rm_so + bytesub + var sub_bto = native_match[i].rm_eo - 1 + bytesub + var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom) + var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1) + subs.add(new Match(rets, sub_cstart, sub_len)) end - if d == native_match.rm_eo then - d += 1 - else d = d + native_match.rm_eo - cstr = cstr.substring_from(native_match.rm_eo) - res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol) + bytesub = bto + 1 + charsub = cstart + len + subcstr = cstr.fast_cstring(bytesub) + res = native.regexec(subcstr, nsub + 1, native_match, eflags_or_notbol) end # No more match? @@ -449,7 +487,7 @@ redef class Match # assert match.subs.length == 1 # assert match.subs.first.to_s == "d eee" # ~~~ - var subs = new Array[Match] is lazy + var subs = new Array[nullable Match] is lazy # Get the `n`th expression in this match # @@ -464,7 +502,7 @@ redef class Match # assert match[0].to_s == "c d eee f" # assert match[1].to_s == "d eee" # ~~~ - fun [](n: Int): Match do + fun [](n: Int): nullable Match do if n == 0 then return self assert n > 0 and n <= subs.length return subs[n-1]