From: Jean Privat Date: Thu, 10 Sep 2015 00:25:10 +0000 (-0400) Subject: Merge: UTF-8 Regex X-Git-Tag: v0.7.8~37 X-Git-Url: http://nitlanguage.org?hp=1c55b90444d5e865b2c92805c482f4898a4efcd3 Merge: UTF-8 Regex This PR closes #1684. Instead of making `byte_to_char_index` public, it has been removed, as it had no real reason to exist. Names have been corrected and should now reflect their use. Some examples of regular expressions with UTF-8 have been included. Note, however, that the underlying C library does not have UTF-8 semantics; as such, when using repetition operators on UTF-8 strings, capture the problematic characters with parentheses as in the example, or else the result will be erroneous. Additionally, performance should be a bit better since fewer allocations and `copy_to` calls are needed. Pull-Request: #1692 Reviewed-by: Jean Privat Reviewed-by: Alexis Laferrière --- diff --git a/lib/core/re.nit b/lib/core/re.nit index c37bf25..f3b6208 100644 --- a/lib/core/re.nit +++ b/lib/core/re.nit @@ -22,7 +22,7 @@ module re import text -intrude import text::flat +import text::flat import gc import error @@ -356,7 +356,7 @@ class Regex # assert "l+".to_re.search_in("hello world", 3).from == 3 # assert "z".to_re.search_in("hello world", 0) == null # assert "cd(e)".to_re.search_in("abcdef", 2)[1].to_s == "e" - redef fun search_in(text, from) + redef fun search_in(text, charfrom) do assert not optimize_has @@ -367,31 +367,36 @@ class Regex assert native != null # Actually execute - text = text.to_s - var sub = text.substring_from(from) - var cstr = sub.to_cstring - var bstr = new FlatString.full(cstr, sub.bytelen, 0, sub.bytelen - 1, text.length - from) + var cstr = text.to_cstring + var rets = cstr.to_s_with_length(text.bytelen) + var bytefrom = cstr.char_to_byte_index_cached(charfrom, 0, 0) + var subcstr = cstr.fast_cstring(bytefrom) var eflags = gather_eflags var native_match = self.native_match var nsub = native.re_nsub - var res = native.regexec(cstr, nsub+1, 
native_match, eflags) + var res = native.regexec(subcstr, nsub + 1, native_match, eflags) # Found one? if res == 0 then - var first_char = bstr.byte_to_char_index(native_match.rm_so) - var length_char = bstr.byte_to_char_index(native_match.rm_eo - native_match.rm_so - 1) # FIXME For issue #1684 - var match = new Match(text, - from + first_char, - length_char + 1) + var bfrom = native_match.rm_so + bytefrom + var bto = native_match.rm_eo - 1 + bytefrom + var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom) + var len = cstr.utf8_length(bfrom, bto) + var match = new Match(rets, cpos, len) + var subs = match.subs # Add sub expressions for i in [1 .. nsub] do - first_char = bstr.byte_to_char_index(native_match[i].rm_so) - length_char = bstr.byte_to_char_index(native_match[i].rm_eo - native_match[i].rm_so - 1) # FIXME For issue #1684 - match.subs.add new Match( text, - from + first_char, - length_char + 1) + if native_match[i].rm_so < 0 then + subs.add null + continue + end + var sub_bfrom = native_match[i].rm_so + bytefrom + var sub_bto = native_match[i].rm_eo - 1 + bytefrom + var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom) + var sub_len = cstr.utf8_length(sub_bfrom, sub_bto) + subs.add(new Match(rets, sub_cpos, sub_len)) end return match @@ -421,34 +426,44 @@ class Regex assert native != null # Actually execute - text = text.to_s var cstr = text.to_cstring + var subcstr = cstr + var rets = cstr.to_s_with_length(text.bytelen) var eflags = gather_eflags var eflags_or_notbol = eflags | flag_notbol var native_match = self.native_match var matches = new Array[Match] var nsub = native.re_nsub - var res = native.regexec(cstr, nsub+1, native_match, eflags) - var d = 0 + var res = native.regexec(subcstr, nsub + 1, native_match, eflags) + var bytesub = 0 + var charsub = 0 while res == 0 do - var match = new Match(text, - d + native_match.rm_so, - native_match.rm_eo - native_match.rm_so) + var bfrom = native_match.rm_so + bytesub + var bto = 
native_match.rm_eo - 1 + bytesub + var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub) + var len = cstr.utf8_length(bfrom, bto) + var match = new Match(rets, cstart, len) matches.add match + var subs = match.subs # Add sub expressions - for i in [1..nsub] do - match.subs.add new Match( text, - d + native_match[i].rm_so, - native_match[i].rm_eo - native_match[i].rm_so) + for i in [1 .. nsub] do + if native_match[i].rm_so < 0 then + subs.add null + continue + end + var sub_bfrom = native_match[i].rm_so + bytesub + var sub_bto = native_match[i].rm_eo - 1 + bytesub + var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom) + var sub_len = cstr.utf8_length(sub_bfrom, sub_bto) + subs.add(new Match(rets, sub_cstart, sub_len)) end - if d == native_match.rm_eo then - d += 1 - else d = d + native_match.rm_eo - cstr = cstr.substring_from(native_match.rm_eo) - res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol) + bytesub = bto + 1 + charsub = cstart + len + subcstr = cstr.fast_cstring(bytesub) + res = native.regexec(subcstr, nsub + 1, native_match, eflags_or_notbol) end # No more match? 
@@ -472,7 +487,7 @@ redef class Match # assert match.subs.length == 1 # assert match.subs.first.to_s == "d eee" # ~~~ - var subs = new Array[Match] is lazy + var subs = new Array[nullable Match] is lazy # Get the `n`th expression in this match # @@ -487,7 +502,7 @@ redef class Match # assert match[0].to_s == "c d eee f" # assert match[1].to_s == "d eee" # ~~~ - fun [](n: Int): Match do + fun [](n: Int): nullable Match do if n == 0 then return self assert n > 0 and n <= subs.length return subs[n-1] diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index f0e1425..c8b6ecd 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -177,44 +177,6 @@ redef class FlatText return nns.to_s_with_length(nlen) end - private fun byte_to_char_index(index: Int): Int do - var ln = _bytelen - assert index >= 0 - assert index < ln - - var pos = _bytepos - # Find best insertion point - var delta_begin = index - var delta_end = (ln - 1) - index - var delta_cache = (pos - index).abs - var min = delta_begin - var its = _items - - if delta_cache < min then min = delta_cache - if delta_end < min then min = delta_end - - var ns_i: Int - var my_i: Int - - if min == delta_begin then - ns_i = first_byte - my_i = 0 - else if min == delta_cache then - ns_i = pos - my_i = _position - else - ns_i = its.find_beginning_of_char_at(last_byte) - my_i = length - 1 - end - - my_i = its.byte_to_char_index_cached(index, my_i, ns_i) - - _position = my_i - _bytepos = index - - return my_i - end - redef fun [](index) do return _items.char_at(char_to_byte_index(index)) end @@ -235,15 +197,7 @@ class FlatString redef var length is lazy do if _bytelen == 0 then return 0 - var st = _first_byte - var its = _items - var ln = 0 - var lst = _last_byte - while st <= lst do - st += its.length_of_char_at(st) - ln += 1 - end - return ln + return _items.utf8_length(_first_byte, _last_byte) end redef fun reversed diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit index 11c8d34..acc9a12 
100644 --- a/lib/core/text/native.nit +++ b/lib/core/text/native.nit @@ -130,7 +130,7 @@ extern class NativeString `{ char* `} return ns_i end - # Gets the byte index of char at position `n` in UTF-8 String + # Gets the char index of byte at position `n` in a UTF-8 String # # `char_from` and `byte_from` are cached values to seek from. # @@ -173,4 +173,16 @@ extern class NativeString `{ char* `} if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos return endpos end + + # Number of UTF-8 characters in `self` between positions `from` and `to` + fun utf8_length(from, to: Int): Int do + var st = from + var lst = to + var ln = 0 + while st <= lst do + st += length_of_char_at(st) + ln += 1 + end + return ln + end end diff --git a/src/interpreter/naive_interpreter.nit b/src/interpreter/naive_interpreter.nit index 23a7b0c..b484f15 100644 --- a/src/interpreter/naive_interpreter.nit +++ b/src/interpreter/naive_interpreter.nit @@ -1154,8 +1154,8 @@ redef class AMethPropdef else if pname == "atoi" then return v.int_instance(recvval.atoi) else if pname == "fast_cstring" then - var ns = recvval.to_s.substring_from(args[1].to_i) - return v.native_string_instance(ns) + var ns = recvval.fast_cstring(args[1].to_i) + return v.native_string_instance(ns.to_s) end else if pname == "calloc_string" then return v.native_string_instance_len(args[1].to_i) diff --git a/tests/sav/nitserial_args1.res b/tests/sav/nitserial_args1.res index 2d0898a..fe675df 100644 --- a/tests/sav/nitserial_args1.res +++ b/tests/sav/nitserial_args1.res @@ -14,6 +14,7 @@ redef class Deserializer if name == "Array[Serializable]" then return new Array[Serializable].from_deserializer(self) if name == "Array[Object]" then return new Array[Object].from_deserializer(self) if name == "Array[Match]" then return new Array[Match].from_deserializer(self) + if name == "Array[nullable Match]" then return new Array[nullable Match].from_deserializer(self) return super end end diff --git 
a/tests/sav/test_regex_check.res b/tests/sav/test_regex_check.res index 47b6906..7fc9b83 100644 --- a/tests/sav/test_regex_check.res +++ b/tests/sav/test_regex_check.res @@ -2,3 +2,14 @@ true false [é12,45] [é1234,] +rés, rés, rés +ついほ +Match found : あ +Submatches: +[0] : null +Match found : あの +Submatches: +[0] : の +Match found : あ +Submatches: +[0] : null diff --git a/tests/test_regex_check.nit b/tests/test_regex_check.nit index 8c3efda..3bd703a 100644 --- a/tests/test_regex_check.nit +++ b/tests/test_regex_check.nit @@ -27,3 +27,19 @@ print str.split(re1) var re2 = "5".to_re print str.split(re2) + +str = "résonnance réseau résultat" + +print str.search_all("rés".to_re).join(", ") + +str = "あついあのあほ" +print str.split("あ(の)?".to_re).join("") + +for i in str.search_all("あ(の)?".to_re) do + print "Match found : {i}" + print "Submatches: " + var sbs = i.subs + for j in sbs.length.times do + print "[{j}] : {sbs[j] or else "null"} " + end +end