Merge: Fix Opportunity behavior with unicode characters

author Jean Privat <jean@pryen.org>

Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)

committer Jean Privat <jean@pryen.org>

Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)
author Jean Privat <jean@pryen.org>
Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)
committer Jean Privat <jean@pryen.org>
Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)
diff --git a/lib/core/re.nit b/lib/core/re.nit

index c37bf25..f3b6208 100644 (file)
--- a/lib/core/re.nit
+++ b/lib/core/re.nit
@@ -22,7 +22,7 @@
  module re
  
  import text
-intrude import text::flat
+import text::flat
  import gc
  import error
  
@@ -356,7 +356,7 @@ class Regex
         #     assert "l+".to_re.search_in("hello world", 3).from == 3
         #     assert "z".to_re.search_in("hello world", 0) == null
         #     assert "cd(e)".to_re.search_in("abcdef", 2)[1].to_s == "e"
-       redef fun search_in(text, from)
+       redef fun search_in(text, charfrom)
         do
                 assert not optimize_has
  
@@ -367,31 +367,36 @@ class Regex
                 assert native != null
  
                 # Actually execute
-               text = text.to_s
-               var sub = text.substring_from(from)
-               var cstr = sub.to_cstring
-               var bstr = new FlatString.full(cstr, sub.bytelen, 0, sub.bytelen - 1, text.length - from)
+               var cstr = text.to_cstring
+               var rets = cstr.to_s_with_length(text.bytelen)
+               var bytefrom = cstr.char_to_byte_index_cached(charfrom, 0, 0)
+               var subcstr = cstr.fast_cstring(bytefrom)
                 var eflags = gather_eflags
                 var native_match = self.native_match
  
                 var nsub = native.re_nsub
-               var res = native.regexec(cstr, nsub+1, native_match, eflags)
+               var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
  
                 # Found one?
                 if res == 0 then
-                       var first_char = bstr.byte_to_char_index(native_match.rm_so)
-                       var length_char = bstr.byte_to_char_index(native_match.rm_eo - native_match.rm_so - 1) # FIXME For issue #1684
-                       var match = new Match(text,
-                               from + first_char,
-                               length_char + 1)
+                       var bfrom = native_match.rm_so + bytefrom
+                       var bto = native_match.rm_eo - 1 + bytefrom
+                       var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom)
+                       var len = cstr.utf8_length(bfrom, bto)
+                       var match = new Match(rets, cpos, len)
+                       var subs = match.subs
  
                         # Add sub expressions
                         for i in [1 .. nsub] do
-                               first_char = bstr.byte_to_char_index(native_match[i].rm_so)
-                               length_char = bstr.byte_to_char_index(native_match[i].rm_eo - native_match[i].rm_so - 1) # FIXME For issue #1684
-                               match.subs.add new Match( text,
-                                       from + first_char,
-                                       length_char + 1)
+                               if native_match[i].rm_so < 0 then
+                                       subs.add null
+                                       continue
+                               end
+                               var sub_bfrom = native_match[i].rm_so + bytefrom
+                               var sub_bto = native_match[i].rm_eo - 1 + bytefrom
+                               var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom)
+                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+                               subs.add(new Match(rets, sub_cpos, sub_len))
                         end
  
                         return match
@@ -421,34 +426,44 @@ class Regex
                 assert native != null
  
                 # Actually execute
-               text = text.to_s
                 var cstr = text.to_cstring
+               var subcstr = cstr
+               var rets = cstr.to_s_with_length(text.bytelen)
                 var eflags = gather_eflags
                 var eflags_or_notbol = eflags | flag_notbol
                 var native_match = self.native_match
                 var matches = new Array[Match]
  
                 var nsub = native.re_nsub
-               var res = native.regexec(cstr, nsub+1, native_match, eflags)
-               var d = 0
+               var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
+               var bytesub = 0
+               var charsub = 0
                 while res == 0 do
-                       var match = new Match(text,
-                               d + native_match.rm_so,
-                               native_match.rm_eo - native_match.rm_so)
+                       var bfrom = native_match.rm_so + bytesub
+                       var bto = native_match.rm_eo - 1 + bytesub
+                       var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub)
+                       var len = cstr.utf8_length(bfrom, bto)
+                       var match = new Match(rets, cstart, len)
                         matches.add match
+                       var subs = match.subs
  
                         # Add sub expressions
-                       for i in [1..nsub] do
-                               match.subs.add new Match( text,
-                                       d + native_match[i].rm_so,
-                                       native_match[i].rm_eo - native_match[i].rm_so)
+                       for i in [1 .. nsub] do
+                               if native_match[i].rm_so < 0 then
+                                       subs.add null
+                                       continue
+                               end
+                               var sub_bfrom = native_match[i].rm_so + bytesub
+                               var sub_bto = native_match[i].rm_eo - 1 + bytesub
+                               var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom)
+                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+                               subs.add(new Match(rets, sub_cstart, sub_len))
                         end
  
-                       if d == native_match.rm_eo then
-                               d += 1
-                       else d = d + native_match.rm_eo
-                       cstr = cstr.substring_from(native_match.rm_eo)
-                       res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
+                       bytesub = bto + 1
+                       charsub = cstart + len
+                       subcstr = cstr.fast_cstring(bytesub)
+                       res = native.regexec(subcstr, nsub + 1, native_match, eflags_or_notbol)
                 end
  
                 # No more match?
@@ -472,7 +487,7 @@ redef class Match
         # assert match.subs.length == 1
         # assert match.subs.first.to_s == "d eee"
         # ~~~
-       var subs = new Array[Match] is lazy
+       var subs = new Array[nullable Match] is lazy
  
         # Get the `n`th expression in this match
         #
@@ -487,7 +502,7 @@ redef class Match
         # assert match[0].to_s == "c d eee f"
         # assert match[1].to_s == "d eee"
         # ~~~
-       fun [](n: Int): Match do
+       fun [](n: Int): nullable Match do
                 if n == 0 then return self
                 assert n > 0 and n <= subs.length
                 return subs[n-1]
diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit

index f0e1425..c8b6ecd 100644 (file)
--- a/lib/core/text/flat.nit
+++ b/lib/core/text/flat.nit
@@ -177,44 +177,6 @@ redef class FlatText
                 return nns.to_s_with_length(nlen)
         end
  
-       private fun byte_to_char_index(index: Int): Int do
-               var ln = _bytelen
-               assert index >= 0
-               assert index < ln
-
-               var pos = _bytepos
-               # Find best insertion point
-               var delta_begin = index
-               var delta_end = (ln - 1) - index
-               var delta_cache = (pos - index).abs
-               var min = delta_begin
-               var its = _items
-
-               if delta_cache < min then min = delta_cache
-               if delta_end < min then min = delta_end
-
-               var ns_i: Int
-               var my_i: Int
-
-               if min == delta_begin then
-                       ns_i = first_byte
-                       my_i = 0
-               else if min == delta_cache then
-                       ns_i = pos
-                       my_i = _position
-               else
-                       ns_i = its.find_beginning_of_char_at(last_byte)
-                       my_i = length - 1
-               end
-
-               my_i = its.byte_to_char_index_cached(index, my_i, ns_i)
-
-               _position = my_i
-               _bytepos = index
-
-               return my_i
-       end
-
         redef fun [](index) do return _items.char_at(char_to_byte_index(index))
  end
  
@@ -235,15 +197,7 @@ class FlatString
  
         redef var length is lazy do
                 if _bytelen == 0 then return 0
-               var st = _first_byte
-               var its = _items
-               var ln = 0
-               var lst = _last_byte
-               while st <= lst do
-                       st += its.length_of_char_at(st)
-                       ln += 1
-               end
-               return ln
+               return _items.utf8_length(_first_byte, _last_byte)
         end
  
         redef fun reversed
diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit

index 11c8d34..acc9a12 100644 (file)
--- a/lib/core/text/native.nit
+++ b/lib/core/text/native.nit
@@ -130,7 +130,7 @@ extern class NativeString `{ char* `}
                 return ns_i
         end
  
-       # Gets the byte index of char at position `n` in UTF-8 String
+       # Gets the char index of byte at position `n` in a UTF-8 String
         #
         # `char_from` and `byte_from` are cached values to seek from.
         #
@@ -173,4 +173,16 @@ extern class NativeString `{ char* `}
                 if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
                 return endpos
         end
+
+       # Number of UTF-8 characters in `self` between byte positions `from` and `to` (both inclusive)
+       fun utf8_length(from, to: Int): Int do
+               var st = from
+               var lst = to
+               var ln = 0
+               while st <= lst do
+                       st += length_of_char_at(st)
+                       ln += 1
+               end
+               return ln
+       end
  end
diff --git a/src/interpreter/naive_interpreter.nit b/src/interpreter/naive_interpreter.nit

index 23a7b0c..b484f15 100644 (file)
--- a/src/interpreter/naive_interpreter.nit
+++ b/src/interpreter/naive_interpreter.nit
@@ -1154,8 +1154,8 @@ redef class AMethPropdef
                         else if pname == "atoi" then
                                 return v.int_instance(recvval.atoi)
                         else if pname == "fast_cstring" then
-                               var ns = recvval.to_s.substring_from(args[1].to_i)
-                               return v.native_string_instance(ns)
+                               var ns = recvval.fast_cstring(args[1].to_i)
+                               return v.native_string_instance(ns.to_s)
                         end
                 else if pname == "calloc_string" then
                         return v.native_string_instance_len(args[1].to_i)
diff --git a/tests/sav/nitserial_args1.res b/tests/sav/nitserial_args1.res

index 2d0898a..fe675df 100644 (file)
--- a/tests/sav/nitserial_args1.res
+++ b/tests/sav/nitserial_args1.res
@@ -14,6 +14,7 @@ redef class Deserializer
                 if name == "Array[Serializable]" then return new Array[Serializable].from_deserializer(self)
                 if name == "Array[Object]" then return new Array[Object].from_deserializer(self)
                 if name == "Array[Match]" then return new Array[Match].from_deserializer(self)
+               if name == "Array[nullable Match]" then return new Array[nullable Match].from_deserializer(self)
                 return super
         end
  end
diff --git a/tests/sav/test_regex_check.res b/tests/sav/test_regex_check.res

index 47b6906..7fc9b83 100644 (file)
--- a/tests/sav/test_regex_check.res
+++ b/tests/sav/test_regex_check.res
@@ -2,3 +2,14 @@ true
  false
  [é12,45]
  [é1234,]
+rés, rés, rés
+ついほ
+Match found : あ
+Submatches: 
+[0] : null 
+Match found : あの
+Submatches: 
+[0] : の 
+Match found : あ
+Submatches: 
+[0] : null 
diff --git a/tests/test_regex_check.nit b/tests/test_regex_check.nit

index 8c3efda..3bd703a 100644 (file)
--- a/tests/test_regex_check.nit
+++ b/tests/test_regex_check.nit
@@ -27,3 +27,19 @@ print str.split(re1)
  
  var re2 = "5".to_re
  print str.split(re2)
+
+str = "résonnance réseau résultat"
+
+print str.search_all("rés".to_re).join(", ")
+
+str = "あついあのあほ"
+print str.split("あ(の)?".to_re).join("")
+
+for i in str.search_all("あ(の)?".to_re) do
+       print "Match found : {i}"
+       print "Submatches: "
+       var sbs = i.subs
+       for j in sbs.length.times do
+               print "[{j}] : {sbs[j] or else "null"} "
+       end
+end
author	Jean Privat <jean@pryen.org>
	Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)
committer	Jean Privat <jean@pryen.org>
	Thu, 10 Sep 2015 00:25:20 +0000 (20:25 -0400)
lib/core/re.nit		patch \| blob \| history
lib/core/text/flat.nit		patch \| blob \| history
lib/core/text/native.nit		patch \| blob \| history
src/interpreter/naive_interpreter.nit		patch \| blob \| history
tests/sav/nitserial_args1.res		patch \| blob \| history
tests/sav/test_regex_check.res		patch \| blob \| history
tests/test_regex_check.nit		patch \| blob \| history