From: Jean Privat <jean@pryen.org>
Date: Thu, 10 Sep 2015 00:25:10 +0000 (-0400)
Subject: Merge: UTF-8 Regex
X-Git-Tag: v0.7.8~37
X-Git-Url: http://nitlanguage.org?hp=1c55b90444d5e865b2c92805c482f4898a4efcd3

Merge: UTF-8 Regex

This PR closes #1684

Instead of making `byte_to_char_index` public, it has been removed as it had no real reason to live.

Names have been corrected and should now accurately reflect their use.
Some examples of regular expressions with UTF-8 have been included.

Note, however, that the underlying C library does not have UTF-8 semantics. As such, when using repetition operators on UTF-8 strings, capture the problematic characters with parentheses as in the example; otherwise the result will be erroneous.

Additionally, performance should be a bit better since fewer allocations and `copy_to` calls should be needed.

Pull-Request: #1692
Reviewed-by: Jean Privat <jean@pryen.org>
Reviewed-by: Alexis Laferrière <alexis.laf@xymus.net>
---

diff --git a/lib/core/re.nit b/lib/core/re.nit
index c37bf25..f3b6208 100644
--- a/lib/core/re.nit
+++ b/lib/core/re.nit
@@ -22,7 +22,7 @@
 module re
 
 import text
-intrude import text::flat
+import text::flat
 import gc
 import error
 
@@ -356,7 +356,7 @@ class Regex
 	#     assert "l+".to_re.search_in("hello world", 3).from == 3
 	#     assert "z".to_re.search_in("hello world", 0) == null
 	#     assert "cd(e)".to_re.search_in("abcdef", 2)[1].to_s == "e"
-	redef fun search_in(text, from)
+	redef fun search_in(text, charfrom)
 	do
 		assert not optimize_has
 
@@ -367,31 +367,36 @@ class Regex
 		assert native != null
 
 		# Actually execute
-		text = text.to_s
-		var sub = text.substring_from(from)
-		var cstr = sub.to_cstring
-		var bstr = new FlatString.full(cstr, sub.bytelen, 0, sub.bytelen - 1, text.length - from)
+		var cstr = text.to_cstring
+		var rets = cstr.to_s_with_length(text.bytelen)
+		var bytefrom = cstr.char_to_byte_index_cached(charfrom, 0, 0)
+		var subcstr = cstr.fast_cstring(bytefrom)
 		var eflags = gather_eflags
 		var native_match = self.native_match
 
 		var nsub = native.re_nsub
-		var res = native.regexec(cstr, nsub+1, native_match, eflags)
+		var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
 
 		# Found one?
 		if res == 0 then
-			var first_char = bstr.byte_to_char_index(native_match.rm_so)
-			var length_char = bstr.byte_to_char_index(native_match.rm_eo - native_match.rm_so - 1) # FIXME For issue #1684
-			var match = new Match(text,
-				from + first_char,
-				length_char + 1)
+			var bfrom = native_match.rm_so + bytefrom
+			var bto = native_match.rm_eo - 1 + bytefrom
+			var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom)
+			var len = cstr.utf8_length(bfrom, bto)
+			var match = new Match(rets, cpos, len)
+			var subs = match.subs
 
 			# Add sub expressions
 			for i in [1 .. nsub] do
-				first_char = bstr.byte_to_char_index(native_match[i].rm_so)
-				length_char = bstr.byte_to_char_index(native_match[i].rm_eo - native_match[i].rm_so - 1) # FIXME For issue #1684
-				match.subs.add new Match( text,
-					from + first_char,
-					length_char + 1)
+				if native_match[i].rm_so < 0 then
+					subs.add null
+					continue
+				end
+				var sub_bfrom = native_match[i].rm_so + bytefrom
+				var sub_bto = native_match[i].rm_eo - 1 + bytefrom
+				var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom)
+				var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+				subs.add(new Match(rets, sub_cpos, sub_len))
 			end
 
 			return match
@@ -421,34 +426,44 @@ class Regex
 		assert native != null
 
 		# Actually execute
-		text = text.to_s
 		var cstr = text.to_cstring
+		var subcstr = cstr
+		var rets = cstr.to_s_with_length(text.bytelen)
 		var eflags = gather_eflags
 		var eflags_or_notbol = eflags | flag_notbol
 		var native_match = self.native_match
 		var matches = new Array[Match]
 
 		var nsub = native.re_nsub
-		var res = native.regexec(cstr, nsub+1, native_match, eflags)
-		var d = 0
+		var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
+		var bytesub = 0
+		var charsub = 0
 		while res == 0 do
-			var match = new Match(text,
-				d + native_match.rm_so,
-				native_match.rm_eo - native_match.rm_so)
+			var bfrom = native_match.rm_so + bytesub
+			var bto = native_match.rm_eo - 1 + bytesub
+			var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub)
+			var len = cstr.utf8_length(bfrom, bto)
+			var match = new Match(rets, cstart, len)
 			matches.add match
+			var subs = match.subs
 
 			# Add sub expressions
-			for i in [1..nsub] do
-				match.subs.add new Match( text,
-					d + native_match[i].rm_so,
-					native_match[i].rm_eo - native_match[i].rm_so)
+			for i in [1 .. nsub] do
+				if native_match[i].rm_so < 0 then
+					subs.add null
+					continue
+				end
+				var sub_bfrom = native_match[i].rm_so + bytesub
+				var sub_bto = native_match[i].rm_eo - 1 + bytesub
+				var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom)
+				var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+				subs.add(new Match(rets, sub_cstart, sub_len))
 			end
 
-			if d == native_match.rm_eo then
-				d += 1
-			else d = d + native_match.rm_eo
-			cstr = cstr.substring_from(native_match.rm_eo)
-			res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
+			bytesub = bto + 1
+			charsub = cstart + len
+			subcstr = cstr.fast_cstring(bytesub)
+			res = native.regexec(subcstr, nsub + 1, native_match, eflags_or_notbol)
 		end
 
 		# No more match?
@@ -472,7 +487,7 @@ redef class Match
 	# assert match.subs.length == 1
 	# assert match.subs.first.to_s == "d eee"
 	# ~~~
-	var subs = new Array[Match] is lazy
+	var subs = new Array[nullable Match] is lazy
 
 	# Get the `n`th expression in this match
 	#
@@ -487,7 +502,7 @@ redef class Match
 	# assert match[0].to_s == "c d eee f"
 	# assert match[1].to_s == "d eee"
 	# ~~~
-	fun [](n: Int): Match do
+	fun [](n: Int): nullable Match do
 		if n == 0 then return self
 		assert n > 0 and n <= subs.length
 		return subs[n-1]
diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit
index f0e1425..c8b6ecd 100644
--- a/lib/core/text/flat.nit
+++ b/lib/core/text/flat.nit
@@ -177,44 +177,6 @@ redef class FlatText
 		return nns.to_s_with_length(nlen)
 	end
 
-	private fun byte_to_char_index(index: Int): Int do
-		var ln = _bytelen
-		assert index >= 0
-		assert index < ln
-
-		var pos = _bytepos
-		# Find best insertion point
-		var delta_begin = index
-		var delta_end = (ln - 1) - index
-		var delta_cache = (pos - index).abs
-		var min = delta_begin
-		var its = _items
-
-		if delta_cache < min then min = delta_cache
-		if delta_end < min then min = delta_end
-
-		var ns_i: Int
-		var my_i: Int
-
-		if min == delta_begin then
-			ns_i = first_byte
-			my_i = 0
-		else if min == delta_cache then
-			ns_i = pos
-			my_i = _position
-		else
-			ns_i = its.find_beginning_of_char_at(last_byte)
-			my_i = length - 1
-		end
-
-		my_i = its.byte_to_char_index_cached(index, my_i, ns_i)
-
-		_position = my_i
-		_bytepos = index
-
-		return my_i
-	end
-
 	redef fun [](index) do return _items.char_at(char_to_byte_index(index))
 end
 
@@ -235,15 +197,7 @@ class FlatString
 
 	redef var length is lazy do
 		if _bytelen == 0 then return 0
-		var st = _first_byte
-		var its = _items
-		var ln = 0
-		var lst = _last_byte
-		while st <= lst do
-			st += its.length_of_char_at(st)
-			ln += 1
-		end
-		return ln
+		return _items.utf8_length(_first_byte, _last_byte)
 	end
 
 	redef fun reversed
diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit
index 11c8d34..acc9a12 100644
--- a/lib/core/text/native.nit
+++ b/lib/core/text/native.nit
@@ -130,7 +130,7 @@ extern class NativeString `{ char* `}
 		return ns_i
 	end
 
-	# Gets the byte index of char at position `n` in UTF-8 String
+	# Gets the char index of byte at position `n` in a UTF-8 String
 	#
 	# `char_from` and `byte_from` are cached values to seek from.
 	#
@@ -173,4 +173,16 @@ extern class NativeString `{ char* `}
 		if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
 		return endpos
 	end
+
+	# Number of UTF-8 characters in `self` between byte positions `from` and `to` (both inclusive)
+	fun utf8_length(from, to: Int): Int do
+		var st = from
+		var lst = to
+		var ln = 0
+		while st <= lst do
+			st += length_of_char_at(st)
+			ln += 1
+		end
+		return ln
+	end
 end
diff --git a/src/interpreter/naive_interpreter.nit b/src/interpreter/naive_interpreter.nit
index 23a7b0c..b484f15 100644
--- a/src/interpreter/naive_interpreter.nit
+++ b/src/interpreter/naive_interpreter.nit
@@ -1154,8 +1154,8 @@ redef class AMethPropdef
 			else if pname == "atoi" then
 				return v.int_instance(recvval.atoi)
 			else if pname == "fast_cstring" then
-				var ns = recvval.to_s.substring_from(args[1].to_i)
-				return v.native_string_instance(ns)
+				var ns = recvval.fast_cstring(args[1].to_i)
+				return v.native_string_instance(ns.to_s)
 			end
 		else if pname == "calloc_string" then
 			return v.native_string_instance_len(args[1].to_i)
diff --git a/tests/sav/nitserial_args1.res b/tests/sav/nitserial_args1.res
index 2d0898a..fe675df 100644
--- a/tests/sav/nitserial_args1.res
+++ b/tests/sav/nitserial_args1.res
@@ -14,6 +14,7 @@ redef class Deserializer
 		if name == "Array[Serializable]" then return new Array[Serializable].from_deserializer(self)
 		if name == "Array[Object]" then return new Array[Object].from_deserializer(self)
 		if name == "Array[Match]" then return new Array[Match].from_deserializer(self)
+		if name == "Array[nullable Match]" then return new Array[nullable Match].from_deserializer(self)
 		return super
 	end
 end
diff --git a/tests/sav/test_regex_check.res b/tests/sav/test_regex_check.res
index 47b6906..7fc9b83 100644
--- a/tests/sav/test_regex_check.res
+++ b/tests/sav/test_regex_check.res
@@ -2,3 +2,14 @@ true
 false
 [Ã©12,45]
 [Ã©1234,]
+rÃ©s, rÃ©s, rÃ©s
+ã¤ãã»
+Match found : ã
+Submatches: 
+[0] : null 
+Match found : ãã®
+Submatches: 
+[0] : ã® 
+Match found : ã
+Submatches: 
+[0] : null 
diff --git a/tests/test_regex_check.nit b/tests/test_regex_check.nit
index 8c3efda..3bd703a 100644
--- a/tests/test_regex_check.nit
+++ b/tests/test_regex_check.nit
@@ -27,3 +27,19 @@ print str.split(re1)
 
 var re2 = "5".to_re
 print str.split(re2)
+
+str = "rÃ©sonnance rÃ©seau rÃ©sultat"
+
+print str.search_all("rÃ©s".to_re).join(", ")
+
+str = "ãã¤ããã®ãã»"
+print str.split("ã(ã®)?".to_re).join("")
+
+for i in str.search_all("ã(ã®)?".to_re) do
+	print "Match found : {i}"
+	print "Submatches: "
+	var sbs = i.subs
+	for j in sbs.length.times do
+		print "[{j}] : {sbs[j] or else "null"} "
+	end
+end