lib/core: Improve speed of `utf8_length` in NativeString and change signature
author: Lucas Bajolet <r4pass@hotmail.com>
Tue, 8 Dec 2015 18:24:17 +0000 (13:24 -0500)
committer: Lucas Bajolet <r4pass@hotmail.com>
Tue, 29 Dec 2015 04:49:28 +0000 (23:49 -0500)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/core/re.nit
lib/core/text/flat.nit
lib/core/text/native.nit

index f3b6208..917ec4f 100644 (file)
@@ -382,7 +382,7 @@ class Regex
                        var bfrom = native_match.rm_so + bytefrom
                        var bto = native_match.rm_eo - 1 + bytefrom
                        var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom)
-                       var len = cstr.utf8_length(bfrom, bto)
+                       var len = cstr.utf8_length(bfrom, bto - bfrom + 1)
                        var match = new Match(rets, cpos, len)
                        var subs = match.subs
 
@@ -395,7 +395,7 @@ class Regex
                                var sub_bfrom = native_match[i].rm_so + bytefrom
                                var sub_bto = native_match[i].rm_eo - 1 + bytefrom
                                var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom)
-                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1)
                                subs.add(new Match(rets, sub_cpos, sub_len))
                        end
 
@@ -442,7 +442,7 @@ class Regex
                        var bfrom = native_match.rm_so + bytesub
                        var bto = native_match.rm_eo - 1 + bytesub
                        var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub)
-                       var len = cstr.utf8_length(bfrom, bto)
+                       var len = cstr.utf8_length(bfrom, bto - bfrom + 1)
                        var match = new Match(rets, cstart, len)
                        matches.add match
                        var subs = match.subs
@@ -456,7 +456,7 @@ class Regex
                                var sub_bfrom = native_match[i].rm_so + bytesub
                                var sub_bto = native_match[i].rm_eo - 1 + bytesub
                                var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom)
-                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto)
+                               var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1)
                                subs.add(new Match(rets, sub_cstart, sub_len))
                        end
 
index a72af19..9a70db8 100644 (file)
@@ -458,7 +458,7 @@ class FlatString
                self._bytelen = bytelen
                _first_byte = from
                _bytepos = from
-               _length = _items.utf8_length(_first_byte, last_byte)
+               _length = _items.utf8_length(_first_byte, bytelen)
        end
 
        # Low-level creation of a new string with all the data.
index a8bbc4b..3eb41f2 100644 (file)
@@ -248,14 +248,23 @@ extern class NativeString `{ char* `}
                return endpos
        end
 
-       # Number of UTF-8 characters in `self` between positions `from` and `to`
-       fun utf8_length(from, to: Int): Int do
+       # Number of UTF-8 characters in `self` starting at `from`, for a length of `bytelen`
+       fun utf8_length(from, bytelen: Int): Int do
                var st = from
-               var lst = to
                var ln = 0
-               while st <= lst do
-                       st += length_of_char_at(st)
+               while bytelen > 0 do
+                       while bytelen >= 4 do
+                               var i = fetch_4_chars(st)
+                               if i & 0x80808080 != 0 then break
+                               bytelen -= 4
+                               st += 4
+                               ln += 4
+                       end
+                       if bytelen == 0 then break
+                       var cln = length_of_char_at(st)
+                       st += cln
                        ln += 1
+                       bytelen -= cln
                end
                return ln
        end