Merge: Faster hex parsing
author Jean Privat <jean@pryen.org>
Fri, 18 Dec 2015 20:27:41 +0000 (15:27 -0500)
committer Jean Privat <jean@pryen.org>
Fri, 18 Dec 2015 20:27:41 +0000 (15:27 -0500)
As noted in #1895, we need faster parsing of UTF-16 escape sequences; this PR provides it.

It brings the runtime of the `large_escaped` benchmark down from ~5s to ~3.5s and, measured with valgrind, the instruction count down from 26 GIr to 20 GIr.

Note: this PR is based on #1886; only the last 4 commits are of interest here.

Pull-Request: #1896
Reviewed-by: Jean Privat <jean@pryen.org>

1  2 
lib/core/text/abstract_text.nit

@@@ -248,7 -248,17 +248,17 @@@ abstract class Tex
        # If `self` contains only digits and alpha <= 'f', return the corresponding integer.
        #
        #     assert "ff".to_hex == 255
-       fun to_hex: Int do return a_to(16)
+       #
+       # `pos` is the index of the first character to parse (0 if null) and
+       # `ln` is the number of characters to parse (`length - pos` if null).
+       # Each character in that range is converted with `Char::from_hex`.
+       fun to_hex(pos, ln: nullable Int): Int do
+               var res = 0
+               if pos == null then pos = 0
+               if ln == null then ln = length - pos
+               # Index just past the last character to parse
+               var max = pos + ln
+               for i in [pos .. max[ do
+                       # Make room for the next digit: one hexadecimal digit is 4 bits
+                       res <<= 4
+                       res += self[i].from_hex
+               end
+               return res
+       end
  
        # If `self` contains only digits <= '7', return the corresponding integer.
        #
        #     assert "\\ud800\\udfd3".from_utf16_escape == '𐏓'
        #     assert "\\u00e8".from_utf16_escape == 'è'
        #     assert "\\u3042".from_utf16_escape == 'あ'
-       fun from_utf16_escape: Char do
-               var ln = length
-               if ln != 6 and ln != 12 then return 0xFFFD.code_point
-               var cphi = substring(2, 4).to_hex
-               if cphi < 0xD800 then return cphi.code_point
-               if cphi > 0xDFFF then return cphi.code_point
-               if cphi > 0xDBFF then return 0xFFFD.code_point
-               var cp = 0
-               cp += (cphi - 0xD800) << 10
-               var cplo = substring(8, 4).to_hex
+       fun from_utf16_escape(pos, ln: nullable Int): Char do
+               if pos == null then pos = 0
+               if ln == null then ln = length - pos
+               if ln < 6 then return 0xFFFD.code_point
+               var cp = from_utf16_digit(pos + 2)
+               if cp < 0xD800 then return cp.code_point
+               if cp > 0xDFFF then return cp.code_point
+               if cp > 0xDBFF then return 0xFFFD.code_point
+               if ln == 6 then return 0xFFFD.code_point
+               if ln < 12 then return 0xFFFD.code_point
+               cp <<= 16
+               cp += from_utf16_digit(pos + 8)
+               var cplo = cp & 0xFFFF
                if cplo < 0xDC00 then return 0xFFFD.code_point
                if cplo > 0xDFFF then return 0xFFFD.code_point
-               cp += cplo - 0xDC00
-               cp += 0x10000
-               return cp.code_point
+               return cp.from_utf16_surr.code_point
+       end
+       # Returns the integer value of the 4 hexadecimal digits starting at `pos`
+       #
+       # `pos` defaults to 0 when null. The parsing is delegated to `to_hex`.
+       #
+       #     var s = "\\ud800\\udfd3"
+       #     assert s.from_utf16_digit(2) == 0xD800
+       #     assert s.from_utf16_digit(8) == 0xDFD3
+       fun from_utf16_digit(pos: nullable Int): Int do
+               if pos == null then pos = 0
+               return to_hex(pos, 4)
        end
  
        # Encode `self` to percent (or URL) encoding
                return s.plain_to_s
        end
  
 +      # Return the Levenshtein distance between `self` and `other`
 +      #
 +      # Each deletion, insertion or character change costs 1.
 +      # Only two rows of the distance table are kept at any time.
 +      #
 +      # ~~~
 +      # assert "abcd".levenshtein_distance("abcd") == 0
 +      # assert "".levenshtein_distance("abcd")     == 4
 +      # assert "abcd".levenshtein_distance("")     == 4
 +      # assert "abcd".levenshtein_distance("xyz")  == 4
 +      # assert "abcd".levenshtein_distance("xbdy") == 3
 +      # ~~~
 +      fun levenshtein_distance(other: String): Int
 +      do
 +              var slen = self.length
 +              var olen = other.length
 +
 +              # Fast cases: an empty operand or two equal strings
 +              if slen == 0 then return olen
 +              if olen == 0 then return slen
 +              if self == other then return 0
 +
 +              # Previous row of distances
 +              var v0 = new Array[Int].with_capacity(olen+1)
 +
 +              # Current row of distances
 +              var v1 = new Array[Int].with_capacity(olen+1)
 +
 +              for j in [0..olen] do
 +                      # Prefix insert cost: initial row, one insertion per character of `other`
 +                      v0[j] = j
 +              end
 +
 +              for i in [0..slen[ do
 +
 +                      # Prefix delete cost: the first `i + 1` characters of `self` are deleted
 +                      v1[0] = i + 1
 +
 +                      for j in [0..olen[ do
 +                              # Delete cost
 +                              var cost1 = v1[j] + 1
 +                              # Insert cost
 +                              var cost2 = v0[j + 1] + 1
 +                              # Same char cost (+0)
 +                              var cost3 = v0[j]
 +                              # Change cost
 +                              if self[i] != other[j] then cost3 += 1
 +                              # Keep the min
 +                              v1[j+1] = cost1.min(cost2).min(cost3)
 +                      end
 +
 +                      # Swap the rows:
 +                      # * v1 becomes v0 in the next iteration
 +                      # * the old v0 is reused as the new v1
 +                      var tmp = v1
 +                      v1 = v0
 +                      v0 = tmp
 +              end
 +
 +              # After the final swap, v0 holds the last computed row
 +              return v0[olen]
 +      end
 +
        # Copies `n` bytes from `self` at `src_offset` into `dest` starting at `dest_offset`
        #
        # Basically a high-level synonym of NativeString::copy_to
@@@ -1659,6 -1621,12 +1680,12 @@@ redef class Cha
        #     assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8]
        fun bytes: SequenceRead[Byte] do return to_s.bytes
  
+       # Is `self` a UTF-16 surrogate?
+       #
+       # True when the code point of `self` is in the range 0xD800 to 0xDFFF (inclusive).
+       fun is_surrogate: Bool do
+               var cp = code_point
+               return cp >= 0xD800 and cp <= 0xDFFF
+       end
        # Length of `self` in a UTF-8 String
        private fun u8char_len: Int do
                var c = self.code_point
        do
                return self.is_numeric or self.is_alpha
        end
+       # Returns the integer value (0 to 15) of `self` read as a hexadecimal digit
+       #
+       # Both uppercase (`A` to `F`) and lowercase (`a` to `f`) letters are accepted.
+       #
+       # REQUIRE: `is_hexdigit`
+       fun from_hex: Int do
+               # '0' is code point 0x30, so '0'..'9' map to 0..9
+               if self >= '0' and self <= '9' then return code_point - 0x30
+               # 'A' is code point 0x41, so 'A'..'F' map to 10..15
+               if self >= 'A' and self <= 'F' then return code_point - 0x37
+               # 'a' is code point 0x61, so 'a'..'f' map to 10..15
+               if self >= 'a' and self <= 'f' then return code_point - 0x57
+               # Reached only if `self` is not a hexadecimal digit
+               assert self.is_hexdigit
+               # To make flow analysis happy
+               abort
+       end
  end
  
  redef class Collection[E]