From: Jean Privat <jean@pryen.org>
Date: Fri, 18 Dec 2015 20:27:41 +0000 (-0500)
Subject: Merge: Faster hex parsing
X-Git-Tag: v0.8~22
X-Git-Url: http://nitlanguage.org?hp=-c

Merge: Faster hex parsing

As said in #1895, we need faster parsing of UTF-16 escape sequences; this PR is the answer.

It brings the runtime of the `large_escaped` benchmark down from ~5s to ~3.5s and, under valgrind, from 26GIr to 20GIr.

Note: based on #1886; only the last 4 commits are of interest here.

Pull-Request: #1896
Reviewed-by: Jean Privat <jean@pryen.org>
---

7e303172862324a8aa68db5162fc32d605d4f69e
diff --combined lib/core/text/abstract_text.nit
index d4873ad,0b07edb..fc8ec5b
--- a/lib/core/text/abstract_text.nit
+++ b/lib/core/text/abstract_text.nit
@@@ -248,7 -248,17 +248,17 @@@ abstract class Tex
  	# If `self` contains only digits and alpha <= 'f', return the corresponding integer.
  	#
  	#     assert "ff".to_hex == 255
- 	fun to_hex: Int do return a_to(16)
+ 	fun to_hex(pos, ln: nullable Int): Int do # parses `ln` hexadecimal digits of `self`, starting at index `pos`
+ 		var res = 0
+ 		if pos == null then pos = 0 # default: start at the first character
+ 		if ln == null then ln = length - pos # default: read up to the end of `self`
+ 		var max = pos + ln # exclusive end index of the parsed range
+ 		for i in [pos .. max[ do
+ 			res <<= 4 # make room for the next 4-bit digit
+ 			res += self[i].from_hex # `from_hex` requires a hexadecimal digit
+ 		end
+ 		return res
+ 	end
  
  	# If `self` contains only digits <= '7', return the corresponding integer.
  	#
@@@ -733,21 -743,32 +743,32 @@@
  	#     assert "\\ud800\\udfd3".from_utf16_escape == '𐏓'
  	#     assert "\\u00e8".from_utf16_escape == 'è'
  	#     assert "\\u3042".from_utf16_escape == 'あ'
- 	fun from_utf16_escape: Char do
- 		var ln = length
- 		if ln != 6 and ln != 12 then return 0xFFFD.code_point
- 		var cphi = substring(2, 4).to_hex
- 		if cphi < 0xD800 then return cphi.code_point
- 		if cphi > 0xDFFF then return cphi.code_point
- 		if cphi > 0xDBFF then return 0xFFFD.code_point
- 		var cp = 0
- 		cp += (cphi - 0xD800) << 10
- 		var cplo = substring(8, 4).to_hex
+ 	fun from_utf16_escape(pos, ln: nullable Int): Char do # decodes one or two `\uXXXX` escapes found at `pos`; U+FFFD on malformed input
+ 		if pos == null then pos = 0 # default: start at the first character
+ 		if ln == null then ln = length - pos # default: use everything up to the end of `self`
+ 		if ln < 6 then return 0xFFFD.code_point # fewer than the 6 characters of one `\uXXXX` escape
+ 		var cp = from_utf16_digit(pos + 2) # skip the 2-character `\u` prefix
+ 		if cp < 0xD800 then return cp.code_point # below the surrogate range: used directly as the code point
+ 		if cp > 0xDFFF then return cp.code_point # above the surrogate range: used directly as the code point
+ 		if cp > 0xDBFF then return 0xFFFD.code_point # 0xDC00..0xDFFF in first position is rejected
+ 		if ln == 6 then return 0xFFFD.code_point # no room for a second escape (also covered by the next check)
+ 		if ln < 12 then return 0xFFFD.code_point # a second full `\uXXXX` escape is required from here on
+ 		cp <<= 16 # move the first 16-bit value to the high half
+ 		cp += from_utf16_digit(pos + 8) # second escape's digits start 6 characters after the first's
+ 		var cplo = cp & 0xFFFF # the second 16-bit value alone
  		if cplo < 0xDC00 then return 0xFFFD.code_point
  		if cplo > 0xDFFF then return 0xFFFD.code_point
- 		cp += cplo - 0xDC00
- 		cp += 0x10000
- 		return cp.code_point
+ 		return cp.from_utf16_surr.code_point # `from_utf16_surr` is defined outside this diff; presumably combines the packed pair - confirm
+ 	end
+ 
+ 	# Returns the value of the 4 hexadecimal digits of a UTF-16 escape starting at `pos`
+ 	#
+ 	#     var s = "\\ud800\\udfd3"
+ 	#     assert s.from_utf16_digit(2) == 0xD800
+ 	#     assert s.from_utf16_digit(8) == 0xDFD3
+ 	fun from_utf16_digit(pos: nullable Int): Int do # reads the 4 hexadecimal digits starting at `pos`
+ 		if pos == null then pos = 0 # default: start at the first character
+ 		return to_hex(pos, 4) # always exactly 4 digits, i.e. one 16-bit value
  	end
  
  	# Encode `self` to percent (or URL) encoding
@@@ -974,65 -995,6 +995,65 @@@
  		return s.plain_to_s
  	end
  
 +	# Return the Levenshtein distance between two strings
 +	#
 +	# ~~~
 +	# assert "abcd".levenshtein_distance("abcd") == 0
 +	# assert "".levenshtein_distance("abcd")     == 4
 +	# assert "abcd".levenshtein_distance("")     == 4
 +	# assert "abcd".levenshtein_distance("xyz")  == 4
 +	# assert "abcd".levenshtein_distance("xbdy") == 3
 +	# ~~~
 +	fun levenshtein_distance(other: String): Int
 +	do
 +		var slen = self.length
 +		var olen = other.length
 +
 +		# Fast cases: an empty operand or two equal strings need no table
 +		if slen == 0 then return olen
 +		if olen == 0 then return slen
 +		if self == other then return 0
 +
 +		# Previous row of distances (olen+1 entries once filled)
 +		var v0 = new Array[Int].with_capacity(olen+1)
 +
 +		# Current row of distances, filled left to right in the inner loop
 +		var v1 = new Array[Int].with_capacity(olen+1)
 +
 +		for j in [0..olen] do
 +			# Prefix insert cost: j insertions to reach the first j chars of `other`
 +			v0[j] = j
 +		end
 +
 +		for i in [0..slen[ do
 +
 +			# Prefix delete cost: i + 1 deletions to reach the empty prefix of `other`
 +			v1[0] = i + 1
 +
 +			for j in [0..olen[ do
 +				# Delete cost
 +				var cost1 = v1[j] + 1
 +				# Insert cost
 +				var cost2 = v0[j + 1] + 1
 +				# Same char cost (+0)
 +				var cost3 = v0[j]
 +				# Change cost (+1 when the chars differ)
 +				if self[i] != other[j] then cost3 += 1
 +				# Keep the min of the three
 +				v1[j+1] = cost1.min(cost2).min(cost3)
 +			end
 +
 +			# Swap rows:
 +			# * v1 becomes v0 in the next iteration
 +			# * old v0 is reused as the new v1
 +			var tmp = v1
 +			v1 = v0
 +			v0 = tmp
 +		end
 +
 +		return v0[olen] # after the final swap, v0 is the last computed row
 +	end
 +
  	# Copies `n` bytes from `self` at `src_offset` into `dest` starting at `dest_offset`
  	#
  	# Basically a high-level synonym of NativeString::copy_to
@@@ -1659,6 -1621,12 +1680,12 @@@ redef class Cha
  	#     assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8]
  	fun bytes: SequenceRead[Byte] do return to_s.bytes
  
+ 	# Is `self` a UTF-16 surrogate code point (U+D800 to U+DFFF)?
+ 	fun is_surrogate: Bool do # true when the code point of `self` is in 0xD800..0xDFFF
+ 		var cp = code_point
+ 		return cp >= 0xD800 and cp <= 0xDFFF # both bounds inclusive
+ 	end
+ 
  	# Length of `self` in a UTF-8 String
  	private fun u8char_len: Int do
  		var c = self.code_point
@@@ -1791,6 -1759,19 +1818,19 @@@
  	do
  		return self.is_numeric or self.is_alpha
  	end
+ 
+ 	# Returns the integer value of `self` read as a hexadecimal digit
+ 	#
+ 	# REQUIRE: `is_hexdigit`
+ 	fun from_hex: Int do # maps one hexadecimal digit character to its value (0..15)
+ 		if self >= '0' and self <= '9' then return code_point - 0x30 # 0x30 is the code point of '0'
+ 		if self >= 'A' and self <= 'F' then return code_point - 0x37 # 0x37 is 'A' (0x41) minus 10
+ 		if self >= 'a' and self <= 'f' then return code_point - 0x57 # 0x57 is 'a' (0x61) minus 10
+ 		# Happens if self is not a hexdigit
+ 		assert self.is_hexdigit
+ 		# To make flow analysis happy
+ 		abort
+ 	end
  end
  
  redef class Collection[E]