X-Git-Url: http://nitlanguage.org

diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit
index b1e6a04..d4873ad 100644
--- a/lib/core/text/abstract_text.nit
+++ b/lib/core/text/abstract_text.nit
@@ -68,7 +68,7 @@ abstract class Text
 	fun substring(from: Int, count: Int): SELFTYPE is abstract
 
 	# Iterates on the substrings of self if any
-	fun substrings: Iterator[FlatText] is abstract
+	private fun substrings: Iterator[FlatText] is abstract
 
 	# Is the current Text empty (== "")
 	#
@@ -146,15 +146,7 @@ abstract class Text
 	# Returns -1 if not found
 	#
 	# DEPRECATED : Use self.chars.last_index_of_from instead
-	fun last_index_of_from(item: Char, pos: Int): Int
-	do
-		var iter = self.chars.reverse_iterator_from(pos)
-		while iter.is_ok do
-			if iter.item == item then return iter.index
-			iter.next
-		end
-		return -1
-	end
+	fun last_index_of_from(item: Char, pos: Int): Int do return self.chars.last_index_of_from(item, pos)
 
 	# Gets an iterator on the chars of self
 	#
@@ -543,7 +535,7 @@ abstract class Text
 
 		if c >= '0' and c <= '9' then
 			res.add('_')
-			res.append(c.ascii.to_s)
+			res.append(c.code_point.to_s)
 			res.add('d')
 			start = 1
 		end
@@ -555,7 +547,7 @@ abstract class Text
 				continue
 			end
 			if underscore then
-				res.append('_'.ascii.to_s)
+				res.append('_'.code_point.to_s)
 				res.add('d')
 			end
 			if c >= '0' and c <= '9' then
@@ -566,13 +558,13 @@ abstract class Text
 				underscore = true
 			else
 				res.add('_')
-				res.append(c.ascii.to_s)
+				res.append(c.code_point.to_s)
 				res.add('d')
 				underscore = false
 			end
 		end
 		if underscore then
-			res.append('_'.ascii.to_s)
+			res.append('_'.code_point.to_s)
 			res.add('d')
 		end
 		return res.to_s
@@ -587,7 +579,7 @@ abstract class Text
 	# Three digits are always used to avoid following digits to be interpreted as an element
 	# of the octal sequence.
 	#
-	#     assert "{0.ascii}{1.ascii}{8.ascii}{31.ascii}{32.ascii}".escape_to_c == "\\000\\001\\010\\037 "
+	#     assert "{0.code_point}{1.code_point}{8.code_point}{31.code_point}{32.code_point}".escape_to_c == "\\000\\001\\010\\037 "
 	#
 	# The exceptions are the common `\t` and `\n`.
 	fun escape_to_c: String
@@ -605,9 +597,9 @@ abstract class Text
 				b.append("\\\'")
 			else if c == '\\' then
 				b.append("\\\\")
-			else if c.ascii < 32 then
+			else if c.code_point < 32 then
 				b.add('\\')
-				var oct = c.ascii.to_base(8, false)
+				var oct = c.code_point.to_base(8, false)
 				# Force 3 octal digits since it is the
 				# maximum allowed in the C specification
 				if oct.length == 1 then
@@ -680,8 +672,8 @@ abstract class Text
 			else if c == ':' or c == ' ' or c == '#' then
 				b.add('\\')
 				b.add(c)
-			else if c.ascii < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then
-				b.append("?{c.ascii.to_base(16, false)}")
+			else if c.code_point < 32 or c == ';' or c == '|' or c == '\\' or c == '=' then
+				b.append("?{c.code_point.to_base(16, false)}")
 			else
 				b.add(c)
 			end
@@ -695,7 +687,7 @@ abstract class Text
 	#     assert s.length        ==  2
 	#     var u = s.unescape_nit
 	#     assert u.length        ==  1
-	#     assert u.chars[0].ascii      ==  10 # (the ASCII value of the "new line" character)
+	#     assert u.chars[0].code_point      ==  10 # (the ASCII value of the "new line" character)
 	fun unescape_nit: String
 	do
 		var res = new Buffer.with_cap(self.length)
@@ -726,6 +718,38 @@ abstract class Text
 		return res.to_s
 	end
 
+	# Returns `self` with every character replaced by its `\u`-escaped UTF-16 representation
+	#
+	#     assert "Aèあ𐏓".escape_to_utf16 == "\\u0041\\u00e8\\u3042\\ud800\\udfd3"
+	fun escape_to_utf16: String do
+		var buf = new Buffer
+		for c in chars do buf.append(c.escape_to_utf16)
+		return buf.to_s
+	end
+
+	# Returns the Unicode char escaped by `self`
+	#
+	# `self` is one `\uXXXX` escape, or two of them forming a surrogate pair;
+	# U+FFFD is returned when the length or the surrogates are invalid.
+	#
+	#     assert "\\u0041".from_utf16_escape == 'A'
+	#     assert "\\ud800\\udfd3".from_utf16_escape == '𐏓'
+	#     assert "\\u00e8".from_utf16_escape == 'è'
+	#     assert "\\u3042".from_utf16_escape == 'あ'
+	#     assert "\\ud800".from_utf16_escape == 0xFFFD.code_point
+	fun from_utf16_escape: Char do
+		var ln = length
+		if ln != 6 and ln != 12 then return 0xFFFD.code_point
+		var cphi = substring(2, 4).to_hex
+		# Outside the surrogate range: the first unit is the code point itself
+		if cphi < 0xD800 or cphi > 0xDFFF then return cphi.code_point
+		# A low surrogate cannot come first, a high one needs a second escape
+		if cphi > 0xDBFF or ln != 12 then return 0xFFFD.code_point
+		var cplo = substring(8, 4).to_hex
+		if cplo < 0xDC00 or cplo > 0xDFFF then return 0xFFFD.code_point
+		return (((cphi - 0xD800) << 10) + (cplo - 0xDC00) + 0x10000).code_point
+	end
+
 	# Encode `self` to percent (or URL) encoding
 	#
 	#     assert "aBc09-._~".to_percent_encoding == "aBc09-._~"
@@ -787,7 +811,7 @@ abstract class Text
 			if c == '%' then
 				if i + 2 >= length then
 					# What follows % has been cut off
-					buf[l] = '?'.ascii.to_b
+					buf[l] = '?'.ascii
 				else
 					i += 1
 					var hex_s = substring(i, 2)
@@ -797,11 +821,11 @@ abstract class Text
 						i += 1
 					else
 						# What follows a % is not Hex
-						buf[l] = '?'.ascii.to_b
+						buf[l] = '?'.ascii
 						i -= 1
 					end
 				end
-			else buf[l] = c.ascii.to_b
+			else buf[l] = c.ascii
 
 			i += 1
 			l += 1
@@ -905,7 +929,7 @@ abstract class Text
 
 			for i in [0..length[ do
 				var char = chars[i]
-				h = (h << 5) + h + char.ascii
+				h = (h << 5) + h + char.code_point
 			end
 
 			hash_cache = h
@@ -950,6 +974,65 @@ abstract class Text
 		return s.plain_to_s
 	end
 
+	# Levenshtein distance between `self` and `other`
+	#
+	# This is the smallest number of single-character insertions, deletions
+	# and substitutions that turn `self` into `other`.
+	#
+	# ~~~
+	# assert "abcd".levenshtein_distance("abcd") == 0
+	# assert "".levenshtein_distance("abcd")     == 4
+	# assert "abcd".levenshtein_distance("")     == 4
+	# assert "abcd".levenshtein_distance("xyz")  == 4
+	# assert "abcd".levenshtein_distance("xbdy") == 3
+	# ~~~
+	fun levenshtein_distance(other: String): Int
+	do
+		var slen = self.length
+		var olen = other.length
+
+		# Trivial answers: one side is empty, or both are equal
+		if slen == 0 then return olen
+		if olen == 0 then return slen
+		if self == other then return 0
+
+		# Only two rows of the distance matrix are kept at any time:
+		# `prev` is the last completed row, `cur` is the row being filled
+		var prev = new Array[Int].with_capacity(olen+1)
+		var cur = new Array[Int].with_capacity(olen+1)
+
+		# First row: building the first `j` chars of `other` from "" costs `j` inserts
+		for j in [0..olen] do
+			prev[j] = j
+		end
+
+		for i in [0..slen[ do
+			# First column: dropping the first `i + 1` chars of `self`
+			cur[0] = i + 1
+
+			for j in [0..olen[ do
+				# Substitution, free when both chars are the same
+				var best = prev[j]
+				if self[i] != other[j] then best += 1
+				# Deletion
+				best = best.min(cur[j] + 1)
+				# Insertion
+				best = best.min(prev[j + 1] + 1)
+				# The cheapest of the three edits wins
+				cur[j+1] = best
+			end
+
+			# Swap the rows: `cur` becomes `prev` for the next
+			# iteration and the old `prev` is recycled as `cur`
+			var swap = prev
+			prev = cur
+			cur = swap
+		end
+
+		# After the final swap, `prev` is the row filled for the last char of `self`
+		return prev[olen]
+	end
+
 	# Copies `n` bytes from `self` at `src_offset` into `dest` starting at `dest_offset`
 	#
 	# Basically a high-level synonym of NativeString::copy_to
@@ -981,10 +1064,7 @@ abstract class FlatText
 	#
 	# Warning : Might be void in some subclasses, be sure to check
 	# if set before using it.
-	private var items: NativeString is noinit
-
-	# Real items, used as cache for to_cstring is called
-	private var real_items: nullable NativeString = null
+	var items: NativeString is noinit
 
 	# Returns a char* starting at position `first_byte`
 	#
@@ -1001,7 +1081,7 @@ abstract class FlatText
 	#
 	# As always, do not modify the content of the String in C code, if this is what you want
 	# copy locally the char* as Nit Strings are immutable.
-	private fun fast_cstring: NativeString is abstract
+	fun fast_cstring: NativeString is abstract
 
 	redef var length = 0
 
@@ -1050,7 +1130,7 @@ private abstract class StringByteView
 
 	redef fun is_empty do return target.is_empty
 
-	redef fun length do return target.length
+	redef fun length do return target.bytelen
 
 	redef fun iterator do return self.iterator_from(0)
 
@@ -1573,9 +1653,15 @@ end
 
 redef class Char
 
+	# Returns a sequence with the UTF-8 bytes of `self`
+	#
+	#     assert 'a'.bytes == [0x61u8]
+	#     assert 'ま'.bytes == [0xE3u8, 0x81u8, 0xBEu8]
+	fun bytes: SequenceRead[Byte] do return to_s.bytes
+
 	# Length of `self` in a UTF-8 String
 	private fun u8char_len: Int do
-		var c = self.ascii
+		var c = self.code_point
 		if c < 0x80 then return 1
 		if c <= 0x7FF then return 2
 		if c <= 0xFFFF then return 3
@@ -1592,6 +1678,46 @@ redef class Char
 		return ns.to_s_with_length(ln)
 	end
 
+	# Returns `self` escaped to UTF-16
+	#
+	# i.e. Represents `self`.`code_point` using UTF-16 code units escaped with a `\u`
+	#
+	#     assert 'A'.escape_to_utf16 == "\\u0041"
+	#     assert 'è'.escape_to_utf16 == "\\u00e8"
+	#     assert 'あ'.escape_to_utf16 == "\\u3042"
+	#     assert '𐏓'.escape_to_utf16 == "\\ud800\\udfd3"
+	fun escape_to_utf16: String do
+		var cp = code_point
+		var buf: Buffer
+		# One code unit covers the whole BMP; a lone surrogate is emitted as is
+		if cp <= 0xFFFF then
+			buf = new Buffer.with_cap(6)
+			buf.append("\\u0000")
+			# Right-align the hex digits over the zero padding
+			var outid = 5
+			for i in cp.to_hex.chars.reverse_iterator do
+				buf[outid] = i
+				outid -= 1
+			end
+		else
+			buf = new Buffer.with_cap(12)
+			buf.append("\\u0000\\u0000")
+			var lo = (((cp - 0x10000) & 0x3FF) + 0xDC00).to_hex
+			var hi = ((((cp - 0x10000) & 0xFFC00) >> 10) + 0xD800).to_hex
+			var out = 2
+			for i in hi do
+				buf[out] = i
+				out += 1
+			end
+			out = 8
+			for i in lo do
+				buf[out] = i
+				out += 1
+			end
+		end
+		return buf.to_s
+	end
+
 	private fun u8char_tos(r: NativeString, len: Int) `{
 		r[len] = '\0';
 		switch(len){
@@ -1642,6 +1768,16 @@ redef class Char
 		return (self >= 'a' and self <= 'z') or (self >= 'A' and self <= 'Z')
 	end
 
+	# Is `self` a hexadecimal digit (`0`-`9`, `a`-`f` or `A`-`F`)?
+	#
+	#     assert 'A'.is_hexdigit
+	#     assert not 'G'.is_hexdigit
+	#     assert 'a'.is_hexdigit
+	#     assert not 'g'.is_hexdigit
+	#     assert '5'.is_hexdigit
+	fun is_hexdigit: Bool do return (self >= 'a' and self <= 'f') or (self >= 'A' and self <= 'F') or
+					(self >= '0' and self <= '9')
+
 	# Returns true if the char is an alpha or a numeric digit
 	#
 	#     assert 'a'.is_alphanumeric