From 3f4025d72cefa976c2cac241a687f173202dc219 Mon Sep 17 00:00:00 2001
From: Lucas Bajolet <r4pass@hotmail.com>
Date: Fri, 10 Jul 2015 16:15:03 -0400
Subject: [PATCH] lib/standard: Added services on NativeString and Char for
 the support of UTF-8

Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>
---
 lib/standard/kernel.nit             |   15 ++++-
 lib/standard/text/abstract_text.nit |   46 ++++++++++++++--
 lib/standard/text/flat.nit          |   32 +++++++++++
 lib/standard/text/native.nit        |  104 +++++++++++++++++++++++++++++++++++
 4 files changed, 191 insertions(+), 6 deletions(-)

diff --git a/lib/standard/kernel.nit b/lib/standard/kernel.nit
index e05d152..c7f0751 100644
--- a/lib/standard/kernel.nit
+++ b/lib/standard/kernel.nit
@@ -719,10 +719,23 @@ universal Char
 	redef type OTHER: Char
 
 	redef fun object_id is intern
+	# Prints `self` on the standard output, encoded as UTF-8 (1 to 4 bytes)
+	redef fun output `{
+		if(self < 0x80){
+			printf("%c", self);
+		}else if(self < 0x800){
+			printf("%c%c", 0xC0 | ((self >> 6) & 0x1F), 0x80 | (self & 0x3F));
+		}else if(self < 0x10000){
+			printf("%c%c%c", 0xE0 | ((self >> 12) & 0xF), 0x80 | ((self >> 6) & 0x3F), 0x80 | (self & 0x3F));
+		}else if(self < 0x200000){
+			printf("%c%c%c%c", 0xF0 | ((self >> 18) & 0x7), 0x80 | ((self >> 12) & 0x3F), 0x80 | ((self >> 6) & 0x3F), 0x80 | (self & 0x3F));
+		}else{ // Bad char: does not fit on 4 bytes, emitted as a single raw byte
+			printf("%c", self);
+		}
+	`}
 	redef fun hash do return ascii
 	redef fun ==(o) is intern
 	redef fun !=(o) is intern
-	redef fun output is intern
 
 	redef fun <=(i) is intern
 	redef fun <(i) is intern
diff --git a/lib/standard/text/abstract_text.nit b/lib/standard/text/abstract_text.nit
index af45dab..cad7522 100644
--- a/lib/standard/text/abstract_text.nit
+++ b/lib/standard/text/abstract_text.nit
@@ -1524,14 +1524,50 @@ redef class Float
 end
 
 redef class Char
+
+	# Number of bytes needed to encode `self` in UTF-8 (1 for out-of-range values)
+	private fun u8char_len: Int do
+		var code = ascii
+		if code < 0x80 then return 1 # Single byte, ASCII range
+		if code < 0x800 then return 2
+		if code < 0x10000 then return 3
+		if code < 0x110000 then return 4
+		# Above 0x10FFFF: bad character format, counted as one byte
+		return 1
+	end
+
 	#     assert 'x'.to_s    == "x"
-	redef fun to_s
-	do
-		var s = new Buffer.with_cap(1)
-		s.chars[0] = self
-		return s.to_s
+	redef fun to_s do
+		var nbytes = u8char_len
+		var buf = new NativeString(nbytes + 1) # One extra byte for the trailing NUL
+		u8char_tos(buf, nbytes)
+		return buf.to_s_with_length(nbytes)
 	end
 
+	# Writes the UTF-8 encoding of `self` (`len` bytes) in `r`, followed by a NUL byte
+	private fun u8char_tos(r: NativeString, len: Int) `{
+		r[len] = '\0';
+		switch(len){
+			case 1:
+				r[0] = self;
+				break;
+			case 2:
+				r[0] = 0xC0 | ((self >> 6) & 0x1F);
+				r[1] = 0x80 | (self & 0x3F);
+				break;
+			case 3:
+				r[0] = 0xE0 | ((self >> 12) & 0xF);
+				r[1] = 0x80 | ((self >> 6) & 0x3F);
+				r[2] = 0x80 | (self & 0x3F);
+				break;
+			case 4:
+				r[0] = 0xF0 | ((self >> 18) & 0x7);
+				r[1] = 0x80 | ((self >> 12) & 0x3F);
+				r[2] = 0x80 | ((self >> 6) & 0x3F);
+				r[3] = 0x80 | (self & 0x3F);
+		}
+	`}
+
 	# Returns true if the char is a numerical digit
 	#
 	#     assert '0'.is_numeric
diff --git a/lib/standard/text/flat.nit b/lib/standard/text/flat.nit
index 04fabc9..66e7734 100644
--- a/lib/standard/text/flat.nit
+++ b/lib/standard/text/flat.nit
@@ -869,6 +869,38 @@ redef class NativeString
 		str.real_items = new_self
 		return str
 	end
+
+	# Writes `c`, encoded in UTF-8, in the bytes starting at position `pos`
+	#
+	# Very unsafe, no bounds check is done: the caller must make sure that
+	# `c.u8char_len` bytes are available at `pos` before calling this function.
+	private fun set_char_at(pos: Int, c: Char) do
+		native_set_char(pos, c, c.u8char_len)
+	end
+
+	# Stores the `ln`-byte UTF-8 encoding of `c` at `self + pos` (no-op if `ln` is not in 1..4)
+	private fun native_set_char(pos: Int, c: Char, ln: Int) `{
+		char* out = self + pos;
+		switch(ln){
+			case 1:
+				out[0] = c;
+				break;
+			case 2:
+				out[0] = 0xC0 | ((c >> 6) & 0x1F);
+				out[1] = 0x80 | (c & 0x3F);
+				break;
+			case 3:
+				out[0] = 0xE0 | ((c >> 12) & 0xF);
+				out[1] = 0x80 | ((c >> 6) & 0x3F);
+				out[2] = 0x80 | (c & 0x3F);
+				break;
+			case 4:
+				out[0] = 0xF0 | ((c >> 18) & 0x7);
+				out[1] = 0x80 | ((c >> 12) & 0x3F);
+				out[2] = 0x80 | ((c >> 6) & 0x3F);
+				out[3] = 0x80 | (c & 0x3F);
+		}
+	`}
 end
 
 redef class Int
diff --git a/lib/standard/text/native.nit b/lib/standard/text/native.nit
index 8b66384..b2789a0 100644
--- a/lib/standard/text/native.nit
+++ b/lib/standard/text/native.nit
@@ -12,6 +12,24 @@
 module native
 
 import kernel
+import math
+
+redef class Byte
+	# Length, in bytes, of the UTF-8 sequence announced by `self` as a lead byte
+	#
+	# Only `self` is inspected, the following bytes are not checked.
+	# A byte that cannot start a sequence counts as 1.
+	private fun u8len: Int do
+		# 0xxx_xxxx: ASCII, stands alone
+		if self & 0x80u8 == 0x00u8 then return 1
+		# 110x_xxxx, 1110_xxxx and 1111_0xxx: lead bytes of 2, 3 and 4-byte sequences
+		if self & 0xE0u8 == 0xC0u8 then return 2
+		if self & 0xF0u8 == 0xE0u8 then return 3
+		if self & 0xF8u8 == 0xF0u8 then return 4
+		# Continuation byte (10xx_xxxx) or invalid lead byte (1111_1xxx)
+		return 1
+	end
+end
 
 # Native strings are simple C char *
 extern class NativeString `{ char* `}
@@ -46,4 +64,90 @@ extern class NativeString `{ char* `}
 
 	# Parse `self` as a Float.
 	fun atof: Float `{ return atof(self); `}
+
+	# Gets the UTF-8 char at index `pos`
+	#
+	# Index is expressed in Unicode chars
+	#
+	# ~~~raw
+	#     assert "ããã".as(FlatString).items.char_at(0) == 'ã'
+	# ~~~
+	#
+	# If the char at position pos is an invalid Unicode char,
+	# the Unicode replacement character ï¿½ (0xFFFD) will be used.
+	#
+	# ~~~raw
+	#     assert "ããã".as(FlatString).items.char_at(1) == 'ï¿½'
+	# ~~~
+	fun char_at(pos: Int): Char `{
+		char c = self[pos];
+		if((c & 0x80) == 0x00) return (uint32_t)c;
+		if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
+		if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
+		if(((c & 0xF7) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
+		return 0xFFFD;
+	`}
+
+	# Byte index of the char at position `n`, found by scanning the UTF-8 chars from char 0 at byte 0
+	fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
+
+	# Length, in bytes, of the UTF-8 char starting at byte `pos` (1 if invalid sequence)
+	#
+	# A multi-byte length is only returned when the lead byte is followed
+	# by all the continuation bytes (`10xx_xxxx`) that it announces.
+	fun length_of_char_at(pos: Int): Int do
+		var c = self[pos]
+		if c & 0x80u8 == 0x00u8 then return 1
+		if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then return 2
+		if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then return 3
+		# The lead byte of a 4-byte char is `1111_0xxx`: the 0xF8 mask keeps bit 3
+		# in the test, so 0xF0..0xF7 are accepted and 0xF8..0xFF are rejected.
+		if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then return 4
+		# Continuation byte, invalid lead byte or missing continuation bytes
+		return 1
+	end
+
+	# Byte index of the char at position `n`, seeking from a known position
+	#
+	# `char_from` is the index of a char and `byte_from` the byte index to seek from
+	# for that char: the scan goes forward or backward from there up to `n`.
+	#
+	# NOTE: the pair is not checked, it is up to the client to ensure its validity.
+	fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
+		var byte_pos = byte_from
+		var char_pos = char_from
+		# `n` is after the cached position: skip whole chars forward
+		while char_pos < n do
+			byte_pos += length_of_char_at(byte_pos)
+			char_pos += 1
+		end
+		# `n` is before the cached position: step back to the start of each previous char
+		while char_pos > n do
+			byte_pos = find_beginning_of_char_at(byte_pos - 1)
+			char_pos -= 1
+		end
+
+		return byte_pos
+	end
+
+	# Returns the byte index where the char covering byte `pos` begins
+	#
+	# If no valid UTF-8 char starting at or before `pos` covers it, `pos` is returned as-is
+	#
+	# ~~~raw
+	# 	assert "abc".items.find_beginning_of_char_at(2) == 2
+	# 	assert "か".items.find_beginning_of_char_at(1) == 0
+	#	assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
+	# ~~~
+	fun find_beginning_of_char_at(pos: Int): Int do
+		var endpos = pos
+		var c = self[pos]
+		# Walk back over continuation bytes (`10xx_xxxx`), never before the first byte
+		while pos > 0 and c & 0xC0u8 == 0x80u8 do
+			pos -= 1
+			c = self[pos]
+		end
+		if length_of_char_at(pos) >= (endpos - pos + 1) then return pos
+		return endpos
+	end
 end
-- 
1.7.9.5