X-Git-Url: http://nitlanguage.org

diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit
index fad3ae1..0469a7d 100644
--- a/lib/core/text/native.nit
+++ b/lib/core/text/native.nit
@@ -13,6 +13,7 @@ module native
 
 import kernel
 import math
+import fixed_ints
 
 in "C" `{
 #ifdef __linux__
@@ -22,6 +23,9 @@ in "C" `{
 	#include <libkern/OSByteOrder.h>
 	#define be32toh(x) OSSwapBigToHostInt32(x)
 #endif
+#ifdef _WIN32
+	#define be32toh(val) _byteswap_ulong(val)
+#endif
 
 #ifdef __pnacl__
 	#define be16toh(val) (((val) >> 8) | ((val) << 8))
@@ -47,33 +51,58 @@ redef class Byte
 			return 1
 		end
 	end
+
+	# Is `self` a valid UTF-8 sequence start?
+	#
+	# ~~~nit
+	# assert 0u8.is_valid_utf8_start
+	# assert 0xC0u8.is_valid_utf8_start
+	# assert 0xE0u8.is_valid_utf8_start
+	# assert 0xF0u8.is_valid_utf8_start
+	# ~~~
+	fun is_valid_utf8_start: Bool do
+		if self & 0x80u8 == 0u8 then return true
+		if self & 0b1110_0000u8 == 0b1100_0000u8 then return true
+		if self & 0b1111_0000u8 == 0b1110_0000u8 then return true
+		if self & 0b1111_1000u8 == 0b1111_0000u8 then return true
+		return false
+	end
 end
 
-redef class Int
+redef class UInt32
 	# Returns the code_point from a utf16 surrogate pair
 	#
-	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
-	fun from_utf16_surr: Int do
-		var hi = (self & 0xFFFF0000) >> 16
-		var lo = self & 0xFFFF
-		var cp = 0
-		cp += (hi - 0xD800) << 10
-		cp += lo - 0xDC00
-		cp += 0x10000
+	#     assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32
+	fun from_utf16_surr: UInt32 do
+		var hi = (self & 0xFFFF0000u32) >> 16
+		var lo = self & 0xFFFFu32
+		var cp = 0u32
+		cp += (hi - 0xD800u32) << 10
+		cp += lo - 0xDC00u32
+		cp += 0x10000u32
 		return cp
 	end
+
+	# The character whose code point (unicode-wise) is `self`
+	#
+	#     assert 65u32.code_point == 'A'
+	#     assert 10u32.code_point == '\n'
+	#     assert 0x220Bu32.code_point == '∋'
+	fun code_point: Char `{ return self; `}
 end
 
-# Native strings are simple C char *
-extern class NativeString `{ char* `}
-	# Creates a new NativeString with a capacity of `length`
+# C string `char *`
+#
+# Used as underlying implementation for `String` and some other `Text`.
+extern class CString `{ char* `}
+	# Create a new `CString` with the capacity for `length` characters
 	new(length: Int) is intern
 
-	# Returns a char* starting at `index`.
+	# Get a char* starting at `index`.
 	#
 	# WARNING: Unsafe for extern code, use only for temporary
 	# pointer manipulation purposes (e.g. write to file or such)
-	fun fast_cstring(index: Int): NativeString is intern
+	fun fast_cstring(index: Int): CString is intern
 
 	# Get char at `index`.
 	fun [](index: Int): Byte is intern
@@ -82,7 +111,11 @@ extern class NativeString `{ char* `}
 	fun []=(index: Int, item: Byte) is intern
 
 	# Copy `self` to `dest`.
-	fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
+	fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern
+
+	redef fun ==(o) is intern do return is_same_instance(o)
+
+	redef fun !=(o) is intern do return not is_same_instance(o)
 
 	# Position of the first nul character.
 	fun cstring_length: Int
@@ -116,26 +149,26 @@ extern class NativeString `{ char* `}
 		var c = self[pos]
 		if c & 0x80u8 == 0u8 then return c.ascii
 		var b = fetch_4_hchars(pos)
-		var ret = 0
-		if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
-		if b & 0xE0000000 == 0xC0000000 then
-			ret |= (b & 0x1F000000) >> 18
-			ret |= (b & 0x3F0000) >> 16
+		var ret = 0u32
+		if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point
+		if b & 0xE0000000u32 == 0xC0000000u32 then
+			ret |= (b & 0x1F000000u32) >> 18
+			ret |= (b & 0x3F0000u32) >> 16
 			return ret.code_point
 		end
-		if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point
-		if b & 0xF0000000 == 0xE0000000 then
-			ret |= (b & 0xF000000) >> 12
-			ret |= (b & 0x3F0000) >> 10
-			ret |= (b & 0x3F00) >> 8
+		if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point
+		if b & 0xF0000000u32 == 0xE0000000u32 then
+			ret |= (b & 0xF000000u32) >> 12
+			ret |= (b & 0x3F0000u32) >> 10
+			ret |= (b & 0x3F00u32) >> 8
 			return ret.code_point
 		end
-		if not b & 0xC0 == 0x80 then return 0xFFFD.code_point
-		if b & 0xF8000000 == 0xF0000000 then
-			ret |= (b.to_i & 0x7000000) >> 6
-			ret |= (b.to_i & 0x3F0000) >> 4
-			ret |= (b.to_i & 0x3F00) >> 2
-			ret |= b.to_i & 0x3F
+		if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point
+		if b & 0xF8000000u32 == 0xF0000000u32 then
+			ret |= (b & 0x7000000u32) >> 6
+			ret |= (b & 0x3F0000u32) >> 4
+			ret |= (b & 0x3F00u32) >> 2
+			ret |= b & 0x3Fu32
 			return ret.code_point
 		end
 		return 0xFFFD.code_point
@@ -175,7 +208,7 @@ extern class NativeString `{ char* `}
 		while dist > 0 do
 			while dist >= 4 do
 				var i = fetch_4_chars(ns_i)
-				if i & 0x80808080 != 0 then break
+				if i & 0x80808080u32 != 0u32 then break
 				ns_i += 4
 				my_i += 4
 				dist -= 4
@@ -189,7 +222,7 @@ extern class NativeString `{ char* `}
 		while dist < 0 do
 			while dist <= -4 do
 				var i = fetch_4_chars(ns_i - 4)
-				if i & 0x80808080 != 0 then break
+				if i & 0x80808080u32 != 0u32 then break
 				ns_i -= 4
 				my_i -= 4
 				dist += 4
@@ -231,13 +264,14 @@ extern class NativeString `{ char* `}
 	# If the char is invalid UTF-8, `pos` is returned as-is
 	#
 	# ~~~raw
-	# 	assert "abc".items.find_beginning_of_char_at(2) == 2
-	# 	assert "ã".items.find_beginning_of_char_at(1) == 0
+	#	assert "abc".items.find_beginning_of_char_at(2) == 2
+	#	assert "ã".items.find_beginning_of_char_at(1) == 0
 	#	assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
 	# ~~~
 	fun find_beginning_of_char_at(pos: Int): Int do
 		var endpos = pos
 		var c = self[pos]
+		if c & 0x80u8 == 0x00u8 then return pos
 		while c & 0xC0u8 == 0x80u8 do
 			pos -= 1
 			c = self[pos]
@@ -247,25 +281,40 @@ extern class NativeString `{ char* `}
 		return endpos
 	end
 
-	# Number of UTF-8 characters in `self` between positions `from` and `to`
-	fun utf8_length(from, to: Int): Int do
+	# Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
+	fun utf8_length(from, byte_length: Int): Int is intern do
 		var st = from
-		var lst = to
 		var ln = 0
-		while st <= lst do
-			st += length_of_char_at(st)
+		while byte_length > 0 do
+			while byte_length >= 4 do
+				var i = fetch_4_chars(st)
+				if i & 0x80808080u32 != 0u32 then break
+				byte_length -= 4
+				st += 4
+				ln += 4
+			end
+			if byte_length == 0 then break
+			var cln = length_of_char_at(st)
+			st += cln
 			ln += 1
+			byte_length -= cln
 		end
 		return ln
 	end
 
 	# Fetch 4 chars in `self` at `pos`
-	fun fetch_4_chars(pos: Int): Int is intern do return fetch_4_ffi(pos)
+	fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `}
 
 	# Fetch 4 chars in `self` at `pos`
-	fun fetch_4_hchars(pos: Int): Int is intern do return fetch_4h_ffi(pos)
+	fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `}
 
-	# FIXME: To remove when bootstrap supports PR #1898
-	private fun fetch_4_ffi(pos: Int): Int `{ return (long)*((uint32_t*)(self+pos)); `}
-	private fun fetch_4h_ffi(pos: Int): Int `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}
+	# Right shifts `len` bytes of `self` by `sh` bytes, starting at position `pos`
+	fun rshift(sh, len, pos: Int) do
+		copy_to(self, len, pos, pos + sh)
+	end
+
+	# Left shifts `len` bytes of `self` by `sh` bytes, starting at position `pos`
+	fun lshift(sh, len, pos: Int) do
+		copy_to(self, len, pos, pos - sh)
+	end
 end