From dbf3efae9b8cf65d14ea991b67010b25a7bad363 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Alexis=20Laferri=C3=A8re?= Date: Wed, 8 Feb 2017 01:51:21 -0500 Subject: [PATCH] text: use UInt32 to manipulate chars MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Alexis Laferrière --- lib/core/text/abstract_text.nit | 16 ++++---- lib/core/text/flat.nit | 2 +- lib/core/text/native.nit | 73 ++++++++++++++++++--------------- lib/json/static.nit | 2 +- src/compiler/abstract_compiler.nit | 4 +- src/interpreter/naive_interpreter.nit | 4 +- 6 files changed, 54 insertions(+), 47 deletions(-) diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index ea438de..a09843d 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -789,17 +789,17 @@ abstract class Text if pos == null then pos = 0 if ln == null then ln = length - pos if ln < 6 then return 0xFFFD.code_point - var cp = from_utf16_digit(pos + 2) - if cp < 0xD800 then return cp.code_point - if cp > 0xDFFF then return cp.code_point - if cp > 0xDBFF then return 0xFFFD.code_point + var cp = from_utf16_digit(pos + 2).to_u32 + if cp < 0xD800u32 then return cp.code_point + if cp > 0xDFFFu32 then return cp.code_point + if cp > 0xDBFFu32 then return 0xFFFD.code_point if ln == 6 then return 0xFFFD.code_point if ln < 12 then return 0xFFFD.code_point cp <<= 16 - cp += from_utf16_digit(pos + 8) - var cplo = cp & 0xFFFF - if cplo < 0xDC00 then return 0xFFFD.code_point - if cplo > 0xDFFF then return 0xFFFD.code_point + cp += from_utf16_digit(pos + 8).to_u32 + var cplo = cp & 0xFFFFu32 + if cplo < 0xDC00u32 then return 0xFFFD.code_point + if cplo > 0xDFFFu32 then return 0xFFFD.code_point return cp.from_utf16_surr.code_point end diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index a28a264..de69148 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -1359,7 +1359,7 @@ redef class CString while rem > 0 do while rem >= 4 do var i = fetch_4_chars(pos) - if i & 0x80808080 != 0 then break + if i & 0x80808080u32 != 0u32 then break pos += 4 chr_ln += 4 rem -= 4 diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit index 0ff3787..0469a7d 100644 --- a/lib/core/text/native.nit +++ b/lib/core/text/native.nit @@ -13,6 +13,7 @@ module native import kernel import math +import fixed_ints in "C" `{ #ifdef __linux__ @@ -68,19 +69,26 @@ redef class Byte end end -redef class Int +redef class UInt32 # Returns the code_point from a utf16 surrogate pair # - # assert 0xD83DDE02.from_utf16_surr == 0x1F602 - fun from_utf16_surr: Int do - var hi = (self & 0xFFFF0000) >> 16 - var lo = self & 0xFFFF - var cp = 0 - cp += (hi - 0xD800) << 10 - cp += lo - 0xDC00 - cp += 0x10000 + # assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32 + fun from_utf16_surr: UInt32 do + var hi = (self & 0xFFFF0000u32) >> 16 + var lo = self & 0xFFFFu32 + var cp = 0u32 + cp += (hi - 0xD800u32) << 10 + cp += lo - 0xDC00u32 + cp += 0x10000u32 return cp end + + # The character which code point (unicode-wise) is `self` + # + # assert 65u32.code_point == 'A' + # assert 10u32.code_point == '\n' + # assert 0x220Bu32.code_point == '∋' + fun code_point: Char `{ return self; `} end # C string `char *` @@ -141,26 +149,26 @@ extern class CString `{ char* `} var c = self[pos] if c & 0x80u8 == 0u8 then return c.ascii var b = fetch_4_hchars(pos) - var ret = 0 - if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point - if b & 0xE0000000 == 0xC0000000 then - ret |= (b & 0x1F000000) >> 18 - ret |= (b & 0x3F0000) >> 16 + var ret = 0u32 + if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point + if b & 0xE0000000u32 == 0xC0000000u32 then + ret |= (b & 0x1F000000u32) >> 18 + ret |= (b & 0x3F0000u32) >> 16 return ret.code_point end - if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point - if b & 0xF0000000 == 0xE0000000 then - ret |= (b & 0xF000000) >> 12 - ret |= (b & 0x3F0000) >> 10 - ret |= (b & 0x3F00) >> 8 + if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point + if b & 0xF0000000u32 == 0xE0000000u32 then + ret |= (b & 0xF000000u32) >> 12 + ret |= (b & 0x3F0000u32) >> 10 + ret |= (b & 0x3F00u32) >> 8 return ret.code_point end - if not b & 0xC0 == 0x80 then return 0xFFFD.code_point - if b & 0xF8000000 == 0xF0000000 then - ret |= (b.to_i & 0x7000000) >> 6 - ret |= (b.to_i & 0x3F0000) >> 4 - ret |= (b.to_i & 0x3F00) >> 2 - ret |= b.to_i & 0x3F + if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point + if b & 0xF8000000u32 == 0xF0000000u32 then + ret |= (b & 0x7000000u32) >> 6 + ret |= (b & 0x3F0000u32) >> 4 + ret |= (b & 0x3F00u32) >> 2 + ret |= b & 0x3Fu32 return ret.code_point end return 0xFFFD.code_point @@ -200,7 +208,7 @@ extern class CString `{ char* `} while dist > 0 do while dist >= 4 do var i = fetch_4_chars(ns_i) - if i & 0x80808080 != 0 then break + if i & 0x80808080u32 != 0u32 then break ns_i += 4 my_i += 4 dist -= 4 @@ -214,7 +222,7 @@ extern class CString `{ char* `} while dist < 0 do while dist <= -4 do var i = fetch_4_chars(ns_i - 4) - if i & 0x80808080 != 0 then break + if i & 0x80808080u32 != 0u32 then break ns_i -= 4 my_i -= 4 dist += 4 @@ -256,8 +264,8 @@ extern class CString `{ char* `} # If the char is invalid UTF-8, `pos` is returned as-is # # ~~~raw - # assert "abc".items.find_beginning_of_char_at(2) == 2 - # assert "か".items.find_beginning_of_char_at(1) == 0 + # assert "abc".items.find_beginning_of_char_at(2) == 2 + # assert "か".items.find_beginning_of_char_at(1) == 0 # assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1 # ~~~ fun find_beginning_of_char_at(pos: Int): Int do @@ -280,7 +288,7 @@ extern class CString `{ char* `} while byte_length > 0 do while byte_length >= 4 do var i = fetch_4_chars(st) - if i & 0x80808080 != 0 then break + if i & 0x80808080u32 != 0u32 then break byte_length -= 4 st += 4 ln += 4 @@ -295,11 +303,10 @@ extern class CString `{ char* `} end # Fetch 4 chars in `self` at `pos` - fun fetch_4_chars(pos: Int): Int is intern `{ return (long)*((uint32_t*)(self+pos)); `} + fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `} # Fetch 4 chars in `self` at `pos` - fun fetch_4_hchars(pos: Int): Int is intern `{ return (long)be32toh(*((uint32_t*)(self+pos))); `} - + fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `} # Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos` fun rshift(sh, len, pos: Int) do diff --git a/lib/json/static.nit b/lib/json/static.nit index f3107cc..f5eef1d 100644 --- a/lib/json/static.nit +++ b/lib/json/static.nit @@ -81,7 +81,7 @@ redef class Text if self[i + 5] == '\\' and self[i + 6] == 'u' then u16_esc <<= 16 u16_esc += from_utf16_digit(i + 7) - char = u16_esc.from_utf16_surr.code_point + char = u16_esc.to_u32.from_utf16_surr.code_point i += 6 else char = 0xFFFD.code_point diff --git a/src/compiler/abstract_compiler.nit b/src/compiler/abstract_compiler.nit index 47f813d..f0bb0bd 100644 --- a/src/compiler/abstract_compiler.nit +++ b/src/compiler/abstract_compiler.nit @@ -2612,10 +2612,10 @@ redef class AMethPropdef v.ret(v.new_expr("(char*){alloc}", ret.as(not null))) return true else if pname == "fetch_4_chars" then - v.ret(v.new_expr("(long)*((uint32_t*)({arguments[0]} + {arguments[1]}))", ret.as(not null))) + v.ret(v.new_expr("*((uint32_t*)({arguments[0]} + {arguments[1]}))", ret.as(not null))) return true else if pname == "fetch_4_hchars" then - v.ret(v.new_expr("(long)be32toh(*((uint32_t*)({arguments[0]} + {arguments[1]})))", ret.as(not null))) + v.ret(v.new_expr("(uint32_t)be32toh(*((uint32_t*)({arguments[0]} + {arguments[1]})))", ret.as(not null))) return true end else if cname == "NativeArray" then diff --git a/src/interpreter/naive_interpreter.nit b/src/interpreter/naive_interpreter.nit index 34530b7..233bdce 100644 --- a/src/interpreter/naive_interpreter.nit +++ b/src/interpreter/naive_interpreter.nit @@ -1174,9 +1174,9 @@ redef class AMethPropdef var ns = recvval.fast_cstring(args[1].to_i) return v.c_string_instance(ns.to_s) else if pname == "fetch_4_chars" then - return v.int_instance(args[0].val.as(CString).fetch_4_chars(args[1].to_i)) + return v.uint32_instance(args[0].val.as(CString).fetch_4_chars(args[1].to_i)) else if pname == "fetch_4_hchars" then - return v.int_instance(args[0].val.as(CString).fetch_4_hchars(args[1].to_i)) + return v.uint32_instance(args[0].val.as(CString).fetch_4_hchars(args[1].to_i)) else if pname == "utf8_length" then return v.int_instance(args[0].val.as(CString).utf8_length(args[1].to_i, args[2].to_i)) end -- 1.7.9.5