From: Lucas Bajolet Date: Fri, 11 Sep 2015 15:36:03 +0000 (-0400) Subject: lib/core: Have Bytes::to_s and stream use the new and improved to_s_with_length X-Git-Tag: v0.7.8~23^2~3 X-Git-Url: http://nitlanguage.org lib/core: Have Bytes::to_s and stream use the new and improved to_s_with_length Signed-off-by: Lucas Bajolet --- diff --git a/lib/core/bytes.nit b/lib/core/bytes.nit index 59c4c5f..332efc2 100644 --- a/lib/core/bytes.nit +++ b/lib/core/bytes.nit @@ -146,80 +146,13 @@ class Bytes redef fun to_s do persisted = true var b = self - if not is_utf8 then - b = clean_utf8 - persisted = false - end - return new FlatString.with_infos(b.items, b.length, 0, b.length -1) + var r = b.items.to_s_with_length(length) + if r != items then persisted = false + return r end redef fun iterator do return new BytesIterator.with_buffer(self) - # Is the byte collection valid UTF-8 ? - fun is_utf8: Bool do - var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8] - var lobounds = once [0, 0x80, 0x800, 0x10000] - var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF] - var pos = 0 - var len = length - var mits = items - while pos < len do - var nxst = mits.length_of_char_at(pos) - var charst_index = (nxst - 1) * 2 - if mits[pos] & charst[charst_index] == charst[charst_index + 1] then - var c = mits.char_at(pos) - var cp = c.ascii - if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then - if cp >= 0xD800 and cp <= 0xDFFF or - cp == 0xFFFE or cp == 0xFFFF then return false - else - return false - end - else - return false - end - pos += nxst - end - return true - end - - # Cleans the bytes of `self` to be UTF-8 compliant - private fun clean_utf8: Bytes do - var charst = once [0x80u8, 0u8, 0xE0u8, 0xC0u8, 0xF0u8, 0xE0u8, 0xF8u8, 0xF0u8] - var badchar = once [0xEFu8, 0xBFu8, 0xBDu8] - var lobounds = once [0, 0x80, 0x800, 0x10000] - var hibounds = once [0x7F, 0x7FF, 0xFFFF, 0x10FFFF] - var pos = 0 - var len = length - var ret = new Bytes.with_capacity(len) - var mits = items - while pos < len do - var nxst = mits.length_of_char_at(pos) - var charst_index = (nxst - 1) * 2 - if mits[pos] & charst[charst_index] == charst[charst_index + 1] then - var c = mits.char_at(pos) - var cp = c.ascii - if cp <= hibounds[nxst - 1] and cp >= lobounds[nxst - 1] then - if cp >= 0xD800 and cp <= 0xDFFF or - cp == 0xFFFE or cp == 0xFFFF then - ret.append badchar - pos += 1 - else - var pend = pos + nxst - for i in [pos .. pend[ do ret.add mits[i] - pos += nxst - end - else - ret.append badchar - pos += 1 - end - else - ret.append badchar - pos += 1 - end - end - return ret - end end private class BytesIterator diff --git a/lib/core/stream.nit b/lib/core/stream.nit index 2db319a..4b1e826 100644 --- a/lib/core/stream.nit +++ b/lib/core/stream.nit @@ -173,12 +173,13 @@ abstract class Reader # ~~~ fun read_all: String do var s = read_all_bytes - if not s.is_utf8 then s = s.clean_utf8 var slen = s.length if slen == 0 then return "" var rets = "" var pos = 0 - var sits = s.items + var str = s.items.clean_utf8(slen) + slen = str.bytelen + var sits = str.items var remsp = slen while pos < slen do # The 129 size was decided more or less arbitrarily diff --git a/lib/core/text/flat.nit b/lib/core/text/flat.nit index 7eebc08..2d4a942 100644 --- a/lib/core/text/flat.nit +++ b/lib/core/text/flat.nit @@ -985,8 +985,7 @@ redef class NativeString redef fun to_s_with_length(length): FlatString do assert length >= 0 - var str = new FlatString.with_infos(self, length, 0, length - 1) - return str + return clean_utf8(length) end redef fun to_s_full(bytelen, unilen) do @@ -997,6 +996,8 @@ redef class NativeString redef fun to_s_with_copy: FlatString do var length = cstring_length + var r = clean_utf8(length) + if r.items != self then return r var new_self = new NativeString(length + 1) copy_to(new_self, length, 0, 0) var str = new FlatString.with_infos(new_self, length, 0, length - 1)