From 975b92e5debe1fa25db4451a5c771494217a8bcf Mon Sep 17 00:00:00 2001 From: Lucas Bajolet Date: Tue, 15 Dec 2015 11:26:30 -0500 Subject: [PATCH] lib/json: Improved performance of Unicode-escaping in a `Text` Signed-off-by: Lucas Bajolet --- lib/core/text/abstract_text.nit | 37 ++++++++++++++++++++++++------------- lib/json/static.nit | 23 +++++++++++------------ 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/lib/core/text/abstract_text.nit b/lib/core/text/abstract_text.nit index 1de1aa1..0b07edb 100644 --- a/lib/core/text/abstract_text.nit +++ b/lib/core/text/abstract_text.nit @@ -743,21 +743,32 @@ abstract class Text # assert "\\ud800\\udfd3".from_utf16_escape == '𐏓' # assert "\\u00e8".from_utf16_escape == 'è' # assert "\\u3042".from_utf16_escape == 'あ' - fun from_utf16_escape: Char do - var ln = length - if ln != 6 and ln != 12 then return 0xFFFD.code_point - var cphi = substring(2, 4).to_hex - if cphi < 0xD800 then return cphi.code_point - if cphi > 0xDFFF then return cphi.code_point - if cphi > 0xDBFF then return 0xFFFD.code_point - var cp = 0 - cp += (cphi - 0xD800) << 10 - var cplo = substring(8, 4).to_hex + fun from_utf16_escape(pos, ln: nullable Int): Char do + if pos == null then pos = 0 + if ln == null then ln = length - pos + if ln < 6 then return 0xFFFD.code_point + var cp = from_utf16_digit(pos + 2) + if cp < 0xD800 then return cp.code_point + if cp > 0xDFFF then return cp.code_point + if cp > 0xDBFF then return 0xFFFD.code_point + if ln == 6 then return 0xFFFD.code_point + if ln < 12 then return 0xFFFD.code_point + cp <<= 16 + cp += from_utf16_digit(pos + 8) + var cplo = cp & 0xFFFF if cplo < 0xDC00 then return 0xFFFD.code_point if cplo > 0xDFFF then return 0xFFFD.code_point - cp += cplo - 0xDC00 - cp += 0x10000 - return cp.code_point + return cp.from_utf16_surr.code_point + end + + # Returns the value of the 4 hexadecimal digits of a UTF-16 escape starting at `pos` + # + # var s = "\\ud800\\udfd3" + # assert s.from_utf16_digit(2) == 0xD800 + # assert s.from_utf16_digit(8) == 0xDFD3 + 
fun from_utf16_digit(pos: nullable Int): Int do + if pos == null then pos = 0 + return to_hex(pos, 4) end # Encode `self` to percent (or URL) encoding diff --git a/lib/json/static.nit b/lib/json/static.nit index 15a4355..7c0e2cf 100644 --- a/lib/json/static.nit +++ b/lib/json/static.nit @@ -138,7 +138,8 @@ redef class Text protected fun json_to_nit_string: String do var res = new FlatBuffer.with_capacity(bytelen) var i = 0 - while i < self.length do + var ln = self.length + while i < ln do var char = self[i] if char == '\\' then i += 1 @@ -154,21 +155,19 @@ redef class Text else if char == 't' then char = '\t' else if char == 'u' then - var code = substring(i + 1, 4) - var hx = code.to_hex - if hx >= 0xD800 and hx <= 0xDFFF then - var lostr = substring(i + 7, 4) - if lostr.length < 4 then - hx = 0xFFFD + var u16_esc = from_utf16_digit(i + 1) + char = u16_esc.code_point + if char.is_surrogate and i + 10 < ln then + if self[i + 5] == '\\' and self[i + 6] == 'u' then + u16_esc <<= 16 + u16_esc += from_utf16_digit(i + 7) + char = u16_esc.from_utf16_surr.code_point + i += 6 else - hx <<= 16 - hx += lostr.to_hex - hx = hx.from_utf16_surr + char = 0xFFFD.code_point end - i += 6 end i += 4 - char = hx.code_point end # `"`, `/` or `\` => Keep `char` as-is. end -- 1.7.9.5