core/text: Added UTF-16 escape coding/decoding methods
authorLucas Bajolet <r4pass@hotmail.com>
Mon, 2 Nov 2015 17:16:46 +0000 (12:16 -0500)
committerLucas Bajolet <r4pass@hotmail.com>
Fri, 6 Nov 2015 21:33:37 +0000 (16:33 -0500)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/core/text/abstract_text.nit

index baf8ae7..64c22c0 100644 (file)
@@ -718,6 +718,38 @@ abstract class Text
                return res.to_s
        end
 
+       # Returns `self` with all characters escaped with their UTF-16 representation
+       #
+       #     assert "Aรจใ‚๐“".escape_to_utf16 == "\\u0041\\u00e8\\u3042\\ud800\\udfd3"
+       fun escape_to_utf16: String do
+               var buf = new Buffer
+               for i in chars do buf.append i.escape_to_utf16
+               return buf.to_s
+       end
+
+       # Returns the Unicode char escaped by `self`
+       #
+       #     assert "\\u0041".from_utf16_escape == 'A'
+       #     assert "\\ud800\\udfd3".from_utf16_escape == '๐“'
+       #     assert "\\u00e8".from_utf16_escape == 'รจ'
+       #     assert "\\u3042".from_utf16_escape == 'ใ‚'
+       fun from_utf16_escape: Char do
+               var ln = length
+               if ln != 6 and ln != 12 then return 0xFFFD.code_point
+               var cphi = substring(2, 4).to_hex
+               if cphi < 0xD800 then return cphi.code_point
+               if cphi > 0xDFFF then return cphi.code_point
+               if cphi > 0xDBFF then return 0xFFFD.code_point
+               var cp = 0
+               cp += (cphi - 0xD800) << 10
+               var cplo = substring(8, 4).to_hex
+               if cplo < 0xDC00 then return 0xFFFD.code_point
+               if cplo > 0xDFFF then return 0xFFFD.code_point
+               cp += cplo - 0xDC00
+               cp += 0x10000
+               return cp.code_point
+       end
+
        # Encode `self` to percent (or URL) encoding
        #
        #     assert "aBc09-._~".to_percent_encoding == "aBc09-._~"
@@ -1587,6 +1619,46 @@ redef class Char
                return ns.to_s_with_length(ln)
        end
 
+       # Returns `self` escaped to UTF-16
+       #
+       # i.e. Represents `self`.`code_point` using UTF-16 codets escaped
+       # with a `\u`
+       #
+       #     assert 'A'.escape_to_utf16 == "\\u0041"
+       #     assert 'รจ'.escape_to_utf16 == "\\u00e8"
+       #     assert 'ใ‚'.escape_to_utf16 == "\\u3042"
+       #     assert '๐“'.escape_to_utf16 == "\\ud800\\udfd3"
+       fun escape_to_utf16: String do
+               var cp = code_point
+               var buf: Buffer
+               if cp < 0xD800 or (cp >= 0xE000 and cp <= 0xFFFF) then
+                       buf = new Buffer.with_cap(6)
+                       buf.append("\\u0000")
+                       var hx = cp.to_hex
+                       var outid = 5
+                       for i in hx.chars.reverse_iterator do
+                               buf[outid] = i
+                               outid -= 1
+                       end
+               else
+                       buf = new Buffer.with_cap(12)
+                       buf.append("\\u0000\\u0000")
+                       var lo = (((cp - 0x10000) & 0x3FF) + 0xDC00).to_hex
+                       var hi = ((((cp - 0x10000) & 0xFFC00) >> 10) + 0xD800).to_hex
+                       var out = 2
+                       for i in hi do
+                               buf[out] = i
+                               out += 1
+                       end
+                       out = 8
+                       for i in lo do
+                               buf[out] = i
+                               out += 1
+                       end
+               end
+               return buf.to_s
+       end
+
        private fun u8char_tos(r: NativeString, len: Int) `{
                r[len] = '\0';
                switch(len){