lib/core/text: Wrapper of ICU's UTF-16 encoded strings and conversion
author Ana Daouda <anadaouda@gmail.com>
Fri, 16 Aug 2019 20:17:03 +0000 (16:17 -0400)
committer Ana Daouda <anadaouda@gmail.com>
Tue, 20 Aug 2019 17:33:58 +0000 (13:33 -0400)
Signed-off-by: Ana Daouda <anadaouda@gmail.com>

lib/core/text/u16_string.nit [new file with mode: 0644]

diff --git a/lib/core/text/u16_string.nit b/lib/core/text/u16_string.nit
new file mode 100644 (file)
index 0000000..e9a9986
--- /dev/null
@@ -0,0 +1,250 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Wrapper of ICU's UTF-16 encoded strings and conversion
+#
+# This module is meant to ease the use of complex string operations provided by the ICU library.
+# It provides a wrapper for ICU's string structure, `UChar *`, as well as conversion functions to/from `String`.
+module u16_string is pkgconfig ("icu-io", "icu-i18n", "icu-uc")
+
+intrude import abstract_text
+import core
+
+`{
+       #include <unicode/utypes.h>
+       #include <unicode/ustring.h>
+       #include <unicode/utf16.h>
+`}
+
+
+# UTF-16 encoded string backed by an ICU `UChar *` buffer
+#
+# `length` and indexes given to `[]` are expressed in code points,
+# whereas `capacity` and `code_units` are expressed in UTF-16 code units.
+class U16String
+       super Finalizable
+       super Text
+
+       # Pointer to a `UChar *` string
+       private var uchar_string: UCharString
+
+       # Number of code units (aka UTF-16 encoded code units or `UChar`) allocated to `uchar_string`
+       private var capacity = 0
+
+       # Number of code units actually in `uchar_string`.
+       # `code_units` <= `capacity`.
+       private var code_units = 0
+
+       # Number of code points found in the first `code_units` code units of `self`
+       redef fun length: Int do return uchar_string.code_points(code_units)
+
+       # Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
+       # The `cap` argument is the number of code units (aka UTF-16 encoded characters or `UChar`) allocated to `uchar_string`.
+       # If the number of code units is known in advance, it can be provided with the `units` parameter.
+       #
+       # `cap` must not be negative and `units`, when given, must be within `[0, cap]`.
+       init (cap: nullable Int, units: nullable Int) do
+               if cap == null then
+                       uchar_string = new UCharString.nul
+               else
+                       assert cap >= 0
+
+                       if units != null then
+                               # Keep the invariant `0 <= code_units <= capacity`
+                               assert units >= 0 and units <= cap
+                               code_units = units
+                       end
+
+                       uchar_string = new UCharString.empty(cap)
+                       capacity = cap
+               end
+       end
+
+       # Returns a converted `U16String` from a `String`
+       init from_string(source: String) do
+               var csource = source.to_cstring
+               var csource_length = source.byte_length
+
+               # First pass on a NULL buffer of capacity 0 to learn the required number of code units
+               uchar_string = new UCharString.nul
+               var required_length = uchar_string.from_cstring(0, csource, csource_length)
+
+               # Second pass performs the actual conversion in a buffer of the right size
+               uchar_string = new UCharString.empty(required_length)
+               uchar_string.from_cstring(required_length, csource, csource_length)
+
+               # The buffer is exactly filled, so both counters are the length reported by the conversion
+               capacity = required_length
+               code_units = required_length
+       end
+
+       # Copies the characters of `source` to `self`.
+       # A maximum of `self.capacity` code units will be copied to `self`.
+       # If a code point >0xFFFF has to be divided, it will not be copied.
+       fun copy_from(source: String) do
+               uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)
+
+               # Only count the code units of the code points that entirely fit in `capacity`,
+               # so that `code_units` never refers to data outside of the allocated buffer.
+               var units = 0
+               for c in source.chars do
+                       var width = if c.code_point > 0xFFFF then 2 else 1
+                       if units + width > capacity then break
+                       units += width
+               end
+               code_units = units
+       end
+
+       redef fun chars do return new U16StringCharView(self)
+
+       # Returns the code point at code point position `index`
+       #
+       # The string is walked from its start, so the cost grows with `index`.
+       redef fun [](index: Int): Char do
+               assert index >= 0 and index < length
+               var offset = 0
+               var c = '\0'
+
+               for i in [0..index] do
+                       c = uchar_string.char_at_offset(offset, code_units)
+                       # Code points above 0xFFFF are stored on two code units (surrogate pair)
+                       if c.code_point > 0xFFFF then offset += 2 else offset += 1
+               end
+               return c
+       end
+
+       # Returns a newly allocated, UTF-8 encoded `CString` copy of `self`
+       redef fun to_cstring: CString do
+               # First pass on a NULL destination to learn the required number of bytes
+               var cself = new CString.nul
+               var required_length = uchar_string.to_cstring(cself, 0, code_units)
+
+               # One extra byte is reserved after the converted content
+               cself = new CString(required_length + 1)
+               uchar_string.to_cstring(cself, required_length + 1, code_units)
+
+               return cself
+       end
+
+       # Returns the number of UTF-8 code units (bytes) in `self`
+       redef fun byte_length: Int do
+               var offset = 0
+               var l = 0
+
+               # Walk the buffer once, one code point at a time
+               while offset < code_units do
+                       var b = uchar_string.char_at_offset(offset, code_units).code_point
+                       if b > 0xFFFF then offset += 2 else offset += 1
+
+                       if b <= 0x7F then
+                               l += 1
+                       else if b <= 0x7FF then
+                               l += 2
+                       else if b <= 0xFFFF then
+                               l += 3
+                       else
+                               l += 4
+                       end
+               end
+               return l
+       end
+
+       redef fun to_s: String do return to_cstring.to_s_with_length(byte_length)
+
+       # Releases the underlying `UChar *` buffer
+       redef fun finalize do uchar_string.free
+end
+
+# ICU string (`UChar *`), which is a UTF-16 encoded string
+extern class UCharString `{ UChar *`}
+
+       # Returns a zero-filled `UCharString` with room for `length` code units
+       #
+       # The returned pointer is `NULL` if the allocation fails.
+       new empty (length: Int) `{
+               /* calloc zero-fills the buffer and never writes through a failed allocation */
+               return (UChar *)calloc((size_t)length, sizeof(UChar));
+       `}
+
+       # Returns a `NULL` `UCharString`
+       new nul `{ return NULL; `}
+
+       # Returns the number of code points in the first `code_units` code units of `self`
+       #
+       # Returns -1 if `self` is `NULL`.
+       fun code_points(code_units: Int): Int `{
+               if (self == NULL) {
+                       return -1;
+               }
+               return u_countChar32(self, code_units);
+       `}
+
+       # Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
+       #
+       # `dest_length` is the capacity of `self` in code units and `source_length` the number of bytes of `source`.
+       # The ICU error code is not reported to the caller.
+       fun from_cstring(dest_length: Int, source: CString, source_length: Int): Int `{
+               UErrorCode error = U_ZERO_ERROR;
+               /* Start from 0 so the result is defined even if the call fails before storing a length */
+               int32_t res = 0;
+               u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
+               return res;
+       `}
+
+       # Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
+       #
+       # `dest_length` is the capacity of `dest` in bytes and `source_length` the number of code units of `self`.
+       # The ICU error code is not reported to the caller.
+       fun to_cstring(dest: CString, dest_length: Int, source_length: Int): Int `{
+               UErrorCode error = U_ZERO_ERROR;
+               /* Start from 0 so the result is defined even if the call fails before storing a length */
+               int32_t res = 0;
+               u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
+               return res;
+       `}
+
+       # Get code point at code unit `offset`
+       #
+       # `code_units` is the limit, in code units, that the read must not cross.
+       fun char_at_offset(offset: Int, code_units: Int): Char `{
+               UChar32 c = 0;
+               /* U16_NEXT advances the local copy of `offset`; the caller's value is unaffected */
+               U16_NEXT(self, offset, code_units, c);
+               return c;
+       `}
+end
+
+# Forward iterator over the code points of a `U16String`
+private class U16StringCharIterator
+       super IndexedIterator[Char]
+
+       # String being iterated over
+       var target: U16String
+
+       # Code point position of the current item
+       var curr_pos: Int
+
+       redef fun index do return curr_pos
+
+       redef fun is_ok do
+               return target.length > curr_pos
+       end
+
+       redef fun item do
+               return target[curr_pos]
+       end
+
+       redef fun next do
+               curr_pos = curr_pos + 1
+       end
+end
+
+# Backward iterator over the code points of a `U16String`
+private class U16StringCharReverseIterator
+       super IndexedIterator[Char]
+
+       # String being iterated over
+       var target: U16String
+
+       # Code point position of the current item
+       var curr_pos: Int
+
+       redef fun index do return curr_pos
+
+       redef fun is_ok do
+               return not curr_pos < 0
+       end
+
+       redef fun item do
+               return target[curr_pos]
+       end
+
+       redef fun next do
+               curr_pos = curr_pos - 1
+       end
+end
+
+# View of a `U16String` as a sequence of code points
+private class U16StringCharView
+       super StringCharView
+
+       redef type SELFTYPE: U16String
+
+       redef fun iterator_from(start) do
+               return new U16StringCharIterator(target, start)
+       end
+
+       redef fun reverse_iterator_from(start) do
+               return new U16StringCharReverseIterator(target, start)
+       end
+
+       redef fun [](index) do
+               return target[index]
+       end
+end
+
+redef class String
+       # Returns a UTF-16 encoded version of `self`
+       fun to_u16string: U16String do return new U16String.from_string(self)
+
+       # Returns the number of UTF-16 code units in `self`
+       #
+       # Code points above 0xFFFF count for two code units, all the others for one.
+       fun u16_length: Int do
+               var n = 0
+               for c in chars do
+                       # `code_point` is the Unicode value of `c`
+                       if c.code_point > 0xFFFF then n += 2 else n += 1
+               end
+               return n
+       end
+end
+
+redef class CString
+       # Builds a `CString` holding the C `NULL` pointer
+       new nul `{
+               return NULL;
+       `}
+end