From 614bf3afc4ff402a7fa8b755b308ce8b7efa317e Mon Sep 17 00:00:00 2001
From: Ana Daouda
Date: Fri, 16 Aug 2019 16:17:03 -0400
Subject: [PATCH] lib/core/text: Wrapper of ICU's UTF-16 encoded strings and conversion

Signed-off-by: Ana Daouda
---
 lib/core/text/u16_string.nit |  290 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 lib/core/text/u16_string.nit

diff --git a/lib/core/text/u16_string.nit b/lib/core/text/u16_string.nit
new file mode 100644
index 0000000..e9a9986
--- /dev/null
+++ b/lib/core/text/u16_string.nit
@@ -0,0 +1,290 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Wrapper of ICU's UTF-16 encoded strings and conversion
+#
+# This module is meant to ease the use of complex string operations provided by the ICU library.
+# The module provides a wrapper for ICU's string structure, `UChar *`, as well as conversion functions to/from `String`.
+module u16_string is pkgconfig ("icu-io", "icu-i18n", "icu-uc")
+
+intrude import abstract_text
+import core
+
+`{
+	#include <stdlib.h>
+	#include <unicode/ustring.h>
+	#include <unicode/utf16.h>
+`}
+
+# UTF-16 encoded string
+#
+# Wraps a native ICU `UChar *` buffer together with its allocated size
+# (`capacity`) and the number of code units in use (`code_units`).
+class U16String
+	super Finalizable
+	super Text
+
+	# Pointer to a `UChar *` string
+	private var uchar_string: UCharString
+
+	# Number of code units (aka `UChar`) allocated to `uchar_string`
+	private var capacity = 0
+
+	# Number of code units actually in `uchar_string`.
+	# `code_units` <= `capacity`.
+	private var code_units = 0
+
+	# Number of code points in `self`
+	#
+	# An empty or NULL `U16String` has a length of 0.
+	redef fun length: Int do
+		if code_units == 0 then return 0
+		return uchar_string.code_points(code_units)
+	end
+
+	# Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
+	#
+	# The `cap` argument is the number of code units (aka `UChar`) allocated to `uchar_string`.
+	# If the number of code units is known in advance, it can be provided with the `units` parameter,
+	# which must satisfy `0 <= units <= cap`. `units` is ignored when `cap` is null.
+	init (cap: nullable Int, units: nullable Int) do
+		if cap == null then
+			uchar_string = new UCharString.nul
+		else
+			assert cap >= 0
+
+			if units != null then
+				assert units >= 0 and units <= cap
+				code_units = units
+			end
+
+			uchar_string = new UCharString.empty(cap)
+			capacity = cap
+		end
+	end
+
+	# Returns a converted `U16String` from a `String`
+	init from_string(source: String) do
+		var csource = source.to_cstring
+		var csource_length = source.byte_length
+
+		# Preflight: a NULL destination of capacity 0 only computes the required length.
+		uchar_string = new UCharString.nul
+		var required_length = uchar_string.from_cstring(0, csource, csource_length)
+
+		uchar_string = new UCharString.empty(required_length)
+		uchar_string.from_cstring(required_length, csource, csource_length)
+
+		capacity = required_length
+		# Use the length reported by the conversion itself instead of recounting `source`.
+		code_units = required_length
+	end
+
+	# Copies the characters of `source` to `self`.
+	#
+	# A maximum of `self.capacity` code units will be copied to `self`.
+	# If a code point >0xFFFF has to be divided, it will not be copied.
+	fun copy_from(source: String) do
+		var required = uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)
+
+		# `required` is the UTF-16 length of the whole `source`; at most `capacity` units fit.
+		var units = required
+		if required > capacity then
+			units = capacity
+			# Truncated copy: do not count a lead surrogate left without its trail surrogate.
+			if units > 0 then
+				var last = uchar_string.char_at_offset(units - 1, units).to_i
+				if last >= 0xD800 and last <= 0xDBFF then units -= 1
+			end
+		end
+		code_units = units
+	end
+
+	redef fun chars do return new U16StringCharView(self)
+
+	# Returns the code point at position `index`, counted in code points
+	#
+	# The buffer is decoded from its start up to `index`.
+	redef fun [](index: Int): Char do
+		assert index >= 0 and index < length
+		var offset = 0
+		var c = '\0'
+
+		for i in [0..index] do
+			c = uchar_string.char_at_offset(offset, code_units)
+			# Code points >0xFFFF are stored on two code units.
+			if c.to_i > 0xFFFF then offset += 2 else offset += 1
+		end
+		return c
+	end
+
+	# Returns a newly allocated UTF-8 encoded `CString` with the content of `self`
+	redef fun to_cstring: CString do
+		# Preflight with a NULL destination to get the required number of bytes.
+		var cself = new CString.nul
+		var required_length = uchar_string.to_cstring(cself, 0, code_units)
+
+		# One more byte for the termination character.
+		cself = new CString(required_length + 1)
+		uchar_string.to_cstring(cself, required_length + 1, code_units)
+
+		return cself
+	end
+
+	# Returns the number of UTF-8 code units (bytes) in `self`
+	#
+	# The value is the length reported by the UTF-8 conversion used by `to_cstring`.
+	redef fun byte_length: Int do
+		return uchar_string.to_cstring(new CString.nul, 0, code_units)
+	end
+
+	redef fun to_s: String do return to_cstring.to_s_with_length(byte_length)
+
+	# Frees the native `UChar *` buffer
+	redef fun finalize do uchar_string.free
+end
+
+# ICU string `UChar *` which are UTF-16 strings
+extern class UCharString `{ UChar *`}
+
+	# Returns a zero-filled `UCharString` of length `length`
+	#
+	# Returns a `NULL` `UCharString` if the allocation fails.
+	new empty (length: Int) `{
+		return (UChar *)calloc(length, sizeof(UChar));
+	`}
+
+	# Returns a `NULL` `UCharString`
+	new nul `{ return NULL; `}
+
+	# Returns the number of code points up to `code_units` characters, or -1 if `self` is `NULL`
+	fun code_points(code_units: Int): Int `{
+		if (self == NULL) {
+			return -1;
+		}
+		return u_countChar32(self, code_units);
+	`}
+
+	# Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
+	#
+	# `self` must be able to hold `dest_length` code units. The returned length is the one
+	# of the whole conversion and may exceed `dest_length`.
+	# Returns 0 if the conversion fails for any reason other than a too small `self`.
+	fun from_cstring(dest_length: Int, source: CString, source_length: Int): Int `{
+		UErrorCode error = U_ZERO_ERROR;
+		int32_t res = 0;
+		u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
+		/* A buffer overflow is expected when only asking for the required length. */
+		if (U_FAILURE(error) && error != U_BUFFER_OVERFLOW_ERROR) {
+			return 0;
+		}
+		return res;
+	`}
+
+	# Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
+	#
+	# Returns 0, and makes `dest` an empty C string if it has room for it,
+	# if the conversion fails for any reason other than a too small `dest`.
+	fun to_cstring(dest: CString, dest_length: Int, source_length: Int): Int `{
+		UErrorCode error = U_ZERO_ERROR;
+		int32_t res = 0;
+		u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
+		if (U_FAILURE(error) && error != U_BUFFER_OVERFLOW_ERROR) {
+			if (dest != NULL && dest_length > 0) {
+				dest[0] = '\0';
+			}
+			return 0;
+		}
+		return res;
+	`}
+
+	# Get code point at code unit `offset`
+	#
+	# `code_units` is the number of code units in `self`.
+	fun char_at_offset(offset: Int, code_units: Int): Char `{
+		UChar32 c = 0;
+		U16_NEXT(self, offset, code_units, c);
+		return c;
+	`}
+end
+
+# Iterator over the code points of a `U16String`, in order
+private class U16StringCharIterator
+	super IndexedIterator[Char]
+
+	# Iterated string
+	var target: U16String
+
+	# Index, in code points, of the current item
+	var curr_pos: Int
+
+	redef fun is_ok do return curr_pos < target.length
+
+	redef fun item do return target[curr_pos]
+
+	redef fun next do curr_pos += 1
+
+	redef fun index do return curr_pos
+end
+
+# Iterator over the code points of a `U16String`, in reverse order
+private class U16StringCharReverseIterator
+	super IndexedIterator[Char]
+
+	# Iterated string
+	var target: U16String
+
+	# Index, in code points, of the current item
+	var curr_pos: Int
+
+	redef fun is_ok do return curr_pos >= 0
+
+	redef fun item do return target[curr_pos]
+
+	redef fun next do curr_pos -= 1
+
+	redef fun index do return curr_pos
+end
+
+# View of a `U16String` as a sequence of `Char`
+private class U16StringCharView
+	super StringCharView
+
+	redef type SELFTYPE: U16String
+
+	redef fun [](index) do return target[index]
+
+	redef fun iterator_from(start) do return new U16StringCharIterator(target, start)
+
+	redef fun reverse_iterator_from(start) do return new U16StringCharReverseIterator(target, start)
+end
+
+redef class String
+	# Returns a UTF-16 encoded version of `self`
+	fun to_u16string: U16String do return new U16String.from_string(self)
+
+	# Returns the number of UTF-16 code units in `self`
+	fun u16_length: Int do
+		var n = 0
+		for c in chars do
+			if c.to_i > 0xFFFF then n += 2 else n += 1
+		end
+		return n
+	end
+end
+
+redef class CString
+	# Returns a `NULL` `char *` (no memory is allocated)
+	new nul `{ return NULL; `}
+end
-- 
1.7.9.5