core :: u16_string
This module is meant to ease the use of complex string operations provided by the ICU library.
The module provides a wrapper for ICU's string structure : UChar *
as well as conversion functions to/from String
core :: union_find
union–find algorithm using an efficient disjoint-set data structure
# Wrapper of ICU's UTF-16 encoded strings and conversion
# This module eases the use of the complex string operations provided by the ICU library.
# It provides a wrapper for ICU's string structure, `UChar *`, as well as conversion functions to and from `String`.
module u16_string is pkgconfig ("icu-io", "icu-i18n", "icu-uc")
intrude import abstract_text
import core
`{
#include <unicode/utypes.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
`}
# UTF-16 encoded string
#
# Wraps a `UCharString` (ICU's `UChar *`) together with the number of code
# units allocated to it (`capacity`) and the number actually in use (`code_units`).
class U16String
	super Finalizable
	super Text

	# Pointer to a `UChar *` string
	private var uchar_string: UCharString

	# Number of code units (aka UTF-16 encoded code units or `UChar`) allocated to `uchar_string`
	private var capacity = 0

	# Number of code units actually in `uchar_string`.
	#
	# `code_units` <= `capacity`.
	private var code_units = 0

	# Number of code points in `self`, as reported by `UCharString::code_points`
	redef fun length: Int do return uchar_string.code_points(code_units)

	# Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
	#
	# The `cap` argument is the number of code units (aka UTF-16 encoded characters or `UChar`) allocated to `uchar_string`.
	# If the number of code units is known in advance, it can be provided with the `units` parameter.
	# `units` is ignored when `cap` is null and must satisfy `0 <= units <= cap` otherwise.
	init (cap: nullable Int, units: nullable Int) do
		if cap == null then
			uchar_string = new UCharString.nul
		else
			assert cap >= 0
			if units != null then
				# A negative count would break the `code_units` invariant
				assert units >= 0 and units <= cap
				code_units = units
			end
			uchar_string = new UCharString.empty(cap)
			capacity = cap
		end
	end

	# Returns a converted `U16String` from a `String`
	init from_string(source: String) do
		var csource = source.to_cstring
		var csource_length = source.byte_length
		# First pass with a NULL destination only asks ICU for the required length
		uchar_string = new UCharString.nul
		var required_length = uchar_string.from_cstring(0, csource, csource_length)
		# Second pass performs the actual conversion in a buffer of the right size
		uchar_string = new UCharString.empty(required_length)
		uchar_string.from_cstring(required_length, csource, csource_length)
		capacity = required_length
		# The length reported by the conversion is the number of code units written,
		# so `code_units` always agrees with `capacity` here.
		code_units = required_length
	end

	# Copies the characters of `source` to `self`.
	#
	# A maximum of `self.capacity` code units will be copied to `self`.
	# If a code point >0xFFFF has to be divided, it will not be copied.
	fun copy_from(source: String) do
		uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)
		# Count only the code units of the characters that fit entirely in
		# `capacity`, so that `code_units <= capacity` keeps holding and a
		# split surrogate pair is never counted.
		var units = 0
		for c in source.chars do
			var width = 1
			if c.code_point > 0xFFFF then width = 2
			if units + width > capacity then break
			units += width
		end
		code_units = units
	end

	redef fun chars do return new U16StringCharView(self)

	# Returns the character (code point) at position `index`
	#
	# The string is walked from its start because characters outside the BMP
	# take two code units.
	redef fun[](index: Int): Char do
		assert index >= 0 and index < length
		var offset = 0
		var c = '\0'
		for i in [0..index] do
			c = uchar_string.char_at_offset(offset, code_units)
			# Supplementary code points are stored as a surrogate pair
			if c.code_point > 0xFFFF then offset += 2 else offset += 1
		end
		return c
	end

	# Returns a newly allocated UTF-8 `CString` holding the content of `self`
	redef fun to_cstring: CString do
		# First pass with a NULL destination only asks ICU for the required length
		var cself = new CString.nul
		var required_length = uchar_string.to_cstring(cself, 0, code_units)
		# One extra byte for the termination character
		cself = new CString(required_length + 1)
		uchar_string.to_cstring(cself, required_length + 1, code_units)
		return cself
	end

	# Returns the number of UTF-8 code units (bytes) in `self`
	redef fun byte_length: Int do
		var offset = 0
		var bytes = 0
		# Single pass over the code units actually in use
		while offset < code_units do
			var cp = uchar_string.char_at_offset(offset, code_units).code_point
			if cp > 0xFFFF then offset += 2 else offset += 1

			if cp <= 0x7F then
				bytes += 1
			else if cp <= 0x7FF then
				bytes += 2
			else if cp <= 0xFFFF then
				# Whole range U+0800..U+FFFF takes three bytes in UTF-8.
				# NOTE(review): unpaired surrogates are counted here too; confirm how the ICU conversion treats them.
				bytes += 3
			else
				bytes += 4
			end
		end
		return bytes
	end

	redef fun to_s: String do return to_cstring.to_s_with_length(byte_length)

	# Releases the underlying `UChar *` buffer
	redef fun finalize do uchar_string.free
end
# ICU string `UChar *` which are UTF-16 strings
extern class UCharString `{ UChar *`}

	# Returns an empty (zero-filled) `UCharString` of length `length`
	#
	# Returns a `NULL` pointer if the allocation fails.
	new empty (length: Int) `{
		UChar * str = (UChar *)malloc(sizeof(UChar) * length);
		/* Only clear the buffer when the allocation succeeded */
		if (str != NULL) {
			u_memset(str, 0, length);
		}
		return str;
	`}

	# Returns a `NULL` `UCharString`
	new nul `{ return NULL; `}

	# Returns the number of code points up to `code_units` characters
	#
	# Returns -1 when `self` is `NULL`.
	fun code_points(code_units: Int): Int `{
		if (self == NULL) {
			return -1;
		}
		return u_countChar32(self, code_units);
	`}

	# Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
	#
	# `dest_length` is the capacity of `self` in code units and `source_length`
	# the number of bytes of `source` to convert.
	# The ICU error code is not reported; 0 is returned when ICU does not
	# write a length.
	fun from_cstring(dest_length: Int, source: CString, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Initialised so that a conversion which does not set the length yields 0, not garbage */
		int32_t res = 0;
		u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
		return res;
	`}

	# Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
	#
	# `dest_length` is the capacity of `dest` in bytes and `source_length` the
	# number of code units of `self` to convert.
	# The ICU error code is not reported; 0 is returned when ICU does not
	# write a length.
	fun to_cstring(dest: CString, dest_length: Int, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Initialised so that a conversion which does not set the length yields 0, not garbage */
		int32_t res = 0;
		u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
		return res;
	`}

	# Get code point at code unit `offset`
	#
	# `code_units` is the number of code units in `self`.
	# Returns the NUL character when `self` is `NULL` or `offset` is out of bounds.
	fun char_at_offset(offset: Int, code_units: Int): Char `{
		UChar32 c = 0;
		/* Never read outside of the code units in use */
		if (self == NULL || offset < 0 || offset >= code_units) {
			return c;
		}
		U16_NEXT(self, offset, code_units, c);
		return c;
	`}
end
# Forward iterator over the characters of a `U16String`
private class U16StringCharIterator
	super IndexedIterator[Char]

	# String being iterated over
	var target: U16String

	# Index of the current character in `target`
	var curr_pos: Int

	# True while the current position is before the end of `target`
	redef fun is_ok do
		return target.length > curr_pos
	end

	# Character of `target` at the current position
	redef fun item do
		return target[curr_pos]
	end

	# Moves one character forward
	redef fun next do
		curr_pos = curr_pos + 1
	end

	redef fun index do
		return curr_pos
	end
end
# Backward iterator over the characters of a `U16String`
private class U16StringCharReverseIterator
	super IndexedIterator[Char]

	# String being iterated over
	var target: U16String

	# Index of the current character in `target`
	var curr_pos: Int

	# True while the current position has not gone below the first character
	redef fun is_ok do
		return not curr_pos < 0
	end

	# Character of `target` at the current position
	redef fun item do
		return target[curr_pos]
	end

	# Moves one character backward
	redef fun next do
		curr_pos = curr_pos - 1
	end

	redef fun index do
		return curr_pos
	end
end
# View of a `U16String` as a sequence of characters
private class U16StringCharView
	super StringCharView

	redef type SELFTYPE: U16String

	# Character at position `index`, delegated to the viewed string
	redef fun [](index) do
		return target[index]
	end

	# Forward iterator positioned on `start`
	redef fun iterator_from(start) do
		var forward = new U16StringCharIterator(target, start)
		return forward
	end

	# Backward iterator positioned on `start`
	redef fun reverse_iterator_from(start) do
		var backward = new U16StringCharReverseIterator(target, start)
		return backward
	end
end
redef class String
	# Returns a UTF-16 encoded version of `self`
	fun to_u16string: U16String do return new U16String.from_string(self)

	# Returns the number of UTF-16 code units in `self`
	#
	# Characters above U+FFFF count for two code units (a surrogate pair),
	# every other character counts for one.
	fun u16_length: Int do
		var n = 0
		for c in chars do
			# `code_point` is the Unicode value of the character; `to_i` is
			# understood to be its digit value, which is not what is needed here.
			if c.code_point > 0xFFFF then n += 2 else n += 1
		end
		return n
	end
end
redef class CString
	# Builds a `CString` wrapping the `NULL` pointer
	new nul `{
		return NULL;
	`}
end
lib/core/text/u16_string.nit:15,1--250,3