Wrapper of ICU's UTF-16 encoded strings and conversion

This module is meant to ease the use of complex string operations provided by the ICU library. The module provides a wrapper for ICU's string structure, UChar *, as well as conversion functions to and from String.

Introduced classes

class U16String

core :: U16String

UTF-16 encoded string

extern class UCharString

core :: UCharString

ICU string UChar * which are UTF-16 strings

Redefined classes

redef extern class CString

core :: u16_string $ CString

C string char *

redef abstract class String

core :: u16_string $ String

Immutable sequence of characters.

All class definitions

redef extern class CString

core :: u16_string $ CString

C string char *

redef abstract class String

core :: u16_string $ String

Immutable sequence of characters.

class U16String

core $ U16String

UTF-16 encoded string

extern class UCharString

core $ UCharString

ICU string UChar * which are UTF-16 strings

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.

module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters

module array

core :: array

This module introduces the standard array structure.

module bitset

core :: bitset

Services to handle BitSet

module bytes

core :: bytes

Services for byte streams and arrays

module circular_array

core :: circular_array

Efficient data structure to access both ends of the sequence.

module codec_base

core :: codec_base

Base for codecs to use with streams

module codecs

core :: codecs

Group module for all codec-related manipulations

module collection

core :: collection

This module defines several collection classes.

module environ

core :: environ

Access to the environment variables of the process

module error

core :: error

Standard error-management infrastructure.

module exec

core :: exec

Invocation and management of operating system sub-processes.

module file

core :: file

File manipulations (create, read, write, etc.)

module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision

module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints

module flat

core :: flat

All the array-based text representations

module gc

core :: gc

Access to the Nit internal garbage collection mechanism

module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.

module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O

module kernel

core :: kernel

Most basic classes and methods.

module list

core :: list

This module handles doubly linked lists

module math

core :: math

Mathematical operations

module native

core :: native

Native structures for text and bytes

module numeric

core :: numeric

Advanced services for Numeric types

module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers

module range

core :: range

Module for range of discrete objects.

module re

core :: re

Regular expression support for all services based on Pattern

module ropes

core :: ropes

Tree-based representation of a String.

module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.

module stream

core :: stream

Input and output streams of characters

module string_search

core :: string_search

Basic string search, match and replace.

module text

core :: text

All the classes and methods related to the manipulation of text entities

module time

core :: time

Management of time and dates

module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure

module utf8

core :: utf8

Codec for UTF-8 I/O

Parents

module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.

Children

module a_star-m

a_star-m

# Wrapper of ICU's UTF-16 encoded strings and conversion
# This module is meant to ease the use of complex string operations provided by the ICU library.
# The module provides a wrapper for ICU's string structure : `UChar *` as well as conversion functions to/from `String`
module u16_string is pkgconfig ("icu-io", "icu-i18n", "icu-uc")

intrude import abstract_text
import core

`{
	#include <unicode/utypes.h>
	#include <unicode/ustring.h>
	#include <unicode/utf16.h>
`}


# UTF-16 encoded string
#
# Wraps a native `UCharString` buffer together with the number of code units
# allocated to it (`capacity`) and the number of code units in use (`code_units`).
# The native buffer is released by `finalize`.
class U16String
	super Finalizable
	super Text

	# Pointer to a `UChar *` string
	private var uchar_string: UCharString

	# Number of code units (aka UTF-16 encoded code units or `UChar`) allocated to `uchar_string`
	private var capacity = 0

	# Number of code units actually in `uchar_string`.
	# `code_units` <= `capacity`.
	private var code_units = 0

	# Number of code points in `self` (a surrogate pair counts as one).
	redef fun length: Int do return uchar_string.code_points(code_units)

	# Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
	# The `cap` argument is the number of code units (aka UTF-16 encoded characters or `UChar`) allocated to `uchar_string`.
	# If the number of code units is known in advance, it can be provided with the `units` parameter.
	#
	# `cap` must be non-negative and `units`, when given, must not exceed `cap`.
	# `units` is ignored when `cap` is null.
	init (cap: nullable Int, units: nullable Int) do
		if cap == null then
			uchar_string = new UCharString.nul
		else
			assert cap >= 0

			if not units == null then
				assert units <= cap
				code_units = units
			end

			uchar_string = new UCharString.empty(cap)
			capacity = cap
		end
	end

	# Returns a converted `U16String` from a `String`
	init from_string(source: String) do
		var csource = source.to_cstring
		var csource_length = source.byte_length

		# First pass with a NULL destination of size 0: only asks how many
		# code units the conversion needs.
		uchar_string = new UCharString.nul
		var required_length = uchar_string.from_cstring(0, csource, csource_length)

		# Second pass: convert into a buffer of exactly the required size.
		uchar_string = new UCharString.empty(required_length)
		uchar_string.from_cstring(required_length, csource, csource_length)

		capacity = required_length
		code_units = source.u16_length
	end

	# Copies the characters of `source` to `self`.
	# A maximum of `self.capacity` code units will be copied to `self`.
	# If a code point >0xFFFF has to be divided, it will not be copied.
	#
	# NOTE(review): the handling of a split surrogate pair is decided by ICU's
	# `u_strFromUTF8`; confirm the sentence above against its documentation.
	fun copy_from(source: String) do
		uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)

		# Never report more code units than the buffer can hold, otherwise
		# later reads would run past the end of `uchar_string`.
		var units = source.u16_length
		if units > capacity then units = capacity
		code_units = units
	end

	redef fun chars do return new U16StringCharView(self)

	# Returns the code point at position `index` (counted in code points).
	#
	# The buffer is walked from its start because a code point may occupy
	# one or two code units.
	redef fun[](index: Int): Char do
		assert index >= 0 and index < length
		var offset = 0
		var c = '\0'

		for i in [0..index] do
			c = uchar_string.char_at_offset(offset, code_units)
			# Code points above 0xFFFF are stored as a surrogate pair
			if c.to_i > 0xFFFF then offset += 2 else offset +=1
		end
		return c
	end

	# Returns a newly allocated UTF-8 `CString` holding the content of `self`.
	redef fun to_cstring: CString do
		# First pass with a NULL destination: only asks for the required length
		var cself = new CString.nul
		var required_length = uchar_string.to_cstring(cself, 0, code_units)

		# One extra byte for the termination character
		cself = new CString(required_length + 1)
		uchar_string.to_cstring(cself, required_length + 1, code_units)

		return cself
	end

	# Returns the number of UTF-8 code units (bytes) in `self`
	#
	# Surrogate code points (0xD800..0xDFFF) and values above 0x10FFFF
	# are counted as 0 bytes.
	redef fun byte_length: Int do
		var offset = 0
		var l = 0

		# Walk the code units once, decoding one code point per iteration
		while offset < code_units do
			var b = uchar_string.char_at_offset(offset, code_units).to_i
			if b > 0xFFFF then offset += 2 else offset += 1

			if b <= 0x7F then
				l += 1
			else if b <= 0x7FF then
				l += 2
			else if (b >= 0xD800 and b <= 0xDFFF) or b > 0x10FFFF then
				l += 0
			else if b <= 0xFFFF then
				l += 3
			else
				l += 4
			end
		end
		return l
	end

	redef fun to_s: String do return to_cstring.to_s_with_length(byte_length)

	# Releases the native buffer.
	redef fun finalize do uchar_string.free
end

# ICU string `UChar *` which are UTF-16 strings
extern class UCharString `{ UChar *`}

	# Returns an empty `UCharString` of length `length`
	#
	# The buffer is zero-filled. The result is `NULL` if the allocation fails.
	new empty (length: Int) `{
		UChar * str = (UChar *)malloc(sizeof(UChar) * length);
		if (str != NULL) {
			u_memset(str, 0, length);
		}
		return str;
	`}

	# Returns a `NULL` `UCharString`
	new nul `{ return NULL; `}

	# Returns the number of code points up to `code_units` characters
	#
	# Returns -1 if `self` is `NULL`.
	fun code_points(code_units: Int): Int `{
		if (self == NULL) {
			return -1;
		}
		return u_countChar32(self, code_units);
	`}

	# Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
	#
	# `dest_length` is the number of code units available in `self` and
	# `source_length` the number of bytes to read from `source`.
	# Returns 0 if ICU does not report a length.
	fun from_cstring(dest_length: Int, source: CString, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Initialised so a failing call never returns an indeterminate value */
		int32_t res = 0;
		u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
		return res;
	`}

	# Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
	#
	# `dest_length` is the number of bytes available in `dest` and
	# `source_length` the number of code units to read from `self`.
	# Returns 0 if ICU does not report a length.
	fun to_cstring(dest: CString, dest_length: Int, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Initialised so a failing call never returns an indeterminate value */
		int32_t res = 0;
		u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
		return res;
	`}

	# Get code point at code unit `offset`
	#
	# `code_units` is the bound given to `U16_NEXT` when reading from `self`.
	fun char_at_offset(offset: Int, code_units: Int): Char `{
		UChar32 c = 0;
		U16_NEXT(self, offset, code_units, c);
		return c;
	`}
end

# Forward iterator over the code points of a `U16String`
private class U16StringCharIterator
	super IndexedIterator[Char]

	# String being iterated
	var target: U16String

	# Index (in code points) of the current item
	var curr_pos: Int

	# There is an item as long as the position is before the end of `target`
	redef fun is_ok do
		return target.length > curr_pos
	end

	redef fun item do
		return target[curr_pos]
	end

	redef fun next do
		curr_pos = curr_pos + 1
	end

	redef fun index do
		return curr_pos
	end
end

# Backward iterator over the code points of a `U16String`
private class U16StringCharReverseIterator
	super IndexedIterator[Char]

	# String being iterated
	var target: U16String

	# Index (in code points) of the current item
	var curr_pos: Int

	# There is an item until the position moves before the first code point
	redef fun is_ok do
		return 0 <= curr_pos
	end

	redef fun item do
		return target[curr_pos]
	end

	redef fun next do
		curr_pos = curr_pos - 1
	end

	redef fun index do
		return curr_pos
	end
end

# View of a `U16String` as a sequence of code points
private class U16StringCharView
	super StringCharView

	redef type SELFTYPE: U16String

	# Delegates to the indexed access of the viewed string
	redef fun [](index) do
		return target[index]
	end

	# Forward iterator starting at code point `start`
	redef fun iterator_from(start) do
		var it = new U16StringCharIterator(target, start)
		return it
	end

	# Backward iterator starting at code point `start`
	redef fun reverse_iterator_from(start) do
		var it = new U16StringCharReverseIterator(target, start)
		return it
	end
end

redef class String
	# Returns a UTF-16 encoded version of `self`
	fun to_u16string: U16String do
		var converted = new U16String.from_string(self)
		return converted
	end

	# Returns the number of UTF-16 code units in `self`
	#
	# Each character counts for one code unit, plus one more when it is
	# above 0xFFFF.
	fun u16_length: Int do
		var units = 0
		for ch in chars do
			units += 1
			if ch.to_i > 0xFFFF then units += 1
		end
		return units
	end
end

redef class CString
	# Returns a `NULL` `char *`, which points to no buffer
	new nul `{
		return NULL;
	`}
end

lib/core/text/u16_string.nit:15,1--250,3

module u16_string

Summary

Wrapper of ICU's UTF-16 encoded strings and conversion

Introduced classes

U16String

UCharString

Redefined classes

CString

String

Wrapper of ICU's UTF-16 encoded strings and conversion

Introduced classes

class U16String

extern class UCharString

Redefined classes

redef extern class CString

redef abstract class String

Summary

All class definitions

CString

String

U16String

UCharString

All class definitions

redef extern class CString

redef abstract class String

class U16String

extern class UCharString

Summary

Ancestors

abstract_collection

abstract_text

array

bitset

bytes

circular_array

codec_base

codecs

collection

environ

error

exec

file

fixed_ints

fixed_ints_text

flat

gc

hash_collection

iso8859_1

kernel

list

math

native

numeric

protocol

queue

range

re

ropes

sorter

stream

string_search

text

time

union_find

utf8

Parents

core

Children

a_star-m

Ancestors

module abstract_collection

module abstract_text

module array

module bitset

module bytes

module circular_array

module codec_base

module codecs

module collection

module environ