1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Wrapper of ICU's UTF-16 encoded strings and conversion
16 # This module is meant to ease the use of complex string operations provided by the ICU library.
17 # The module provides a wrapper for ICU's string structure : `UChar *` as well as conversion functions to/from `String`
18 module u16_string
is pkgconfig
("icu-io", "icu-i18n", "icu-uc")
20 intrude import abstract_text
24 #include <unicode/utypes.h>
25 #include <unicode/ustring.h>
26 #include <unicode/utf16.h>
30 # UTF-16 encoded string
35 # Pointer to a `UChar *` string
36 private var uchar_string
: UCharString
38 # Number of code units (aka UTF-16 encoded code units or `UChar`) allocated to `uchar_string`
39 private var capacity
= 0
41 # Number of code units actually in `uchar_string`.
42 # `code_units` <= `capacity`.
43 private var code_units
= 0
# Number of code points in `self`, counted over the `code_units` currently in use
redef fun length: Int
do
	var used_units = code_units
	return uchar_string.code_points(used_units)
end
47 # Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
48 # The `cap` argument is the number of code units (aka UTF-16 encoded characters or `UChar`) allocated to `uchar_string`.
49 # If the number of code units is known in advance, it can be provided with the `units` parameter.
50 init (cap
: nullable Int, units
: nullable Int) do
52 uchar_string
= new UCharString.nul
56 if not units
== null then
61 uchar_string
= new UCharString.empty
(cap
)
# Builds a `U16String` holding the UTF-16 conversion of `source`
init from_string(source: String)
do
	var utf8 = source.to_cstring
	var utf8_length = source.byte_length

	# First pass on a NULL buffer of size 0: only asks for the required length
	uchar_string = new UCharString.nul
	var needed = uchar_string.from_cstring(0, utf8, utf8_length)

	# Second pass: allocate exactly `needed` code units and do the conversion
	uchar_string = new UCharString.empty(needed)
	uchar_string.from_cstring(needed, utf8, utf8_length)

	capacity = needed
	code_units = source.u16_length
end
# Copies the characters of `source` to `self`.
#
# A maximum of `self.capacity` code units will be copied to `self`.
# A code point >0xFFFF takes two code units; if only one of them would fit,
# the code point is not counted as part of `self`.
fun copy_from(source: String) do
	uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)

	# Count only the whole code points that fit in `capacity`.
	# Using `source.u16_length` directly would make `code_units` exceed
	# `capacity` whenever `source` is longer than the buffer, breaking the
	# `code_units <= capacity` invariant and letting later reads overrun it.
	var copied = 0
	for c in source.chars do
		var width = 1
		if c.to_i > 0xFFFF then width = 2
		if copied + width > capacity then break
		copied += width
	end
	code_units = copied
end
# View of `self` as a sequence of `Char`, provided by a `U16StringCharView`
redef fun chars
do
	return new U16StringCharView(self)
end
92 redef fun[](index
: Int): Char do
93 assert index
>= 0 and index
< length
97 for i
in [0..index
] do
98 c
= uchar_string
.char_at_offset
(offset
, code_units
)
99 if c
.to_i
> 0xFFFF then offset
+= 2 else offset
+=1
104 redef fun to_cstring
: CString do
105 var cself
= new CString.nul
106 var required_length
= uchar_string
.to_cstring
(cself
, 0, code_units
)
108 cself
= new CString(required_length
+ 1)
109 uchar_string
.to_cstring
(cself
, required_length
+ 1, code_units
)
114 # Returns the number of UTF-8 code units (bytes) in `self`
115 redef fun byte_length
: Int do
121 c
= uchar_string
.char_at_offset
(offset
, code_units
)
123 if b
> 0xFFFF then offset
+= 2 else offset
+=1
127 else if b
<= 0x7FF then
129 else if b
<= 0xD7FF or b
> 0x10FFFF then
131 else if b
<= 0xFFFF then
# Converts `self` to a `String` through its `CString` form and `byte_length`
redef fun to_s: String
do
	var utf8 = to_cstring
	return utf8.to_s_with_length(byte_length)
end
# Frees the wrapped `uchar_string`
redef fun finalize
do
	uchar_string.free
end
# ICU strings (`UChar *`), which are UTF-16 encoded
145 extern class UCharString `{ UChar *`}
# Returns a zero-filled `UCharString` able to hold `length` code units
148 new empty (length: Int) `{
149 UChar * str
= (UChar *)malloc
(sizeof
(UChar) * length
);
150 u_memset
(str
, 0, length
);
# Builds a `UCharString` wrapping the `NULL` pointer
new nul `{
	return NULL;
`}
# Returns the number of code points in the first `code_units` code units of `self`
158 fun code_points
(code_units
: Int): Int `{
162 return u_countChar32(self, code_units);
165 # Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
166 fun from_cstring
(dest_length
: Int, source
: CString, source_length
: Int): Int `{
167 UErrorCode error = U_ZERO_ERROR;
169 u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
173 # Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
174 fun to_cstring
(dest
: CString, dest_length
: Int, source_length
: Int): Int `{
175 UErrorCode error = U_ZERO_ERROR;
177 u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
181 # Get code point at code unit `offset`
182 fun char_at_offset
(offset
: Int, code_units
: Int): Char `{
184 U16_NEXT(self, offset, code_units, c);
189 private class U16StringCharIterator
190 super IndexedIterator[Char]
192 var target
: U16String
# True while `curr_pos` is below the length of `target`
redef fun is_ok
do
	return curr_pos < target.length
end
# Character of `target` at the current position
redef fun item
do
	return target[curr_pos]
end
# Moves one position forward
redef fun next
do
	curr_pos += 1
end
# Current position of the iterator in `target`
redef fun index
do
	return curr_pos
end
205 private class U16StringCharReverseIterator
206 super IndexedIterator[Char]
208 var target
: U16String
# True while `curr_pos` has not gone below the first position
redef fun is_ok
do
	return curr_pos >= 0
end
# Character of `target` at the current position
redef fun item
do
	return target[curr_pos]
end
# Moves one position backward
redef fun next
do
	curr_pos -= 1
end
# Current position of the iterator in `target`
redef fun index
do
	return curr_pos
end
221 private class U16StringCharView
224 redef type SELFTYPE: U16String
# Character at `index`, delegated to `target`
redef fun [](index)
do
	return target[index]
end
# Forward iterator over `target`, beginning at position `start`
redef fun iterator_from(start)
do
	return new U16StringCharIterator(target, start)
end
# Backward iterator over `target`, beginning at position `start`
redef fun reverse_iterator_from(start)
do
	return new U16StringCharReverseIterator(target, start)
end
# Builds a `U16String` from `self` (UTF-16 encoded version of `self`)
fun to_u16string: U16String
do
	return new U16String.from_string(self)
end
237 # Returns the number of UTF-16 code units in `self`
238 fun u16_length
: Int do
241 if c
.to_i
> 0xFFFF then n
+= 2 else n
+= 1
# Builds an instance wrapping a null `char *`
new nul `{
	return NULL;
`}