lib/core/text: Wrapper of ICU's UTF-16 encoded strings and conversion
[nit.git] / lib / core / text / u16_string.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
# Wrapper of ICU's UTF-16 encoded strings and conversion
#
# This module is meant to ease the use of the complex string operations provided by the ICU library.
# It provides a wrapper for ICU's string structure, `UChar *`, as well as conversion functions to and from `String`.
18 module u16_string is pkgconfig ("icu-io", "icu-i18n", "icu-uc")
19
20 intrude import abstract_text
21 import core
22
23 `{
24 #include <unicode/utypes.h>
25 #include <unicode/ustring.h>
26 #include <unicode/utf16.h>
27 `}
28
29
# UTF-16 encoded string
#
# Wraps an ICU `UChar *` buffer (`uchar_string`) together with the number of
# code units allocated to it (`capacity`) and the number of code units in use
# (`code_units`). The buffer is released by `finalize`.
class U16String
	super Finalizable
	super Text

	# Pointer to a `UChar *` string
	private var uchar_string: UCharString

	# Number of code units (aka UTF-16 encoded code units or `UChar`) allocated to `uchar_string`
	private var capacity = 0

	# Number of code units actually in `uchar_string`.
	# `code_units` <= `capacity`.
	private var code_units = 0

	# Number of code points in the first `code_units` code units of `uchar_string`
	redef fun length: Int do return uchar_string.code_points(code_units)

	# Returns an empty `U16String` of capacity `cap` or a NULL `U16String` if no `cap` parameter is provided.
	# The `cap` argument is the number of code units (aka UTF-16 encoded characters or `UChar`) allocated to `uchar_string`.
	# If the number of code units is known in advance, it can be provided with the `units` parameter.
	#
	# `cap` must not be negative and `units` must be in `[0..cap]`.
	init (cap: nullable Int, units: nullable Int) do
		if cap == null then
			uchar_string = new UCharString.nul
		else
			assert cap >= 0

			if units != null then
				# A negative count can never describe the content of the buffer
				assert units >= 0 and units <= cap
				code_units = units
			end

			uchar_string = new UCharString.empty(cap)
			capacity = cap
		end
	end

	# Returns a converted `U16String` from a `String`
	init from_string(source: String) do
		var csource = source.to_cstring
		var csource_length = source.byte_length

		# First pass on a NULL destination of length 0: only asks for the required size
		uchar_string = new UCharString.nul
		var required_length = uchar_string.from_cstring(0, csource, csource_length)

		# Second pass: the actual conversion into a buffer of exactly that size
		uchar_string = new UCharString.empty(required_length)
		uchar_string.from_cstring(required_length, csource, csource_length)

		capacity = required_length
		code_units = source.u16_length
	end

	# Copies the characters of `source` to `self`.
	# A maximum of `self.capacity` code units will be copied to `self`.
	# If a code point >0xFFFF has to be divided, it will not be copied.
	fun copy_from(source: String) do
		uchar_string.from_cstring(capacity, source.to_cstring, source.byte_length)

		var units = source.u16_length
		if units > capacity then
			# `source` did not fit: only `capacity` code units can be in the buffer,
			# so keep `code_units` <= `capacity`.
			units = capacity

			# Do not keep the first half of a surrogate pair whose second half did not fit.
			if units > 0 then
				var last = uchar_string.char_at_offset(units - 1, units).to_i
				if last >= 0xD800 and last <= 0xDBFF then units -= 1
			end
		end
		code_units = units
	end

	redef fun chars do return new U16StringCharView(self)

	# Returns the code point at code point position `index`
	#
	# The buffer is walked from its start, since code points take 1 or 2 code units.
	redef fun[](index: Int): Char do
		assert index >= 0 and index < length
		var offset = 0
		var c = '\0'

		for i in [0..index] do
			c = uchar_string.char_at_offset(offset, code_units)
			# Code points above 0xFFFF are stored as 2 code units
			if c.to_i > 0xFFFF then offset += 2 else offset += 1
		end
		return c
	end

	# Returns a newly allocated UTF-8 version of `self`
	redef fun to_cstring: CString do
		# First pass on a NULL destination of length 0: only asks for the required size
		var cself = new CString.nul
		var required_length = uchar_string.to_cstring(cself, 0, code_units)

		# One more byte than required for the termination character
		cself = new CString(required_length + 1)
		uchar_string.to_cstring(cself, required_length + 1, code_units)

		return cself
	end

	# Returns the number of UTF-8 code units (bytes) in `self`
	redef fun byte_length: Int do
		var offset = 0
		var l = 0

		# Walk the buffer once, one code point at a time
		while offset < code_units do
			var b = uchar_string.char_at_offset(offset, code_units).to_i
			if b > 0xFFFF then offset += 2 else offset += 1

			if b <= 0x7F then
				l += 1
			else if b <= 0x7FF then
				l += 2
			else if b <= 0xFFFF then
				# Unpaired surrogates (0xD800..0xDFFF) have no UTF-8 form and count for nothing
				if b < 0xD800 or b > 0xDFFF then l += 3
			else if b <= 0x10FFFF then
				l += 4
			end
		end
		return l
	end

	redef fun to_s: String do return to_cstring.to_s_with_length(byte_length)

	# Releases the memory of `uchar_string`
	redef fun finalize do uchar_string.free
end
143
# ICU string `UChar *` which are UTF-16 strings
extern class UCharString `{ UChar *`}

	# Returns an empty (zero-filled) `UCharString` of length `length`
	#
	# The result is `NULL` if the memory cannot be allocated.
	new empty (length: Int) `{
		/* calloc zero-fills the buffer itself, so nothing is ever written
		 * through the pointer of a failed allocation. */
		return (UChar *)calloc(length, sizeof(UChar));
	`}

	# Returns a `NULL` `UCharString`
	new nul `{ return NULL; `}

	# Returns the number of code points up to `code_units` characters
	#
	# Returns -1 if `self` is `NULL`.
	fun code_points(code_units: Int): Int `{
		if (self == NULL) {
			return -1;
		}
		return u_countChar32(self, code_units);
	`}

	# Converts a `CString` to a `UCharString` and returns the required length of said `UCharString`
	#
	# Returns 0 if ICU did not report any length.
	fun from_cstring(dest_length: Int, source: CString, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Stays 0 when ICU gives up before writing the length */
		int32_t res = 0;
		u_strFromUTF8(self, dest_length, &res, source, source_length, &error);
		return res;
	`}

	# Converts `self` to a `CString` and returns the required length (without the termination character) of said `CString`
	#
	# Returns 0 if ICU did not report any length.
	fun to_cstring(dest: CString, dest_length: Int, source_length: Int): Int `{
		UErrorCode error = U_ZERO_ERROR;
		/* Stays 0 when ICU gives up before writing the length */
		int32_t res = 0;
		u_strToUTF8(dest, dest_length, &res, self, source_length, &error);
		return res;
	`}

	# Get code point at code unit `offset`
	#
	# `code_units` is the number of code units in `self`.
	# Returns `'\0'` if `self` is `NULL` or if `offset` is not in `[0..code_units[`.
	fun char_at_offset(offset: Int, code_units: Int): Char `{
		UChar32 c = 0;
		/* U16_NEXT does no bounds checking on its own */
		if (self == NULL || offset < 0 || offset >= code_units) {
			return c;
		}
		U16_NEXT(self, offset, code_units, c);
		return c;
	`}
end
188
# Forward iterator over the code points of a `U16String`
private class U16StringCharIterator
	super IndexedIterator[Char]

	# String being iterated
	var target: U16String

	# Code point position of the current item
	var curr_pos: Int

	# Position of the current item in `target`
	redef fun index do
		return curr_pos
	end

	# Is the current position before the end of `target`?
	redef fun is_ok do
		return curr_pos < target.length
	end

	# Code point of `target` at the current position
	redef fun item do
		return target[curr_pos]
	end

	# Moves one code point forward
	redef fun next do
		curr_pos += 1
	end
end
204
# Backward iterator over the code points of a `U16String`
private class U16StringCharReverseIterator
	super IndexedIterator[Char]

	# String being iterated
	var target: U16String

	# Code point position of the current item
	var curr_pos: Int

	# Position of the current item in `target`
	redef fun index do
		return curr_pos
	end

	# Is the current position at or after the start of `target`?
	redef fun is_ok do
		return curr_pos >= 0
	end

	# Code point of `target` at the current position
	redef fun item do
		return target[curr_pos]
	end

	# Moves one code point backward
	redef fun next do
		curr_pos -= 1
	end
end
220
# View of a `U16String` as a sequence of code points
private class U16StringCharView
	super StringCharView

	redef type SELFTYPE: U16String

	# Code point of `target` at position `index`
	redef fun [](index) do
		return target[index]
	end

	# Forward iterator on `target` starting at position `start`
	redef fun iterator_from(start) do
		return new U16StringCharIterator(target, start)
	end

	# Backward iterator on `target` starting at position `start`
	redef fun reverse_iterator_from(start) do
		return new U16StringCharReverseIterator(target, start)
	end
end
232
redef class String
	# Returns a UTF-16 encoded version of `self`
	fun to_u16string: U16String do
		return new U16String.from_string(self)
	end

	# Returns the number of UTF-16 code units in `self`
	#
	# Each character above 0xFFFF counts for 2 code units, any other for 1.
	fun u16_length: Int do
		var units = 0
		for ch in chars do
			if ch.to_i <= 0xFFFF then
				units += 1
			else
				units += 2
			end
		end
		return units
	end
end
246
redef class CString
	# Returns a `CString` whose underlying `char *` is `NULL`
	new nul `{
		return NULL;
	`}
end