1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
11 # Native structures for text and bytes
22 #include <libkern/OSByteOrder.h>
23 #define be32toh(x) OSSwapBigToHostInt32(x)
27 #define be16toh(val) (((val) >> 8) | ((val) << 8))
28 #define be32toh(val) ((be16toh((val) << 16) | (be16toh((val) >> 16))))
31 #define be32toh(val) betoh32(val)
36 # Gives the length of the UTF-8 char starting with `self`
38 if self & 0b1000_0000u
8 == 0u8
then
40 else if self & 0b1110_0000u
8 == 0b1100_0000u
8 then
42 else if self & 0b1111_0000u
8 == 0b1110_0000u
8 then
44 else if self & 0b1111_1000u
8 == 0b1111_0000u
8 then
53 # Returns the code_point from a utf16 surrogate pair
55 # assert 0xD83DDE02.from_utf16_surr == 0x1F602
56 fun from_utf16_surr
: Int do
57 var hi
= (self & 0xFFFF0000) >> 16
58 var lo
= self & 0xFFFF
60 cp
+= (hi
- 0xD800) << 10
67 # Native strings are simple C char *
68 extern class NativeString `{ char* `}
# Creates a new NativeString with a capacity of `length`
70 new(length: Int) is intern
# Returns a char* starting at `index`.
74 # WARNING: Unsafe for extern code, use only for temporary
75 # pointer manipulation purposes (e.g. write to file or such)
76 fun fast_cstring(index: Int): NativeString is intern
# Get char at `index`.
79 fun [](index: Int): Byte is intern
# Set char `item` at index.
82 fun []=(index: Int, item: Byte) is intern
# Copy `self` to `dest`.
85 fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
87 # Position of the first nul character.
88 fun cstring_length: Int
91 while self[l] != 0u8 do l += 1
95 # Parse `self` as an Int.
96 fun atoi: Int is intern
# Parse `self` as a Float.
#
# The conversion is delegated to C `strtod` with no end pointer, which
# yields the same value as C `atof` for any input that can be converted,
# but has defined behaviour when the value cannot be represented
# (C `atof` leaves that case undefined).
fun atof: Float `{
	return strtod(self, NULL);
`}
101 # Gets the UTF-8 char at index `pos`
103 # Index is expressed in Unicode chars
106 # assert "かきく".as(FlatString).items.char_at(0) == 'か'
109 # If the char at position pos is an invalid Unicode char,
110 # the Unicode replacement character � (0xFFFD) will be used.
113 # assert "かきく".as(FlatString).items.char_at(1) == '�'
115 fun char_at
(pos
: Int): Char do
117 if c
& 0x80u
8 == 0u8
then return c
.ascii
118 var b
= fetch_4_hchars
(pos
)
120 if b
& 0xC00000 != 0x800000 then return 0xFFFD.code_point
121 if b
& 0xE0000000 == 0xC0000000 then
122 ret
|= (b
& 0x1F000000) >> 18
123 ret
|= (b
& 0x3F0000) >> 16
124 return ret
.code_point
126 if not b
& 0xC000 == 0x8000 then return 0xFFFD.code_point
127 if b
& 0xF0000000 == 0xE0000000 then
128 ret
|= (b
& 0xF000000) >> 12
129 ret
|= (b
& 0x3F0000) >> 10
130 ret
|= (b
& 0x3F00) >> 8
131 return ret
.code_point
133 if not b
& 0xC0 == 0x80 then return 0xFFFD.code_point
134 if b
& 0xF8000000 == 0xF0000000 then
135 ret
|= (b
.to_i
& 0x7000000) >> 6
136 ret
|= (b
.to_i
& 0x3F0000) >> 4
137 ret
|= (b
.to_i
& 0x3F00) >> 2
139 return ret
.code_point
141 return 0xFFFD.code_point
# Gets the byte index of char at position `n` in UTF-8 String
#
# No cached position is used: the lookup is forwarded to
# `char_to_byte_index_cached` with both `char_from` and `byte_from` at 0.
fun char_to_byte_index(n: Int): Int do
	return char_to_byte_index_cached(n, 0, 0)
end
147 # Gets the length of the character at position `pos` (1 if invalid sequence)
148 fun length_of_char_at
(pos
: Int): Int do
150 if c
& 0x80u
8 == 0x00u
8 then
152 else if c
& 0xE0u
8 == 0xC0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 then
154 else if c
& 0xF0u
8 == 0xE0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 and self[pos
+ 2] & 0xC0u
8 == 0x80u
8 then
156 else if c
& 0xF8u
8 == 0xF0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 and self[pos
+ 2] & 0xC0u
8 == 0x80u
8 and self[pos
+ 3] & 0xC0u
8 == 0x80u
8 then
163 # Gets the byte index of char at position `n` in UTF-8 String
165 # `char_from` and `byte_from` are cached values to seek from.
167 # NOTE: char_from and byte_from are not guaranteed to be valid cache values
# It is up to the client to ensure the validity of the information
169 fun char_to_byte_index_cached
(n
, char_from
, byte_from
: Int): Int do
177 var i
= fetch_4_chars
(ns_i
)
178 if i
& 0x80808080 != 0 then break
183 if dist
== 0 then break
184 ns_i
+= length_of_char_at
(ns_i
)
191 var i
= fetch_4_chars
(ns_i
- 4)
192 if i
& 0x80808080 != 0 then break
197 if dist
== 0 then break
198 ns_i
= find_beginning_of_char_at
(ns_i
- 1)
206 # Gets the char index of byte at position `n` in a UTF-8 String
208 # `char_from` and `byte_from` are cached values to seek from.
210 # NOTE: char_from and byte_from are not guaranteed to be valid cache values
# It is up to the client to ensure the validity of the information
212 fun byte_to_char_index_cached
(n
, char_from
, byte_from
: Int): Int do
217 ns_i
+= length_of_char_at
(ns_i
)
222 ns_i
= find_beginning_of_char_at
(ns_i
- 1)
229 # Returns the beginning position of the char at position `pos`
231 # If the char is invalid UTF-8, `pos` is returned as-is
234 # assert "abc".items.find_beginning_of_char_at(2) == 2
235 # assert "か".items.find_beginning_of_char_at(1) == 0
236 # assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
238 fun find_beginning_of_char_at
(pos
: Int): Int do
241 if c
& 0x80u
8 == 0x00u
8 then return pos
242 while c
& 0xC0u
8 == 0x80u
8 do
247 if length_of_char_at
(stpos
) >= (endpos
- stpos
+ 1) then return pos
251 # Number of UTF-8 characters in `self` starting at `from`, for a length of `bytelen`
252 fun utf8_length
(from
, bytelen
: Int): Int do
256 while bytelen
>= 4 do
257 var i
= fetch_4_chars
(st
)
258 if i
& 0x80808080 != 0 then break
263 if bytelen
== 0 then break
264 var cln
= length_of_char_at
(st
)
# Fetch 4 chars in `self` at `pos`
#
# The 4 bytes starting at byte offset `pos` are returned packed in one Int.
# The Nit-level body is the fallback for the `intern` implementation: it
# forwards to `fetch_4_ffi`, which reads them as one 32-bit word without
# any byte swapping.
fun fetch_4_chars(pos: Int): Int is intern do
	return fetch_4_ffi(pos)
end
# Fetch 4 chars in `self` at `pos`, in big-endian order
#
# The Nit-level body is the fallback for the `intern` implementation: it
# forwards to `fetch_4h_ffi`, which reads a 32-bit word at byte offset `pos`
# and converts it from big-endian to host order, so the byte at `pos`
# occupies the most significant 8 of the 32 bits.
fun fetch_4_hchars(pos: Int): Int is intern do
	return fetch_4h_ffi(pos)
end
# FIXME: To remove when bootstrap supports PR #1898
#
# C fallback used by `fetch_4_chars`: returns the 4 bytes of `self` starting
# at byte offset `pos` as one unsigned 32-bit word in native byte order.
private fun fetch_4_ffi(pos: Int): Int `{
	/* `self + pos` has no alignment guarantee, so the word is assembled
	 * through byte copies instead of dereferencing a casted `uint32_t*`
	 * (which would be a misaligned, aliasing-violating load). Copying
	 * in memory order preserves the native byte order of the result. */
	uint32_t word;
	unsigned char *dst = (unsigned char*)&word;
	const unsigned char *src = (const unsigned char*)(self + pos);
	int i;
	for (i = 0; i < 4; i++) dst[i] = src[i];
	return (long)word;
`}
# C fallback used by `fetch_4_hchars`: returns the 4 bytes of `self` starting
# at byte offset `pos`, interpreted as a big-endian unsigned 32-bit word.
#
# The byte at `pos` ends up in bits 24..31 of the result and the byte at
# `pos + 3` in bits 0..7, whatever the byte order of the host.
private fun fetch_4h_ffi(pos: Int): Int `{
	/* Assemble the value byte by byte: this needs neither an aligned
	 * address nor a platform-specific `be32toh`, and gives the same
	 * result as a correct big-endian-to-host conversion of the word. */
	const unsigned char *p = (const unsigned char*)(self + pos);
	uint32_t word = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| (uint32_t)p[3];
	return (long)word;
`}