1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
11 # Native structures for text and bytes
18 # Gives the length of the UTF-8 char starting with `self`
20 if self & 0b1000_0000u
8 == 0u8
then
22 else if self & 0b1110_0000u
8 == 0b1100_0000u
8 then
24 else if self & 0b1111_0000u
8 == 0b1110_0000u
8 then
26 else if self & 0b1111_1000u
8 == 0b1111_0000u
8 then
34 # Native strings are simple C char *
35 extern class NativeString `{ char* `}
36 # Creates a new NativeString with a capacity of `length`
37 new(length: Int) is intern
39 # Returns a char* starting at `index`.
41 # WARNING: Unsafe for extern code, use only for temporary
42 # pointer manipulation purposes (e.g. write to file or such)
43 fun fast_cstring(index: Int): NativeString is intern
45 # Get char at `index`.
46 fun [](index: Int): Byte is intern
48 # Set char `item` at `index`.
49 fun []=(index: Int, item: Byte) is intern
51 # Copy `self` to `dest`.
52 fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
54 # Position of the first nul character.
#
# The index `l` is advanced one byte at a time until `self[l]` is a zero byte.
# NOTE(review): the scan has no upper bound, so `self` is assumed to be
# nul-terminated; confirm that callers guarantee this.
55 fun cstring_length: Int
58 while self[l] != 0u8 do l += 1
62 # Parse `self` as an Int.
63 fun atoi: Int is intern
# Converts the C string held by `self` to a Float via the C library's `atof`.
fun atof: Float `{
	return atof(self);
`}
68 # Gets the UTF-8 char at index `pos`
70 # Index is expressed in Unicode chars
# NOTE(review): the body reads `self[pos + 1]` .. `self[pos + 3]` as the
# continuation bytes of the sequence, which treats `pos` as a byte offset
# rather than a char index; confirm which unit callers are expected to pass.
73 # assert "かきく".as(FlatString).items.char_at(0) == 'か'
76 # If the char at position pos is an invalid Unicode char,
77 # the Unicode replacement character � (0xFFFD) will be used.
80 # assert "かきく".as(FlatString).items.char_at(1) == '�'
82 fun char_at
(pos
: Int): Char `{
/* `c` is the byte being classified (presumably the byte at `pos`). */
/* 0xxxxxxx: single-byte sequence, the byte itself is the code point. */
84 if((c & 0x80) == 0x00) return (uint32_t)c;
/* 110xxxxx 10xxxxxx: 5 payload bits from the lead byte, 6 from the next byte. */
85 if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
/* 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 payload bits. */
86 if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 3 + 6 + 6 + 6 payload bits. */
/* The && chain short-circuits, so a following byte is only read once all earlier checks passed. */
87 if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
# Byte index of the char at position `n` in a UTF-8 string.
#
# Delegates to `char_to_byte_index_cached`, seeking from char 0 / byte 0.
fun char_to_byte_index(n: Int): Int
do
	return char_to_byte_index_cached(n, 0, 0)
end
94 # Gets the length of the character at position `pos` (1 if invalid sequence)
#
# The branches test `c` (presumably the byte at `pos`) against the UTF-8
# lead-byte patterns; each multi-byte branch additionally requires the
# following bytes to be continuation bytes (`10xxxxxx`):
#
# * `0xxxxxxx`
# * `110xxxxx` followed by 1 continuation byte
# * `1110xxxx` followed by 2 continuation bytes
# * `11110xxx` followed by 3 continuation bytes
#
# `pos` is used directly as an index into `self` (`self[pos + 1]`, ...),
# so it is a byte position.
95 fun length_of_char_at
(pos
: Int): Int do
97 if c
& 0x80u
8 == 0x00u
8 then
99 else if c
& 0xE0u
8 == 0xC0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 then
101 else if c
& 0xF0u
8 == 0xE0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 and self[pos
+ 2] & 0xC0u
8 == 0x80u
8 then
103 else if c
& 0xF8u
8 == 0xF0u
8 and self[pos
+ 1] & 0xC0u
8 == 0x80u
8 and self[pos
+ 2] & 0xC0u
8 == 0x80u
8 and self[pos
+ 3] & 0xC0u
8 == 0x80u
8 then
110 # Gets the byte index of char at position `n` in UTF-8 String
112 # `char_from` and `byte_from` are cached values to seek from.
114 # NOTE: char_from and byte_from are not guaranteed to be valid cache values
115 # It is up to the client to ensure the validity of the information
#
# The byte cursor `ns_i` is only ever moved by whole characters, using
# `length_of_char_at` to go forward and `find_beginning_of_char_at` to go back.
116 fun char_to_byte_index_cached
(n
, char_from
, byte_from
: Int): Int do
# Step forward over the character that starts at `ns_i`.
121 ns_i
+= length_of_char_at
(ns_i
)
# Step back to the beginning of the character containing byte `ns_i - 1`.
126 ns_i
= find_beginning_of_char_at
(ns_i
- 1)
133 # Gets the char index of byte at position `n` in a UTF-8 String
135 # `char_from` and `byte_from` are cached values to seek from.
137 # NOTE: char_from and byte_from are not guaranteed to be valid cache values
138 # It is up to the client to ensure the validity of the information
#
# The byte cursor `ns_i` is only ever moved by whole characters, using
# `length_of_char_at` to go forward and `find_beginning_of_char_at` to go back.
139 fun byte_to_char_index_cached
(n
, char_from
, byte_from
: Int): Int do
# Step forward over the character that starts at `ns_i`.
144 ns_i
+= length_of_char_at
(ns_i
)
# Step back to the beginning of the character containing byte `ns_i - 1`.
149 ns_i
= find_beginning_of_char_at
(ns_i
- 1)
156 # Returns the beginning position of the char at position `pos`
158 # If the char is invalid UTF-8, `pos` is returned as-is
161 # assert "abc".items.find_beginning_of_char_at(2) == 2
162 # assert "か".items.find_beginning_of_char_at(1) == 0
163 # assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
165 fun find_beginning_of_char_at
(pos
: Int): Int do
# Loop for as long as `c` is a UTF-8 continuation byte (`10xxxxxx`).
168 while c
& 0xC0u
8 == 0x80u
8 do
# Accept the candidate start `stpos` only when the character beginning there
# is long enough to cover every byte from `stpos` through `endpos`.
173 if length_of_char_at
(stpos
) >= (endpos
- stpos
+ 1) then return pos
177 # Number of UTF-8 characters in `self` between positions `from` and `to`
178 fun utf8_length
(from
, to
: Int): Int do
183 st
+= length_of_char_at
(st
)