redef type OTHER: Char
redef fun object_id is intern
+ # Prints `self` on the standard output as a UTF-8 byte sequence
+ #
+ # The number of bytes written (1 to 4) is selected from the numeric value of `self`.
+ # Values of 0x200000 and above do not fit in 4 bytes and are printed as a single `%c`.
+ redef fun output `{
+ 	if(self < 0x80){
+ 		/* 7 bits or less: one byte */
+ 		printf("%c", self);
+ 	}else if(self < 0x800){
+ 		/* up to 11 bits: two bytes */
+ 		printf("%c%c",
+ 			0xC0 | ((self & 0x7C0) >> 6),
+ 			0x80 | (self & 0x3F));
+ 	}else if(self < 0x10000){
+ 		/* up to 16 bits: three bytes */
+ 		printf("%c%c%c",
+ 			0xE0 | ((self & 0xF000) >> 12),
+ 			0x80 | ((self & 0xFC0) >> 6),
+ 			0x80 | (self & 0x3F));
+ 	}else if(self < 0x200000){
+ 		/* up to 21 bits: four bytes */
+ 		printf("%c%c%c%c",
+ 			0xF0 | ((self & 0x1C0000) >> 18),
+ 			0x80 | ((self & 0x3F000) >> 12),
+ 			0x80 | ((self & 0xFC0) >> 6),
+ 			0x80 | (self & 0x3F));
+ 	}else{
+ 		/* Not encodable: bad char */
+ 		printf("%c", self);
+ 	}
+ `}
+ # The hash of a char is the value returned by `ascii`
redef fun hash do return ascii
redef fun ==(o) is intern
redef fun !=(o) is intern
- redef fun output is intern
redef fun <=(i) is intern
redef fun <(i) is intern
end
redef class Char
+
+ # Number of bytes used by `self` once encoded in UTF-8
+ #
+ # The result is derived from `ascii`. Values above 0x10FFFF have no
+ # UTF-8 form and are reported as taking 1 byte.
+ private fun u8char_len: Int do
+ 	var code = self.ascii
+ 	if code <= 0x7F then return 1
+ 	if code <= 0x7FF then return 2
+ 	if code <= 0xFFFF then return 3
+ 	if code <= 0x10FFFF then return 4
+ 	# Out of range: bad character format
+ 	return 1
+ end
+
# assert 'x'.to_s == "x"
- redef fun to_s
- do
- var s = new Buffer.with_cap(1)
- s.chars[0] = self
- return s.to_s
+ redef fun to_s do
+ 	var byte_len = u8char_len
+ 	# One extra byte: `u8char_tos` writes a NUL at index `byte_len`
+ 	var buf = new NativeString(byte_len + 1)
+ 	u8char_tos(buf, byte_len)
+ 	return buf.to_s_with_length(byte_len)
end
+ # Writes the UTF-8 encoding of `self` in the first `len` bytes of `r`, then a NUL byte at `r[len]`
+ #
+ # `len` selects the encoding width (1 to 4); `to_s` passes the value of `u8char_len`.
+ # For any other `len`, only the NUL byte is written.
+ # `r` must have room for `len + 1` bytes, no check is done here.
+ private fun u8char_tos(r: NativeString, len: Int) `{
+ 	if(len == 1){
+ 		r[0] = self;
+ 	}else if(len == 2){
+ 		r[0] = 0xC0 | ((self & 0x7C0) >> 6);
+ 		r[1] = 0x80 | (self & 0x3F);
+ 	}else if(len == 3){
+ 		r[0] = 0xE0 | ((self & 0xF000) >> 12);
+ 		r[1] = 0x80 | ((self & 0xFC0) >> 6);
+ 		r[2] = 0x80 | (self & 0x3F);
+ 	}else if(len == 4){
+ 		r[0] = 0xF0 | ((self & 0x1C0000) >> 18);
+ 		r[1] = 0x80 | ((self & 0x3F000) >> 12);
+ 		r[2] = 0x80 | ((self & 0xFC0) >> 6);
+ 		r[3] = 0x80 | (self & 0x3F);
+ 	}
+ 	/* Terminator goes right after the encoded bytes */
+ 	r[len] = '\0';
+ `}
+
# Returns true if the char is a numerical digit
#
# assert '0'.is_numeric
str.real_items = new_self
return str
end
+
+ # Writes `c`, encoded in UTF-8, into `self` starting at byte `pos`
+ #
+ # As many bytes as `c.u8char_len` are overwritten.
+ #
+ # Very unsafe: no bound is checked, the caller must make sure that
+ # there is enough room for `c` before calling this function.
+ private fun set_char_at(pos: Int, c: Char) do
+ 	native_set_char(pos, c, c.u8char_len)
+ end
+
+ # Stores the `ln` bytes of the UTF-8 encoding of `c` at `self + pos`
+ #
+ # `ln` selects the encoding width (1 to 4); `set_char_at` passes `c.u8char_len`.
+ # Nothing is written for any other value of `ln`. No bound is checked.
+ private fun native_set_char(pos: Int, c: Char, ln: Int) `{
+ 	char* out = self + pos;
+ 	if(ln == 1){
+ 		out[0] = c;
+ 	}else if(ln == 2){
+ 		out[0] = 0xC0 | ((c & 0x7C0) >> 6);
+ 		out[1] = 0x80 | (c & 0x3F);
+ 	}else if(ln == 3){
+ 		out[0] = 0xE0 | ((c & 0xF000) >> 12);
+ 		out[1] = 0x80 | ((c & 0xFC0) >> 6);
+ 		out[2] = 0x80 | (c & 0x3F);
+ 	}else if(ln == 4){
+ 		out[0] = 0xF0 | ((c & 0x1C0000) >> 18);
+ 		out[1] = 0x80 | ((c & 0x3F000) >> 12);
+ 		out[2] = 0x80 | ((c & 0xFC0) >> 6);
+ 		out[3] = 0x80 | (c & 0x3F);
+ 	}
+ `}
end
redef class Int
module native
import kernel
+import math
+
+redef class Byte
+ 	# Length in bytes of the UTF-8 sequence whose lead byte is `self`
+ 	#
+ 	# Only the high bits of `self` are inspected:
+ 	# `0xxxxxxx` gives 1, `110xxxxx` gives 2, `1110xxxx` gives 3 and `11110xxx` gives 4.
+ 	# Any other pattern is not a lead byte and gives 1.
+ 	private fun u8len: Int do
+ 		if (self & 0x80u8) == 0x00u8 then return 1
+ 		if (self & 0xE0u8) == 0xC0u8 then return 2
+ 		if (self & 0xF0u8) == 0xE0u8 then return 3
+ 		if (self & 0xF8u8) == 0xF0u8 then return 4
+ 		return 1
+ 	end
+end
# Native strings are simple C char *
extern class NativeString `{ char* `}
# Parse `self` as a Float.
+ #
+ # The conversion is delegated to the C function `atof`, called on the underlying `char*`.
fun atof: Float `{ return atof(self); `}
+
+ # Gets the UTF-8 char starting at byte index `pos`
+ #
+ # `pos` is a byte offset in `self`, not a char index.
+ #
+ # ~~~raw
+ #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
+ # ~~~
+ #
+ # If the bytes at `pos` do not form a sequence recognized by this function
+ # (`pos` is on a continuation byte, the lead byte is invalid or a continuation
+ # byte is missing), the Unicode replacement character � (0xFFFD) is returned.
+ #
+ # ~~~raw
+ #     assert "かきく".as(FlatString).items.char_at(1) == '�'
+ # ~~~
+ fun char_at(pos: Int): Char `{
+ 	char c = self[pos];
+ 	/* 0xxxxxxx: single byte */
+ 	if((c & 0x80) == 0x00) return (uint32_t)c;
+ 	/* 110xxxxx 10xxxxxx
+ 	 * In every multi-byte case, && short-circuits: a following byte is only
+ 	 * read once the previous ones have been accepted. */
+ 	if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
+ 	/* 1110xxxx 10xxxxxx 10xxxxxx */
+ 	if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
+ 	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 	 * The lead byte is tested with mask 0xF8 (five high bits), so that every
+ 	 * lead byte from 0xF0 to 0xF7 is accepted and 0xF8 is rejected. */
+ 	if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
+ 	/* Nothing matched: replacement character */
+ 	return 0xFFFD;
+ `}
+
+ # Byte index, in a UTF-8 string, of the char at position `n`
+ #
+ # The search starts from the beginning of `self` (char 0, byte 0).
+ fun char_to_byte_index(n: Int): Int do
+ 	return char_to_byte_index_cached(n, 0, 0)
+ end
+
+ # Gets the length in bytes of the character starting at byte index `pos`
+ #
+ # Returns 1 if the bytes at `pos` are not a valid sequence: `pos` is on a
+ # continuation byte, the lead byte is invalid, or one of the expected
+ # continuation bytes (`10xxxxxx`) is missing.
+ fun length_of_char_at(pos: Int): Int do
+ 	var c = self[pos]
+ 	if c & 0x80u8 == 0x00u8 then
+ 		# 0xxxxxxx
+ 		return 1
+ 	else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
+ 		# 110xxxxx 10xxxxxx
+ 		return 2
+ 	else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
+ 		# 1110xxxx 10xxxxxx 10xxxxxx
+ 		return 3
+ 	else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
+ 		# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 		# The mask is 0xF8 (five high bits) so that lead bytes 0xF1 to 0xF7 match too
+ 		return 4
+ 	else
+ 		return 1
+ 	end
+ end
+
+ # Byte index, in a UTF-8 string, of the char at position `n`
+ #
+ # The search starts from a known position: `byte_from` is taken as the
+ # byte index of the char at position `char_from`.
+ #
+ # NOTE: `char_from` and `byte_from` are not checked here.
+ # It is up to the client to ensure that they are valid and consistent.
+ fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
+ 	var byte_pos = byte_from
+ 	var char_pos = char_from
+ 	# Target after the starting point: skip forward, one char at a time
+ 	while char_pos < n do
+ 		byte_pos += length_of_char_at(byte_pos)
+ 		char_pos += 1
+ 	end
+ 	# Target before the starting point: step back to the start of each previous char
+ 	while char_pos > n do
+ 		byte_pos = find_beginning_of_char_at(byte_pos - 1)
+ 		char_pos -= 1
+ 	end
+ 	return byte_pos
+ end
+
+ # Returns the beginning position of the char at position `pos`
+ #
+ # `pos` is a byte index. If the char is invalid UTF-8, `pos` is returned as-is.
+ #
+ # The backward search never goes before index 0, even if `self`
+ # starts with continuation bytes.
+ #
+ # ~~~raw
+ #     assert "abc".items.find_beginning_of_char_at(2) == 2
+ #     assert "か".items.find_beginning_of_char_at(1) == 0
+ #     assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
+ # ~~~
+ fun find_beginning_of_char_at(pos: Int): Int do
+ 	var stpos = pos
+ 	var c = self[stpos]
+ 	# Walk back over continuation bytes (10xxxxxx), stopping at the first byte of `self`
+ 	while stpos > 0 and c & 0xC0u8 == 0x80u8 do
+ 		stpos -= 1
+ 		c = self[stpos]
+ 	end
+ 	# `stpos` is the answer only if the sequence starting there is long enough to reach `pos`
+ 	if length_of_char_at(stpos) >= (pos - stpos + 1) then return stpos
+ 	return pos
+ end
end