redef class Byte
# Gives the length of the UTF-8 char starting with `self`
- private fun u8len: Int do
+ fun u8len: Int do
if self & 0b1000_0000u8 == 0u8 then
return 1
else if self & 0b1110_0000u8 == 0b1100_0000u8 then
end
end
+redef class Int
+	# Decodes the UTF-16 surrogate pair packed in `self` into a code point
+	#
+	# The high surrogate is read from bits 16 to 31 of `self`,
+	# the low surrogate from bits 0 to 15.
+	#
+	#     assert 0xD83DDE02.from_utf16_surr == 0x1F602
+	fun from_utf16_surr: Int do
+		var high = (self & 0xFFFF0000) >> 16
+		var low = self & 0xFFFF
+		# Rebase both halves, shift the high one by 10 bits, then add the 0x10000 offset
+		return ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
+	end
+end
+
# Native strings are simple C char *
extern class NativeString `{ char* `}
# Creates a new NativeString with a capacity of `length`
return ns_i
end
+	# Converts the byte position `n` of a UTF-8 String into a char index
+	#
+	# The search does not start from scratch: `byte_from` is the byte position
+	# to seek from and `char_from` is the char index counted from there.
+	#
+	# NOTE: `char_from` and `byte_from` are not checked for validity.
+	# It is up to the client to ensure that these cached values are coherent.
+	fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
+		var byte_pos = byte_from
+		var char_pos = char_from
+
+		# Seek forward, one char at a time, while still before `n`
+		while byte_pos < n do
+			byte_pos += length_of_char_at(byte_pos)
+			char_pos += 1
+		end
+
+		# Seek backward, one char at a time, while past `n`
+		while byte_pos > n do
+			byte_pos = find_beginning_of_char_at(byte_pos - 1)
+			char_pos -= 1
+		end
+
+		return char_pos
+	end
+
# Returns the beginning position of the char at position `pos`
#
# If the char is invalid UTF-8, `pos` is returned as-is
if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
return endpos
end
+
+	# Counts the UTF-8 characters of `self` between positions `from` and `to`
+	#
+	# The count advances one char at a time from `from`
+	# and stops as soon as the position goes past `to`.
+	fun utf8_length(from, to: Int): Int do
+		var pos = from
+		var count = 0
+		while pos <= to do
+			pos += length_of_char_at(pos)
+			count += 1
+		end
+		return count
+	end
end