escape_to_c: Escape trigraphs
[nit.git] / lib / core / text / native.nit
index fad3ae1..7902d11 100644 (file)
@@ -47,6 +47,22 @@ redef class Byte
                        return 1
                end
        end
+
+       # Is `self` a valid first byte of a UTF-8 sequence (a single-byte character or the lead byte of a 2-, 3- or 4-byte sequence)?
+       #
+       # ~~~nit
+       # assert 0u8.is_valid_utf8_start
+       # assert 0xC0u8.is_valid_utf8_start
+       # assert 0xE0u8.is_valid_utf8_start
+       # assert 0xF0u8.is_valid_utf8_start
+       # ~~~
+       fun is_valid_utf8_start: Bool do
+               if self & 0x80u8 == 0u8 then return true # 0xxxxxxx: high bit clear, single-byte character
+               if self & 0b1110_0000u8 == 0b1100_0000u8 then return true # 110xxxxx: lead byte of a 2-byte sequence
+               if self & 0b1111_0000u8 == 0b1110_0000u8 then return true # 1110xxxx: lead byte of a 3-byte sequence
+               if self & 0b1111_1000u8 == 0b1111_0000u8 then return true # 11110xxx: lead byte of a 4-byte sequence
+               return false # any other pattern, e.g. a 10xxxxxx byte
+       end
 end
 
 redef class Int
@@ -84,6 +100,10 @@ extern class NativeString `{ char* `}
        # Copy `self` to `dest`.
        fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
 
+       redef fun ==(o) is intern do return is_same_instance(o) # equality is delegated to `is_same_instance`
+
+       redef fun !=(o) is intern do return not is_same_instance(o) # negation of `is_same_instance`
+
        # Position of the first nul character.
        fun cstring_length: Int
        do
@@ -238,6 +258,7 @@ extern class NativeString `{ char* `}
        fun find_beginning_of_char_at(pos: Int): Int do
                var endpos = pos
                var c = self[pos]
+               if c & 0x80u8 == 0x00u8 then return pos
                while c & 0xC0u8 == 0x80u8 do
                        pos -= 1
                        c = self[pos]
@@ -247,25 +268,41 @@ extern class NativeString `{ char* `}
                return endpos
        end
 
-       # Number of UTF-8 characters in `self` between positions `from` and `to`
-       fun utf8_length(from, to: Int): Int do
+       # Number of UTF-8 characters in the `bytelen` bytes of `self` that start at byte position `from`
+       fun utf8_length(from, bytelen: Int): Int is intern do
                var st = from
-               var lst = to
                var ln = 0
-               while st <= lst do
-                       st += length_of_char_at(st)
+               while bytelen > 0 do
+                       while bytelen >= 4 do # fast path: examine 4 bytes per iteration
+                               var i = fetch_4_chars(st)
+                               if i & 0x80808080 != 0 then break # at least one of the 4 bytes has its high bit set: leave the fast path
+                               bytelen -= 4
+                               st += 4
+                               ln += 4 # all 4 bytes have the high bit clear, so each one counts as one character
+                       end
+                       if bytelen == 0 then break
+                       var cln = length_of_char_at(st) # slow path: step over one character, by the length `length_of_char_at` reports
+                       st += cln
                        ln += 1
+                       bytelen -= cln
                end
                return ln
        end
 
        # Fetch 4 chars in `self` at `pos`
-       fun fetch_4_chars(pos: Int): Int is intern do return fetch_4_ffi(pos)
+       fun fetch_4_chars(pos: Int): Int is intern `{ return (long)*((uint32_t*)(self+pos)); `} # raw 32-bit load at byte offset `pos`, no byte-order conversion; NOTE(review): the load may be unaligned, confirm this is acceptable on target platforms
 
        # Fetch 4 chars in `self` at `pos`
-       fun fetch_4_hchars(pos: Int): Int is intern do return fetch_4h_ffi(pos)
+       fun fetch_4_hchars(pos: Int): Int is intern `{ return (long)be32toh(*((uint32_t*)(self+pos))); `} # 32-bit load at byte offset `pos`, converted with `be32toh` (big-endian to host byte order)
+
 
-       # FIXME: To remove when bootstrap supports PR #1898
-       private fun fetch_4_ffi(pos: Int): Int `{ return (long)*((uint32_t*)(self+pos)); `}
-       private fun fetch_4h_ffi(pos: Int): Int `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}
+       # Right shifts `len` bytes of `self` by `sh` bytes, starting at position `pos`
+       fun rshift(sh, len, pos: Int) do
+               copy_to(self, len, pos, pos + sh) # in-place copy from `pos` to `pos + sh`; NOTE(review): the two ranges overlap when `sh < len`, confirm `copy_to` is overlap-safe
+       end
+
+       # Left shifts `len` bytes of `self` by `sh` bytes, starting at position `pos`
+       fun lshift(sh, len, pos: Int) do
+               copy_to(self, len, pos, pos - sh) # in-place copy from `pos` to `pos - sh`; presumably requires `sh <= pos` (TODO confirm); the ranges overlap when `sh < len`
+       end
 end