lib/core/text/native.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # Native structures for text and bytes
  12 module native
  13
  14 import kernel
  15 import math
  16 import fixed_ints
  17
  18 in "C" `{
  19 #ifdef __linux__
  20         #include <endian.h>
  21 #endif
  22 #ifdef __APPLE__
  23         #include <libkern/OSByteOrder.h>
  24         #define be32toh(x) OSSwapBigToHostInt32(x)
  25 #endif
  26 #ifdef _WIN32
  27         #define be32toh(val) _byteswap_ulong(val)
  28 #endif
  29 /* PNaCl fallback: masked byte swaps (assumes a little-endian target -- TODO confirm). */
  30 #ifdef __pnacl__
  31         #define be16toh(val) ((uint16_t)((((val) & 0xFFu) << 8) | (((val) >> 8) & 0xFFu)))
  32         #define be32toh(val) ((((uint32_t)be16toh((val) & 0xFFFFu)) << 16) | be16toh(((val) >> 16) & 0xFFFFu))
  33 #endif
  34 #ifndef be32toh
  35         #define be32toh(val) betoh32(val)
  36 #endif
  37 `}
  38
  39 redef class Byte
  40         # Byte length of the UTF-8 sequence introduced by `self` (1 if `self` is not a lead byte)
  41         fun u8len: Int do
  42                 # 0xxxxxxx: single-byte (ASCII) character
  43                 if self & 0x80u8 == 0u8 then return 1
  44                 # 110xxxxx: lead byte of a 2-byte sequence
  45                 if self & 0xE0u8 == 0xC0u8 then return 2
  46                 # 1110xxxx: lead byte of a 3-byte sequence
  47                 if self & 0xF0u8 == 0xE0u8 then return 3
  48                 # 11110xxx: lead byte of a 4-byte sequence
  49                 if self & 0xF8u8 == 0xF0u8 then return 4
  50                 # Any other byte (continuation byte or invalid lead)
  51                 # is counted as a sequence of length 1.
  52                 return 1
  53         end
  54
  55         # Can `self` be the first byte of a UTF-8 sequence?
  56         #
  57         # ~~~nit
  58         # assert 0u8.is_valid_utf8_start
  59         # assert 0xC0u8.is_valid_utf8_start
  60         # assert 0xE0u8.is_valid_utf8_start
  61         # assert 0xF0u8.is_valid_utf8_start
  62         # ~~~
  63         fun is_valid_utf8_start: Bool do
  64                 var is_ascii = self & 0x80u8 == 0u8
  65                 var is_lead2 = self & 0xE0u8 == 0xC0u8
  66                 var is_lead3 = self & 0xF0u8 == 0xE0u8
  67                 var is_lead4 = self & 0xF8u8 == 0xF0u8
  68                 return is_ascii or is_lead2 or is_lead3 or is_lead4
  69         end
  70 end
  71
  72 redef class UInt32
  73         # Decodes the UTF-16 surrogate pair packed in `self` into its code point
  74         #
  75         #     assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32
  76         fun from_utf16_surr: UInt32 do
  77                 # The high surrogate sits in the upper 16 bits, the low one in the lower 16 bits.
  78                 var high = (self & 0xFFFF0000u32) >> 16
  79                 var low = self & 0xFFFFu32
  80                 # Each surrogate carries 10 payload bits once its base value is removed;
  81                 # together they give the offset of the code point above 0x10000.
  82                 var offset = ((high - 0xD800u32) << 10) + (low - 0xDC00u32)
  83                 return offset + 0x10000u32
  84         end
  85
  86         # The character whose Unicode code point is `self`
  87         #
  88         #     assert 65u32.code_point == 'A'
  89         #     assert 10u32.code_point == '\n'
  90         #     assert 0x220Bu32.code_point == '∋'
  91         fun code_point: Char `{ return self; `}
  92 end
  93
  94 # C string `char *`
  95 #
  96 # Used as underlying implementation for `String` and some other `Text`.
  97 extern class CString `{ char* `}
  98         # Create a new `CString` with the capacity for `length` characters
  99         new(length: Int) is intern
 100
 101         # Get a char* starting at `index`.
 102         #
 103         # WARNING: Unsafe for extern code, use only for temporary
 104         # pointer manipulation purposes (e.g. write to file or such)
 105         fun fast_cstring(index: Int): CString is intern
 106
 107         # Get char at `index`.
 108         fun [](index: Int): Byte is intern
 109
 110         # Set char `item` at index.
 111         fun []=(index: Int, item: Byte) is intern
 112
 113         # Copy `length` bytes of `self`, starting at `from`, into `dest` at position `to`.
 114         fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern
 115
 116         redef fun ==(o) is intern do return is_same_instance(o)
 117
 118         redef fun !=(o) is intern do return not is_same_instance(o)
 119
 120         # Position of the first nul character.
 121         fun cstring_length: Int
 122         do
 123                 var l = 0
 124                 while self[l] != 0u8 do l += 1
 125                 return l
 126         end
 127
 128         # Parse `self` as an Int.
 129         fun atoi: Int is intern
 130
 131         # Parse `self` as a Float.
 132         fun atof: Float `{ return atof(self); `}
 133
 134         # Gets the UTF-8 char starting at index `pos`
 135         #
 136         # Index is expressed in bytes, not in Unicode chars
 137         #
 138         # ~~~raw
 139         #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
 140         # ~~~
 141         #
 142         # If the bytes at position `pos` do not form a valid UTF-8 sequence,
 143         # the Unicode replacement character � (0xFFFD) will be used.
 144         #
 145         # ~~~raw
 146         #     assert "かきく".as(FlatString).items.char_at(1) == '�'
 147         # ~~~
 148         fun char_at(pos: Int): Char do
 149                 var c = self[pos]
 150                 if c & 0x80u8 == 0u8 then return c.ascii
 151                 var b = fetch_4_hchars(pos) # lead byte lands in the top 8 bits
 152                 var ret = 0u32
 153                 if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point # 2nd byte must be 10xxxxxx
 154                 if b & 0xE0000000u32 == 0xC0000000u32 then
 155                         ret |= (b & 0x1F000000u32) >> 18
 156                         ret |= (b & 0x3F0000u32) >> 16
 157                         return ret.code_point
 158                 end
 159                 if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point # 3rd byte must be 10xxxxxx
 160                 if b & 0xF0000000u32 == 0xE0000000u32 then
 161                         ret |= (b & 0xF000000u32) >> 12
 162                         ret |= (b & 0x3F0000u32) >> 10
 163                         ret |= (b & 0x3F00u32) >> 8
 164                         return ret.code_point
 165                 end
 166                 if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point # 4th byte must be 10xxxxxx
 167                 if b & 0xF8000000u32 == 0xF0000000u32 then
 168                         ret |= (b & 0x7000000u32) >> 6
 169                         ret |= (b & 0x3F0000u32) >> 4
 170                         ret |= (b & 0x3F00u32) >> 2
 171                         ret |= b & 0x3Fu32
 172                         return ret.code_point
 173                 end
 174                 return 0xFFFD.code_point
 175         end
 176
 177         # Gets the byte index of char at position `n` in UTF-8 String
 178         fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
 179
 180         # Gets the length of the character at position `pos` (1 if invalid sequence)
 181         fun length_of_char_at(pos: Int): Int do
 182                 var c = self[pos]
 183                 if c & 0x80u8 == 0x00u8 then
 184                         return 1
 185                 else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
 186                         return 2
 187                 else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
 188                         return 3
 189                 else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
 190                         return 4
 191                 else
 192                         return 1
 193                 end
 194         end
 195
 196         # Gets the byte index of char at position `n` in UTF-8 String
 197         #
 198         # `char_from` and `byte_from` are cached values to seek from.
 199         #
 200         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 201         # It is up to the client to ensure the validity of the information
 202         fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
 203                 var ns_i = byte_from
 204                 var my_i = char_from
 205
 206                 var dist = n - my_i
 207
 208                 while dist > 0 do
 209                         while dist >= 4 do
 210                                 var i = fetch_4_chars(ns_i)
 211                                 if i & 0x80808080u32 != 0u32 then break
 212                                 ns_i += 4
 213                                 my_i += 4
 214                                 dist -= 4
 215                         end
 216                         if dist == 0 then break
 217                         ns_i += length_of_char_at(ns_i)
 218                         my_i += 1
 219                         dist -= 1
 220                 end
 221
 222                 while dist < 0 do
 223                         while dist <= -4 do
 224                                 var i = fetch_4_chars(ns_i - 4)
 225                                 if i & 0x80808080u32 != 0u32 then break
 226                                 ns_i -= 4
 227                                 my_i -= 4
 228                                 dist += 4
 229                         end
 230                         if dist == 0 then break
 231                         ns_i = find_beginning_of_char_at(ns_i - 1)
 232                         my_i -= 1
 233                         dist += 1
 234                 end
 235
 236                 return ns_i
 237         end
 238
 239         # Gets the char index of byte at position `n` in a UTF-8 String
 240         #
 241         # `char_from` and `byte_from` are cached values to seek from.
 242         #
 243         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 244         # It is up to the client to ensure the validity of the information
 245         fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
 246                 var ns_i = byte_from
 247                 var my_i = char_from
 248
 249                 while ns_i < n do
 250                         ns_i += length_of_char_at(ns_i)
 251                         my_i += 1
 252                 end
 253
 254                 while ns_i > n do
 255                         ns_i = find_beginning_of_char_at(ns_i - 1)
 256                         my_i -= 1
 257                 end
 258
 259                 return my_i
 260         end
 261
 262         # Returns the beginning position of the char at position `pos`
 263         #
 264         # If the char is invalid UTF-8, `pos` is returned as-is
 265         #
 266         # ~~~raw
 267         #       assert "abc".items.find_beginning_of_char_at(2) == 2
 268         #       assert "か".items.find_beginning_of_char_at(1) == 0
 269         #       assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
 270         # ~~~
 271         fun find_beginning_of_char_at(pos: Int): Int do
 272                 var endpos = pos
 273                 var c = self[pos]
 274                 if c & 0x80u8 == 0x00u8 then return pos
 275                 while pos > 0 and c & 0xC0u8 == 0x80u8 do # never walk back past index 0
 276                         pos -= 1
 277                         c = self[pos]
 278                 end
 279                 var stpos = pos
 280                 if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
 281                 return endpos
 282         end
 283
 284         # Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
 285         fun utf8_length(from, byte_length: Int): Int is intern do
 286                 var st = from
 287                 var ln = 0
 288                 while byte_length > 0 do
 289                         while byte_length >= 4 do
 290                                 var i = fetch_4_chars(st)
 291                                 if i & 0x80808080u32 != 0u32 then break
 292                                 byte_length -= 4
 293                                 st += 4
 294                                 ln += 4
 295                         end
 296                         if byte_length == 0 then break
 297                         var cln = length_of_char_at(st)
 298                         st += cln
 299                         ln += 1
 300                         byte_length -= cln
 301                 end
 302                 return ln
 303         end
 304
 305         # Fetch the 4 bytes of `self` at `pos` as a raw `UInt32` (no byte-order conversion)
 306         fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `}
 307
 308         # Fetch the 4 bytes of `self` at `pos` read as a big-endian `UInt32` (first byte is the most significant)
 309         fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `}
 310
 311         # Right shifts `len` bytes of `self` by `sh` bytes starting at position `pos`
 312         fun rshift(sh, len, pos: Int) do
 313                 copy_to(self, len, pos, pos + sh)
 314         end
 315
 316         # Left shifts `len` bytes of `self` by `sh` bytes starting at position `pos`
 317         fun lshift(sh, len, pos: Int) do
 318                 copy_to(self, len, pos, pos - sh)
 319         end
 320 end