lib/core/text/native.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # Native structures for text and bytes
  12 module native
  13
  14 import kernel
  15 import math
  16
  17 in "C" `{
  18 #ifdef __linux__
  19         #include <endian.h>
  20 #endif
  21 #ifdef __APPLE__
  22         #include <libkern/OSByteOrder.h>
  23         #define be32toh(x) OSSwapBigToHostInt32(x)
  24 #endif
  25 #ifdef _WIN32
  26         #define be32toh(val) _byteswap_ulong(val)
  27 #endif
  28
  29 #ifdef __pnacl__
  30         #define be16toh(val) (((val) >> 8) | ((val) << 8))
  31         #define be32toh(val) ((be16toh((val) << 16) | (be16toh((val) >> 16))))
  32 #endif
  33 #ifndef be32toh
  34         #define be32toh(val) betoh32(val)
  35 #endif
  36 `}
  37
  38 redef class Byte
  39         # Gives the length of the UTF-8 char starting with `self`
  40         fun u8len: Int do
  41                 if self & 0b1000_0000u8 == 0u8 then
  42                         return 1
  43                 else if self & 0b1110_0000u8 == 0b1100_0000u8 then
  44                         return 2
  45                 else if self & 0b1111_0000u8 == 0b1110_0000u8 then
  46                         return 3
  47                 else if self & 0b1111_1000u8 == 0b1111_0000u8 then
  48                         return 4
  49                 else
  50                         return 1
  51                 end
  52         end
  53
  54         # Is `self` a valid UTF-8 sequence start ?
  55         #
  56         # ~~~nit
  57         # assert 0u8.is_valid_utf8_start
  58         # assert 0xC0u8.is_valid_utf8_start
  59         # assert 0xE0u8.is_valid_utf8_start
  60         # assert 0xF0u8.is_valid_utf8_start
  61         # ~~~
  62         fun is_valid_utf8_start: Bool do
  63                 if self & 0x80u8 == 0u8 then return true
  64                 if self & 0b1110_0000u8 == 0b1100_0000u8 then return true
  65                 if self & 0b1111_0000u8 == 0b1110_0000u8 then return true
  66                 if self & 0b1111_1000u8 == 0b1111_0000u8 then return true
  67                 return false
  68         end
  69 end
  70
  71 redef class Int
  72         # Returns the code_point from a utf16 surrogate pair
  73         #
  74         #     assert 0xD83DDE02.from_utf16_surr == 0x1F602
  75         fun from_utf16_surr: Int do
  76                 var hi = (self & 0xFFFF0000) >> 16
  77                 var lo = self & 0xFFFF
  78                 var cp = 0
  79                 cp += (hi - 0xD800) << 10
  80                 cp += lo - 0xDC00
  81                 cp += 0x10000
  82                 return cp
  83         end
  84 end
  85
  86 # C string `char *`
  87 #
  88 # Used as underlying implementation for `String` and some other `Text`.
  89 extern class CString `{ char* `}
        # Create a new `CString` with the capacity for `length` characters
        #
        # NOTE(review): elements are read and written as `Byte` through `[]`
        # and `[]=`, so `length` is presumably a size in bytes; confirm.
        new(length: Int) is intern
  92
        # Get a char* starting at `index`.
        #
        # WARNING: Unsafe for extern code, use only for temporary
        # pointer manipulation purposes (e.g. write to file or such)
        #
        # NOTE(review): the warning suggests that the result points into the
        # memory of `self` instead of being a copy; confirm before relying on it.
        fun fast_cstring(index: Int): CString is intern
  98
        # Get the byte at `index`.
        #
        # The value is a raw `Byte`, not a decoded character: see `char_at`
        # to decode a UTF-8 sequence.
        fun [](index: Int): Byte is intern
 101
        # Set the byte at `index` to `item`.
        fun []=(index: Int, item: Byte) is intern
 104
        # Copy `self` to `dest`.
        #
        # As used by `rshift` and `lshift` in this class: `length` is the number
        # of bytes to copy, `from` the start offset in `self` and `to` the start
        # offset in `dest`. Both callers pass `self` as `dest` with ranges that
        # may overlap.
        fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern
 107
        # Identity comparison: the body shown here returns `is_same_instance(o)`,
        # the bytes pointed to are not compared.
        redef fun ==(o) is intern do return is_same_instance(o)
 109
        # Negation of the identity comparison: the body shown here returns
        # `not is_same_instance(o)`, the bytes pointed to are not compared.
        redef fun !=(o) is intern do return not is_same_instance(o)
 111
 112         # Position of the first nul character.
 113         fun cstring_length: Int
 114         do
 115                 var l = 0
 116                 while self[l] != 0u8 do l += 1
 117                 return l
 118         end
 119
        # Parse `self` as an Int.
        #
        # NOTE(review): implemented as an intern; the accepted syntax and the
        # result on malformed input are not visible in this file.
        fun atoi: Int is intern
 122
        # Parse `self` as a Float.
        #
        # Thin wrapper: `self` is handed as-is to the C function `atof` and its
        # result is returned.
        fun atof: Float `{ return atof(self); `}
 125
 126         # Gets the UTF-8 char at index `pos`
 127         #
 128         # Index is expressed in Unicode chars
 129         #
 130         # ~~~raw
 131         #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
 132         # ~~~
 133         #
 134         # If the char at position pos is an invalid Unicode char,
 135         # the Unicode replacement character � (0xFFFD) will be used.
 136         #
 137         # ~~~raw
 138         #     assert "かきく".as(FlatString).items.char_at(1) == '�'
 139         # ~~~
 140         fun char_at(pos: Int): Char do
 141                 var c = self[pos]
 142                 if c & 0x80u8 == 0u8 then return c.ascii
 143                 var b = fetch_4_hchars(pos)
 144                 var ret = 0
 145                 if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
 146                 if b & 0xE0000000 == 0xC0000000 then
 147                         ret |= (b & 0x1F000000) >> 18
 148                         ret |= (b & 0x3F0000) >> 16
 149                         return ret.code_point
 150                 end
 151                 if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point
 152                 if b & 0xF0000000 == 0xE0000000 then
 153                         ret |= (b & 0xF000000) >> 12
 154                         ret |= (b & 0x3F0000) >> 10
 155                         ret |= (b & 0x3F00) >> 8
 156                         return ret.code_point
 157                 end
 158                 if not b & 0xC0 == 0x80 then return 0xFFFD.code_point
 159                 if b & 0xF8000000 == 0xF0000000 then
 160                         ret |= (b.to_i & 0x7000000) >> 6
 161                         ret |= (b.to_i & 0x3F0000) >> 4
 162                         ret |= (b.to_i & 0x3F00) >> 2
 163                         ret |= b.to_i & 0x3F
 164                         return ret.code_point
 165                 end
 166                 return 0xFFFD.code_point
 167         end
 168
 169         # Gets the byte index of char at position `n` in UTF-8 String
 170         fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
 171
 172         # Gets the length of the character at position `pos` (1 if invalid sequence)
 173         fun length_of_char_at(pos: Int): Int do
 174                 var c = self[pos]
 175                 if c & 0x80u8 == 0x00u8 then
 176                         return 1
 177                 else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
 178                         return 2
 179                 else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
 180                         return 3
 181                 else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
 182                         return 4
 183                 else
 184                         return 1
 185                 end
 186         end
 187
 188         # Gets the byte index of char at position `n` in UTF-8 String
 189         #
 190         # `char_from` and `byte_from` are cached values to seek from.
 191         #
 192         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 193         # It it up to the client to ensure the validity of the information
 194         fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
 195                 var ns_i = byte_from
 196                 var my_i = char_from
 197
 198                 var dist = n - my_i
 199
 200                 while dist > 0 do
 201                         while dist >= 4 do
 202                                 var i = fetch_4_chars(ns_i)
 203                                 if i & 0x80808080 != 0 then break
 204                                 ns_i += 4
 205                                 my_i += 4
 206                                 dist -= 4
 207                         end
 208                         if dist == 0 then break
 209                         ns_i += length_of_char_at(ns_i)
 210                         my_i += 1
 211                         dist -= 1
 212                 end
 213
 214                 while dist < 0 do
 215                         while dist <= -4 do
 216                                 var i = fetch_4_chars(ns_i - 4)
 217                                 if i & 0x80808080 != 0 then break
 218                                 ns_i -= 4
 219                                 my_i -= 4
 220                                 dist += 4
 221                         end
 222                         if dist == 0 then break
 223                         ns_i = find_beginning_of_char_at(ns_i - 1)
 224                         my_i -= 1
 225                         dist += 1
 226                 end
 227
 228                 return ns_i
 229         end
 230
 231         # Gets the char index of byte at position `n` in a UTF-8 String
 232         #
 233         # `char_from` and `byte_from` are cached values to seek from.
 234         #
 235         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 236         # It it up to the client to ensure the validity of the information
 237         fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
 238                 var ns_i = byte_from
 239                 var my_i = char_from
 240
 241                 while ns_i < n do
 242                         ns_i += length_of_char_at(ns_i)
 243                         my_i += 1
 244                 end
 245
 246                 while ns_i > n do
 247                         ns_i = find_beginning_of_char_at(ns_i - 1)
 248                         my_i -= 1
 249                 end
 250
 251                 return my_i
 252         end
 253
 254         # Returns the beginning position of the char at position `pos`
 255         #
 256         # If the char is invalid UTF-8, `pos` is returned as-is
 257         #
 258         # ~~~raw
 259         #       assert "abc".items.find_beginning_of_char_at(2) == 2
 260         #       assert "か".items.find_beginning_of_char_at(1) == 0
 261         #       assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
 262         # ~~~
 263         fun find_beginning_of_char_at(pos: Int): Int do
 264                 var endpos = pos
 265                 var c = self[pos]
 266                 if c & 0x80u8 == 0x00u8 then return pos
 267                 while c & 0xC0u8 == 0x80u8 do
 268                         pos -= 1
 269                         c = self[pos]
 270                 end
 271                 var stpos = pos
 272                 if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
 273                 return endpos
 274         end
 275
 276         # Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length`
 277         fun utf8_length(from, byte_length: Int): Int is intern do
 278                 var st = from
 279                 var ln = 0
 280                 while byte_length > 0 do
 281                         while byte_length >= 4 do
 282                                 var i = fetch_4_chars(st)
 283                                 if i & 0x80808080 != 0 then break
 284                                 byte_length -= 4
 285                                 st += 4
 286                                 ln += 4
 287                         end
 288                         if byte_length == 0 then break
 289                         var cln = length_of_char_at(st)
 290                         st += cln
 291                         ln += 1
 292                         byte_length -= cln
 293                 end
 294                 return ln
 295         end
 296
        # Fetch 4 chars in `self` at `pos`
        #
        # The C body shown here reads the 4 bytes starting at byte offset `pos`
        # as a single `uint32_t`, without any byte swapping, and returns it as
        # an `Int`. Which byte ends up in the low-order bits thus depends on
        # the byte order of the host.
        #
        # The callers in this class only test the result against `0x80808080`
        # (is any of the 4 bytes non-ASCII?), which does not depend on byte order.
        #
        # NOTE(review): the pointer is cast to `uint32_t*` for any `pos`, with
        # no alignment check; confirm that this is fine on all targets.
        fun fetch_4_chars(pos: Int): Int is intern `{ return (long)*((uint32_t*)(self+pos)); `}
 299
        # Fetch 4 chars in `self` at `pos`, as a big-endian word
        #
        # The C body shown here reads the 4 bytes starting at byte offset `pos`
        # as a single `uint32_t` and passes it through `be32toh` before
        # returning it as an `Int`: the byte at `pos` is expected in bits 24..31
        # of the result and the byte at `pos + 3` in bits 0..7, which is the
        # layout `char_at` decodes.
        #
        # NOTE(review): the pointer is cast to `uint32_t*` for any `pos`, with
        # no alignment check; confirm that this is fine on all targets.
        fun fetch_4_hchars(pos: Int): Int is intern `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}
 302
 303
 304         # Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
 305         fun rshift(sh, len, pos: Int) do
 306                 copy_to(self, len, pos, pos + sh)
 307         end
 308
 309         # Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos`
 310         fun lshift(sh, len, pos: Int) do
 311                 copy_to(self, len, pos, pos - sh)
 312         end
 313 end