lib/core/text/native.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # Native structures for text and bytes
  12 module native
  13
  14 import kernel
  15 import math
  16
  17 in "C" `{
  18 #ifdef __linux__
  19         #include <endian.h>
  20 #endif
  21 #ifdef __APPLE__
  22         #include <libkern/OSByteOrder.h>
  23         #define be32toh(x) OSSwapBigToHostInt32(x)
  24 #endif
  25
  26 #ifdef __pnacl__
  27         #define be16toh(val) (((val) >> 8) | ((val) << 8))
  28         #define be32toh(val) ((be16toh((val) << 16) | (be16toh((val) >> 16))))
  29 #endif
  30 #ifndef be32toh
  31         #define be32toh(val) betoh32(val)
  32 #endif
  33 `}
  34
  35 redef class Byte
  36         # Gives the length of the UTF-8 char starting with `self`
  37         fun u8len: Int do
  38                 if self & 0b1000_0000u8 == 0u8 then
  39                         return 1
  40                 else if self & 0b1110_0000u8 == 0b1100_0000u8 then
  41                         return 2
  42                 else if self & 0b1111_0000u8 == 0b1110_0000u8 then
  43                         return 3
  44                 else if self & 0b1111_1000u8 == 0b1111_0000u8 then
  45                         return 4
  46                 else
  47                         return 1
  48                 end
  49         end
  50 end
  51
  52 redef class Int
  53         # Returns the code_point from a utf16 surrogate pair
  54         #
  55         #     assert 0xD83DDE02.from_utf16_surr == 0x1F602
  56         fun from_utf16_surr: Int do
  57                 var hi = (self & 0xFFFF0000) >> 16
  58                 var lo = self & 0xFFFF
  59                 var cp = 0
  60                 cp += (hi - 0xD800) << 10
  61                 cp += lo - 0xDC00
  62                 cp += 0x10000
  63                 return cp
  64         end
  65 end
  66
  67 # Native strings are simple C char *
  68 extern class NativeString `{ char* `}
        # Creates a new NativeString with a capacity of `length`
        #
        # NOTE(review): presumably `length` is a number of bytes - confirm against the runtime.
        new(length: Int) is intern

        # Returns a char* starting at `index`.
        #
        # WARNING: Unsafe for extern code, use only for temporary
        # pointer manipulation purposes (e.g. write to file or such)
        fun fast_cstring(index: Int): NativeString is intern

        # Get the byte at `index`.
        fun [](index: Int): Byte is intern

        # Set the byte at `index` to `item`.
        fun []=(index: Int, item: Byte) is intern

        # Copy `self` to `dest`.
        #
        # NOTE(review): presumably copies `length` bytes of `self` starting at `from`
        # into `dest` starting at `to` - confirm against the runtime implementation.
        fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
  86
  87         # Position of the first nul character.
  88         fun cstring_length: Int
  89         do
  90                 var l = 0
  91                 while self[l] != 0u8 do l += 1
  92                 return l
  93         end
  94
        # Parse `self` as an Int.
        fun atoi: Int is intern

        # Parse `self` as a Float.
        #
        # Delegates to the C function `atof`.
        fun atof: Float `{ return atof(self); `}
 100
 101         # Gets the UTF-8 char at index `pos`
 102         #
 103         # Index is expressed in Unicode chars
 104         #
 105         # ~~~raw
 106         #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
 107         # ~~~
 108         #
 109         # If the char at position pos is an invalid Unicode char,
 110         # the Unicode replacement character � (0xFFFD) will be used.
 111         #
 112         # ~~~raw
 113         #     assert "かきく".as(FlatString).items.char_at(1) == '�'
 114         # ~~~
 115         fun char_at(pos: Int): Char do
 116                 var c = self[pos]
 117                 if c & 0x80u8 == 0u8 then return c.ascii
 118                 var b = fetch_4_hchars(pos)
 119                 var ret = 0
 120                 if b & 0xC00000 != 0x800000 then return 0xFFFD.code_point
 121                 if b & 0xE0000000 == 0xC0000000 then
 122                         ret |= (b & 0x1F000000) >> 18
 123                         ret |= (b & 0x3F0000) >> 16
 124                         return ret.code_point
 125                 end
 126                 if not b & 0xC000 == 0x8000 then return 0xFFFD.code_point
 127                 if b & 0xF0000000 == 0xE0000000 then
 128                         ret |= (b & 0xF000000) >> 12
 129                         ret |= (b & 0x3F0000) >> 10
 130                         ret |= (b & 0x3F00) >> 8
 131                         return ret.code_point
 132                 end
 133                 if not b & 0xC0 == 0x80 then return 0xFFFD.code_point
 134                 if b & 0xF8000000 == 0xF0000000 then
 135                         ret |= (b.to_i & 0x7000000) >> 6
 136                         ret |= (b.to_i & 0x3F0000) >> 4
 137                         ret |= (b.to_i & 0x3F00) >> 2
 138                         ret |= b.to_i & 0x3F
 139                         return ret.code_point
 140                 end
 141                 return 0xFFFD.code_point
 142         end
 143
 144         # Gets the byte index of char at position `n` in UTF-8 String
 145         fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
 146
 147         # Gets the length of the character at position `pos` (1 if invalid sequence)
 148         fun length_of_char_at(pos: Int): Int do
 149                 var c = self[pos]
 150                 if c & 0x80u8 == 0x00u8 then
 151                         return 1
 152                 else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
 153                         return 2
 154                 else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
 155                         return 3
 156                 else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
 157                         return 4
 158                 else
 159                         return 1
 160                 end
 161         end
 162
 163         # Gets the byte index of char at position `n` in UTF-8 String
 164         #
 165         # `char_from` and `byte_from` are cached values to seek from.
 166         #
 167         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 168         # It it up to the client to ensure the validity of the information
 169         fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
 170                 var ns_i = byte_from
 171                 var my_i = char_from
 172
 173                 var dist = n - my_i
 174
 175                 while dist > 0 do
 176                         while dist >= 4 do
 177                                 var i = fetch_4_chars(ns_i)
 178                                 if i & 0x80808080 != 0 then break
 179                                 ns_i += 4
 180                                 my_i += 4
 181                                 dist -= 4
 182                         end
 183                         if dist == 0 then break
 184                         ns_i += length_of_char_at(ns_i)
 185                         my_i += 1
 186                         dist -= 1
 187                 end
 188
 189                 while dist < 0 do
 190                         while dist <= -4 do
 191                                 var i = fetch_4_chars(ns_i - 4)
 192                                 if i & 0x80808080 != 0 then break
 193                                 ns_i -= 4
 194                                 my_i -= 4
 195                                 dist += 4
 196                         end
 197                         if dist == 0 then break
 198                         ns_i = find_beginning_of_char_at(ns_i - 1)
 199                         my_i -= 1
 200                         dist += 1
 201                 end
 202
 203                 return ns_i
 204         end
 205
 206         # Gets the char index of byte at position `n` in a UTF-8 String
 207         #
 208         # `char_from` and `byte_from` are cached values to seek from.
 209         #
 210         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 211         # It it up to the client to ensure the validity of the information
 212         fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
 213                 var ns_i = byte_from
 214                 var my_i = char_from
 215
 216                 while ns_i < n do
 217                         ns_i += length_of_char_at(ns_i)
 218                         my_i += 1
 219                 end
 220
 221                 while ns_i > n do
 222                         ns_i = find_beginning_of_char_at(ns_i - 1)
 223                         my_i -= 1
 224                 end
 225
 226                 return my_i
 227         end
 228
 229         # Returns the beginning position of the char at position `pos`
 230         #
 231         # If the char is invalid UTF-8, `pos` is returned as-is
 232         #
 233         # ~~~raw
 234         #       assert "abc".items.find_beginning_of_char_at(2) == 2
 235         #       assert "か".items.find_beginning_of_char_at(1) == 0
 236         #       assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
 237         # ~~~
 238         fun find_beginning_of_char_at(pos: Int): Int do
 239                 var endpos = pos
 240                 var c = self[pos]
 241                 while c & 0xC0u8 == 0x80u8 do
 242                         pos -= 1
 243                         c = self[pos]
 244                 end
 245                 var stpos = pos
 246                 if length_of_char_at(stpos) >= (endpos - stpos + 1) then return pos
 247                 return endpos
 248         end
 249
 250         # Number of UTF-8 characters in `self` between positions `from` and `to`
 251         fun utf8_length(from, to: Int): Int do
 252                 var st = from
 253                 var lst = to
 254                 var ln = 0
 255                 while st <= lst do
 256                         st += length_of_char_at(st)
 257                         ln += 1
 258                 end
 259                 return ln
 260         end
 261
        # Fetch 4 chars in `self` at `pos`
        #
        # The 4 bytes are returned packed in a single Int.
        # The body is a fallback that delegates to `fetch_4_ffi`, which reads
        # them as one 32-bit word without any byte-order conversion.
        fun fetch_4_chars(pos: Int): Int is intern do return fetch_4_ffi(pos)

        # Fetch 4 chars in `self` at `pos`
        #
        # Same as `fetch_4_chars`, except that the fallback `fetch_4h_ffi` converts
        # the word from big-endian to host order, so that the byte at `pos` ends up
        # in the most significant position of the 32-bit value.
        fun fetch_4_hchars(pos: Int): Int is intern do return fetch_4h_ffi(pos)
 267
 268         # FIXME: To remove when bootstrap supports PR #1898
 269         private fun fetch_4_ffi(pos: Int): Int `{ return (long)*((uint32_t*)(self+pos)); `}
 270         private fun fetch_4h_ffi(pos: Int): Int `{ return (long)be32toh(*((uint32_t*)(self+pos))); `}
 271 end