lib/core/text/native.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # This file is free software, which comes along with NIT.  This software is
   4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
   5 # without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
   6 # PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
   7 # is kept unaltered, and a notification of the changes is added.
   8 # You  are  allowed  to  redistribute it and sell it, alone or is a part of
   9 # another product.
  10
  11 # Native structures for text and bytes
  12 module native
  13
  14 import kernel
  15 import math
  16
  17 redef class Byte
  18         # Byte length of the UTF-8 sequence whose lead byte is `self`.
  19         #
  20         # Only `self` is looked at; continuation bytes (`10xxxxxx`) and
  21         # invalid lead bytes (`11111xxx`) count as a 1-byte sequence.
  22         fun u8len: Int do
  23                 # 0xxxxxxx: single byte (ASCII)
  24                 if self & 0x80u8 == 0x00u8 then return 1
  25                 # 110xxxxx, 1110xxxx, 11110xxx: lead byte of 2, 3 or 4 bytes
  26                 if self & 0xE0u8 == 0xC0u8 then return 2
  27                 if self & 0xF0u8 == 0xE0u8 then return 3
  28                 if self & 0xF8u8 == 0xF0u8 then return 4
  29                 # Anything else cannot start a sequence
  30                 return 1
  31         end
  32 end
  33
  34 redef class Int
  35         # Decodes the UTF-16 surrogate pair packed in `self` into a code point.
  36         #
  37         # The high surrogate is read from bits 16 to 31 of `self` and
  38         # the low surrogate from bits 0 to 15.
  39         #
  40         #     assert 0xD83DDE02.from_utf16_surr == 0x1F602
  41         fun from_utf16_surr: Int do
  42                 var high = (self & 0xFFFF0000) >> 16
  43                 var low = self & 0xFFFF
  44                 # Each surrogate carries 10 bits of the code point offset
  45                 return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
  46         end
  47 end
  48
  49 # Native strings are simple C char *
  50 extern class NativeString `{ char* `}
  51         # Creates a new NativeString with a capacity of `length`
  52         new(length: Int) is intern
  53
  54         # Returns a char* starting at `index`.
  55         #
  56         # WARNING: Unsafe for extern code, use only for temporary
  57         # pointer manipulation purposes (e.g. write to file or such)
  58         fun fast_cstring(index: Int): NativeString is intern
  59
  60         # Get char at `index`.
  61         fun [](index: Int): Byte is intern
  62
  63         # Set char `item` at index.
  64         fun []=(index: Int, item: Byte) is intern
  65
  66         # Copy `self` to `dest`.
  67         fun copy_to(dest: NativeString, length: Int, from: Int, to: Int) is intern
  68
  69         # Position of the first nul byte, that is the length in bytes of
  70         # `self` seen as a nul-terminated C string.
  71         fun cstring_length: Int do
  72                 var l = 0
  73                 while self[l] != 0u8 do l += 1
  74                 return l
  75         end
  76
  77         # Parse `self` as an Int.
  78         fun atoi: Int is intern
  79
  80         # Parse `self` as a Float.
  81         fun atof: Float `{ return atof(self); `}
  82
  83         # Gets the UTF-8 char starting at byte `pos`
  84         #
  85         # `pos` is expressed in bytes, not in Unicode chars
  86         #
  87         # ~~~raw
  88         #     assert "かきく".as(FlatString).items.char_at(0) == 'か'
  89         # ~~~
  90         #
  91         # If the bytes at position `pos` do not form a valid UTF-8 sequence,
  92         # the Unicode replacement character � (0xFFFD) will be used.
  93         #
  94         # ~~~raw
  95         #     assert "かきく".as(FlatString).items.char_at(1) == '�'
  96         # ~~~
  97         fun char_at(pos: Int): Char `{
  98                 char c = self[pos];
  99                 if((c & 0x80) == 0x00) return (uint32_t)c;
 100                 if(((c & 0xE0) == 0xC0) && ((self[pos + 1] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x1F) << 6) + ((((uint32_t)self[pos + 1] & 0x3F)));
 101                 if(((c & 0xF0) == 0xE0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0xF) << 12) + ((((uint32_t)self[pos + 1]) & 0x3F) << 6) + ((((uint32_t)self[pos + 2] & 0x3F)));
 102                 if(((c & 0xF8) == 0xF0) && ((self[pos + 1] & 0xC0) == 0x80) && ((self[pos + 2] & 0xC0) == 0x80) && ((self[pos + 3] & 0xC0) == 0x80)) return ((((uint32_t)c) & 0x7) << 18) + ((((uint32_t)self[pos + 1]) & 0x3F) << 12) + ((((uint32_t)self[pos + 2]) & 0x3F) << 6) + ((((uint32_t)self[pos + 3] & 0x3F)));
 103                 return 0xFFFD;
 104         `}
 105
 106         # Gets the byte index of char at position `n` in UTF-8 String
 107         fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0)
 108
 109         # Gets the length in bytes of the character at byte `pos` (1 if invalid sequence)
 110         fun length_of_char_at(pos: Int): Int do
 111                 var c = self[pos]
 112                 if c & 0x80u8 == 0x00u8 then
 113                         return 1
 114                 else if c & 0xE0u8 == 0xC0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 then
 115                         return 2
 116                 else if c & 0xF0u8 == 0xE0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 then
 117                         return 3
 118                 else if c & 0xF8u8 == 0xF0u8 and self[pos + 1] & 0xC0u8 == 0x80u8 and self[pos + 2] & 0xC0u8 == 0x80u8 and self[pos + 3] & 0xC0u8 == 0x80u8 then
 119                         return 4
 120                 else
 121                         return 1
 122                 end
 123         end
 124
 125         # Gets the byte index of char at position `n` in UTF-8 String
 126         #
 127         # `char_from` and `byte_from` are cached values to seek from.
 128         #
 129         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 130         # It is up to the client to ensure the validity of the information
 131         fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do
 132                 var ns_i = byte_from
 133                 var my_i = char_from
 134
 135                 while my_i < n do
 136                         ns_i += length_of_char_at(ns_i)
 137                         my_i += 1
 138                 end
 139
 140                 while my_i > n do
 141                         ns_i = find_beginning_of_char_at(ns_i - 1)
 142                         my_i -= 1
 143                 end
 144
 145                 return ns_i
 146         end
 147
 148         # Gets the char index of byte at position `n` in a UTF-8 String
 149         #
 150         # `char_from` and `byte_from` are cached values to seek from.
 151         #
 152         # NOTE: char_from and byte_from are not guaranteed to be valid cache values
 153         # It is up to the client to ensure the validity of the information
 154         fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do
 155                 var ns_i = byte_from
 156                 var my_i = char_from
 157
 158                 while ns_i < n do
 159                         ns_i += length_of_char_at(ns_i)
 160                         my_i += 1
 161                 end
 162
 163                 while ns_i > n do
 164                         ns_i = find_beginning_of_char_at(ns_i - 1)
 165                         my_i -= 1
 166                 end
 167
 168                 return my_i
 169         end
 170
 171         # Returns the byte position where the char containing byte `pos` begins
 172         #
 173         # If the char is invalid UTF-8, `pos` is returned as-is
 174         #
 175         # ~~~raw
 176         #       assert "abc".items.find_beginning_of_char_at(2) == 2
 177         #       assert "か".items.find_beginning_of_char_at(1) == 0
 178         #       assert [0x41u8, 233u8].to_s.items.find_beginning_of_char_at(1) == 1
 179         # ~~~
 180         fun find_beginning_of_char_at(pos: Int): Int do
 181                 var endpos = pos
 182                 var c = self[pos]
 183                 # Skip back over continuation bytes, never before the start of `self`
 184                 while pos > 0 and c & 0xC0u8 == 0x80u8 do
 185                         pos -= 1
 186                         c = self[pos]
 187                 end
 188                 if length_of_char_at(pos) >= (endpos - pos + 1) then return pos
 189                 return endpos
 190         end
 191
 192         # Number of UTF-8 characters in `self` between byte positions `from` and `to` (inclusive)
 193         fun utf8_length(from, to: Int): Int do
 194                 var st = from
 195                 var lst = to
 196                 var ln = 0
 197                 while st <= lst do
 198                         st += length_of_char_at(st)
 199                         ln += 1
 200                 end
 201                 return ln
 202         end
 203 end