lib/string_experimentations/utf8.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 #     http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14
  15 # Introduces UTF-8 as internal encoding for Strings in Nit.
  16 module utf8
  17
  18 intrude import standard::string
  19 intrude import standard::file
  20
  21 in "C Header" `{
  22
  23 #include <stdio.h>
  24 #include <string.h>
  25 #include <stdint.h>
  26
  27 typedef struct { /* One UTF-8 character, seen as a position inside a C string */
  28         long pos; /* Byte offset in ns of the first byte of the character */
  29         char* ns; /* C string that holds the bytes of the character */
  30 } UTF8Char;
  31
  32 `}
  33
  34 # UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes
  35 #
  36 # A UTF-8 char has its bytes stored in a NativeString (char*)
  37 extern class UnicodeChar `{ UTF8Char* `}
  38
  39         new(pos: Int, ns: NativeString) `{
  40                 UTF8Char* fresh = malloc(sizeof *fresh); // storage for the new char
  41                 fresh->ns = ns;   // the buffer is shared with the caller, not copied
  42                 fresh->pos = pos; // byte offset of the char inside ns
  43                 return fresh;
  44         `}
  45
  46         # Length in bytes of the char in UTF8, deduced from its first byte
  47         #
  48         # As per the specification (any other first byte is invalid and counts for 1) :
  49         #
  50         # ~~~raw
  51         #  Length  |        UTF-8 octet sequence
  52         #          |              (binary)
  53         # ---------+-------------------------------------------------
  54         #  1       | 0xxxxxxx
  55         #  2       | 110xxxxx 10xxxxxx
  56         #  3       | 1110xxxx 10xxxxxx 10xxxxxx
  57         #  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  58         # ~~~
  59         private fun len: Int `{
  60                 char* ns = recv->ns;
  61                 int pos = recv->pos;
  62                 char nspos = ns[pos];
  63                 if((nspos & 0x80) == 0x00){ return 1;}
  64                 if((nspos & 0xE0) == 0xC0){ return 2;}
  65                 if((nspos & 0xF0) == 0xE0){ return 3;}
  66                 if((nspos & 0xF8) == 0xF0){ return 4;} // the 5 high bits must be 11110
  67                 // Invalid first byte (e.g. a stray 10xxxxxx): skipped as a single byte
  68                 return 1;
  69         `}
  70
  71         # Byte offset of the first byte of the char in its containing NativeString
  72         private fun pos: Int `{
  73                 return recv->pos;
  74         `}
  75
  76         private fun pos=(p: Int) `{recv->pos = p;`} # Setter matching `pos`
  77
  78         # NativeString (C char*) that holds the bytes of the char
  79         fun ns: NativeString `{
  80                 return recv->ns;
  81         `}
  82
  83         # Returns the Unicode code point encoded by the bytes of the character
  84         #
  85         # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
  86         fun code_point: Int import UnicodeChar.len `{
  87                 // Bytes of the sequence, viewed as unsigned so masks never see a negative value
  88                 unsigned char* b = (unsigned char*)(recv->ns + recv->pos);
  89                 switch(UnicodeChar_len(recv)){
  90                         case 2:
  91                                 return ((0x1F & b[0]) << 6) | (0x3F & b[1]);
  92                         case 3:
  93                                 return ((0x0F & b[0]) << 12) | ((0x3F & b[1]) << 6) | (0x3F & b[2]);
  94                         case 4:
  95                                 return ((0x07 & b[0]) << 18) | ((0x3F & b[1]) << 12) |
  96                                 ((0x3F & b[2]) << 6) | (0x3F & b[3]);
  97                         default:
  98                                 // Length 1 : the low 7 bits of the byte are the code.
  99                                 // Being the default, this branch makes every path return a value.
 100                                 return (long)(0x7F & b[0]);
 101                 }
 102         `}
 103
 104         # Upper-case counterpart of self
 105         #
 106         # NOTE : Only ASCII letters are converted, any other char is returned as is
 107         # TODO : Support unicode for to_upper
 108         fun to_upper: UnicodeChar import UnicodeChar.code_point `{
 109                 int code = UnicodeChar_code_point(recv);
 110                 // Codes 97 to 122 are 'a' to 'z' in ASCII, nothing else is converted
 111                 if(code < 97 || code > 122){ return recv; }
 112                 UTF8Char* upper = malloc(sizeof(UTF8Char));
 113                 char* buf = malloc(2);
 114                 buf[0] = recv->ns[recv->pos] - 32;
 115                 buf[1] = '\0';
 116                 upper->pos = 0;
 117                 upper->ns = buf;
 118                 return upper;
 119         `}
 120
 121         # Lower-case counterpart of self
 122         #
 123         # NOTE : Only ASCII letters are converted, any other char is returned as is
 124         # TODO : Support unicode for to_lower
 125         fun to_lower: UnicodeChar import UnicodeChar.code_point `{
 126                 int code = UnicodeChar_code_point(recv);
 127                 // Codes 65 to 90 are 'A' to 'Z' in ASCII, nothing else is converted
 128                 if(code < 65 || code > 90){ return recv; }
 129                 UTF8Char* lower = malloc(sizeof(UTF8Char));
 130                 char* buf = malloc(2);
 131                 buf[0] = recv->ns[recv->pos] + 32;
 132                 buf[1] = '\0';
 133                 lower->pos = 0;
 134                 lower->ns = buf;
 135                 return lower;
 136         `}
 137
 138         # Equality against a `Char` or another `UnicodeChar`
 139         #
 140         # Any other kind of object is never equal to self.
 141         redef fun ==(o)
 142         do
 143                 # A Char can only match a 1-byte char holding the same code
 144                 if o isa Char then return len == 1 and code_point == o.ascii
 145                 # UnicodeChars are compared on byte length first, code point second
 146                 if o isa UnicodeChar then return len == o.len and code_point == o.code_point
 147                 return false
 148         end
 149
 150         redef fun output import UnicodeChar.len `{ // Prints the bytes of the char on stdout
 151                 switch(UnicodeChar_len(recv)){ // the callback used here is len, hence the import above
 152                         case 1:
 153                                 printf("%c", recv->ns[recv->pos]);
 154                                 break;
 155                         case 2:
 156                                 printf("%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1]);
 157                                 break;
 158                         case 3:
 159                                 printf("%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2]);
 160                                 break;
 161                         case 4:
 162                                 printf("%c%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2], recv->ns[recv->pos + 3]);
 163                                 break;
 164                 }
 165         `}
 166
 167         redef fun to_s import NativeString.to_s_with_length `{
 168                 // The string is built on a NUL-terminated copy of the bytes of the char
 169                 int nbytes = utf8___UnicodeChar_len___impl(recv);
 170                 char* copy = malloc(nbytes + 1);
 171                 memcpy(copy, recv->ns + recv->pos, nbytes);
 172                 copy[nbytes] = '\0';
 173                 return NativeString_to_s_with_length(copy, nbytes);
 174         `}
 175 end
 176
 177 # A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
 178 #
 179 # It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
 180 private extern class StringIndex `{ UTF8Char* `}
 181         # Allocates an index with room for `size` characters
 182         new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}
 183
 184         # Sets the character at `index` as `item` (the position and buffer of `item` are copied)
 185         fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}
 186
 187         # Gets the character at position `id` (a reference into the index, not a copy)
 188         fun [](id: Int): UnicodeChar `{ return &recv[id]; `}
 189
 190         # Copies `length` characters of self, starting at index `my_from`, into `other`, starting at index `its_from`
 191         fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
 192                 UTF8Char* myfrom = recv + my_from; // pointer arithmetic already counts in UTF8Char
 193                 UTF8Char* itsfrom = other + its_from;
 194                 memcpy(itsfrom, myfrom, length * sizeof(UTF8Char)); // memcpy counts in bytes
 195         `}
 196 end
 197
 198 redef class FlatString
 199
 200         # Position of every character of the FlatString in its bytes
 201         private var index: StringIndex
 202
 203         # Size of the string counted in bytes (e.g. the length of the C string)
 204         var bytelen: Int
 205
 206         private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int)
 207         do
 208                 self.index = index             # character index over `items`
 209                 self.index_from = index_from   # first slot of `index` used by self
 210                 self.index_to = index_to       # last slot of `index` used by self
 211                 self.items = items
 212                 self.length = len              # counted in characters
 213                 self.bytelen = bytelen         # counted in bytes
 214         end
 215
 216         redef fun to_cstring
 217         do
 218                 if real_items != null then return real_items.as(not null) # copy already made by a previous call
 219                 var new_items = new NativeString(bytelen + 1)
 220                 if bytelen > 0 then self.items.copy_to(new_items, bytelen, index[index_from].pos, 0) # an empty string has no index entry to read
 221                 new_items[bytelen] = '\0'
 222                 self.real_items = new_items
 223                 return new_items
 224         end
 225
 226         redef fun substring(from, count)
 227         do
 228                 assert count >= 0
 229                 # A negative `from` shortens the request by as many chars and restarts it at 0
 230                 if from < 0 then
 231                         count += from
 232                         if count < 0 then count = 0
 233                         from = 0
 234                 end
 235
 236                 # Nothing to extract : empty request, or start after the last char
 237                 if count == 0 or from >= length then return empty
 238
 239                 var real_from = index_from + from
 240                 var real_to = real_from + count - 1
 241                 if real_to > index_to then real_to = index_to
 242
 243                 # From the first byte of the first char to the last byte of the last char
 244                 var sub_bytelen = (index[real_to].pos - index[real_from].pos) + index[real_to].len
 245                 # The length is recomputed since `real_to` may have been clamped
 246                 return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen)
 247         end
 247
 248         redef fun reversed
 249         do
 250                 var native = new NativeString(self.bytelen + 1)
 251                 native[bytelen] = '\0'
 252                 var length = self.length
 253                 var index = self.index
 254                 var new_index = new StringIndex(length)
 255                 # Bytes are laid out from the end of `native`, index slots from the last one
 256                 var ipos = bytelen
 257                 var pos_index = length - 1
 258                 var i = index_from
 259                 while i <= index_to do
 260                         var uchar = index[i]
 261                         var uchar_len = uchar.len
 262                         ipos -= uchar_len
 263                         new_index[pos_index] = new UnicodeChar(ipos, native)
 264                         pos_index -= 1
 265                         items.copy_to(native, uchar_len, uchar.pos, ipos)
 266                         i += 1
 267                 end
 268                 return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen)
 269         end
 270
 271         redef fun *(i)
 272         do
 273                 assert i >= 0
 274
 275                 var mylen = self.bytelen
 276                 var finlen = mylen * i
 277                 var my_real_len = length
 278                 var my_real_fin_len = my_real_len * i
 279
 280                 var my_items = self.items
 281                 var my_index = index
 282                 # Byte offset in `items` where the bytes of self begin (no index entry when empty)
 283                 var base = 0
 284                 if my_real_len > 0 then base = my_index[index_from].pos
 285
 286                 var target_string = new NativeString((finlen) + 1)
 287                 var new_index = new StringIndex(my_real_fin_len)
 288                 target_string[finlen] = '\0'
 289                 var current_last = 0
 290                 var curr_index = 0
 291                 for iteration in [1 .. i] do
 292                         my_items.copy_to(target_string, mylen, base, current_last)
 293                         # Every repetition gets its own index entries, rebased on `target_string`
 294                         for j in [index_from .. index_to] do
 295                                 new_index[curr_index] = new UnicodeChar(current_last + my_index[j].pos - base, target_string)
 296                                 curr_index += 1
 297                         end
 298                         current_last += mylen
 299                 end
 300                 return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen)
 301         end
 302
 303         redef fun to_upper
 304         do
 305                 var outstr = new NativeString(self.bytelen + 1)
 306
 307                 var out_index = 0
 308                 var index = self.index
 309                 # Only the chars of self are visited : slots index_from to index_to of the index
 310                 var ipos = index_from
 311                 var max = index_to
 312
 313                 while ipos <= max do
 314                         var u = index[ipos].to_upper
 315                         u.ns.copy_to(outstr, u.len, u.pos, out_index)
 316                         out_index += u.len
 317                         ipos += 1
 318                 end
 319
 320                 outstr[self.bytelen] = '\0'
 321
 322                 return outstr.to_s_with_length(self.bytelen)
 323         end
 324
 325         redef fun to_lower
 326         do
 327                 var outstr = new NativeString(self.bytelen + 1)
 328
 329                 var out_index = 0
 330                 var index = self.index
 331                 # Only the chars of self are visited : slots index_from to index_to of the index
 332                 var ipos = index_from
 333                 var max = index_to
 334
 335                 while ipos <= max do
 336                         var u = index[ipos].to_lower
 337                         u.ns.copy_to(outstr, u.len, u.pos, out_index)
 338                         out_index += u.len
 339                         ipos += 1
 340                 end
 341
 342                 outstr[self.bytelen] = '\0'
 343
 344                 return outstr.to_s_with_length(self.bytelen)
 345         end
 346
 347         redef fun output
 348         do
 349                 # The chars of self are printed one by one,
 350                 # from slot `index_from` to slot `index_to` of the index,
 351                 # each one through the `output` of `UnicodeChar`
 352                 var first = self.index_from
 353                 var last = self.index_to
 354                 for i in [first .. last] do index[i].output
 355         end
 356
 357 end
 358
 359 redef class FlatBuffer
 360
 361         # Fix for this particular implementation
 362         #
 363         # Since the to_s of a FlatBuffer now builds using
 364         # the old String constructor, this breaks everything.
 365         #
 366         # This will disappear when UTF8 is fully-supported
 367         redef fun to_s do
 368                 written = false
 369                 return to_cstring.to_s_with_length(length) # NOTE(review): `length` is used as a byte count here, confirm it is one for buffers
 370         end
 371 end
 372
 373 redef class NativeString
 374
 375         # Creates the index of the characters found in said NativeString
 376         # `length` is the size of the CString (in bytes, up to the first \0)
 377         # `real_len` receives the number of UTF-8 characters that were indexed
 378         private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UnicodeChar.len `{
 379                 int pos = 0; // byte offset of the char being indexed
 380                 int index_pos = 0; // number of chars indexed so far
 381                 UTF8Char* index = malloc(length*sizeof(UTF8Char)); // one slot per byte, enough for any content
 382                 while(pos < length){
 383                         UTF8Char* curr = &index[index_pos];
 384                         curr->pos = pos;
 385                         curr->ns = recv;
 386                         pos += UnicodeChar_len(curr); // jump over the 1 to 4 bytes of this char
 387                         index_pos ++;
 388                 }
 389                 Container_of_Int_item__assign(real_len, index_pos); // hands the char count back to the caller
 390                 return index;
 391         `}
 392
 393         redef fun to_s: FlatString
 394         do
 395                 # Same as `to_s_with_length`, fed with the length reported by `cstring_length`
 396                 return to_s_with_length(cstring_length)
 397         end
 398
 399         redef fun to_s_with_length(len: Int): FlatString
 400         do
 401                 var char_count = new Container[Int](0) # set by `make_index` to the number of chars
 402                 var char_index = make_index(len, char_count)
 403                 return new FlatString.with_infos_index(self, char_count.item, 0, char_count.item - 1, char_index, len)
 404         end
 405
 406         redef fun to_s_with_copy
 407         do
 408                 var real_len = new Container[Int](0)
 409                 var length = cstring_length
 410                 var new_self = new NativeString(length + 1)
 411                 copy_to(new_self, length, 0, 0)
 412                 var x = new_self.make_index(length, real_len) # the index must point into the copy, not into self
 413                 return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length)
 414         end
 415 end
 416
 417 redef class OFStream
 418         redef fun write(s)
 419         do
 420                 assert is_writable
 421                 # `write_native` is given a size in bytes : a FlatString knows it as
 422                 # `bytelen`, its `length` being a number of chars
 423                 if s isa FlatString then
 424                         write_native(s.to_cstring, s.bytelen)
 425                 else if s isa FlatText then
 426                         write_native(s.to_cstring, s.length)
 427                 else
 428                         for i in s.substrings do
 429                                 if i isa FlatString then write_native(i.to_cstring, i.bytelen) else write_native(i.to_cstring, i.length)
 430                         end
 431                 end
 432         end
 433 end