# This file is part of NIT ( http://www.nitlanguage.org ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Introduces UTF-8 as internal encoding for Strings in Nit. module utf8 intrude import standard::string intrude import standard::file in "C Header" `{ #include #include #include typedef struct { long pos; char* ns; } UTF8Char; `} # UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes # # A UTF-8 char has its bytes stored in a NativeString (char*) extern class UTF8Char `{ UTF8Char* `} new(pos: Int, ns: NativeString) `{ UTF8Char* u = malloc(sizeof(UTF8Char)); u->pos = pos; u->ns = ns; return u; `} # Real length of the char in UTF8 # # As per the specification : # # ~~~raw # Length | UTF-8 octet sequence # | (binary) # ---------+------------------------------------------------- # 1 | 0xxxxxxx # 2 | 110xxxxx 10xxxxxx # 3 | 1110xxxx 10xxxxxx 10xxxxxx # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx # ~~~ private fun len: Int `{ char* ns = recv->ns; int pos = recv->pos; char nspos = ns[pos]; if((nspos & 0x80) == 0x00){ return 1;} if((nspos & 0xE0) == 0xC0){ return 2;} if((nspos & 0xF0) == 0xE0){ return 3;} if((nspos & 0xF7) == 0xF0){ return 4;} // Invalid character return 1; `} # Position in containing NativeString private fun pos: Int `{ return recv->pos; `} private fun pos=(p: Int) `{recv->pos = p;`} # C char* wrapping the char fun ns: NativeString `{ return recv->ns; `} # Returns the Unicode code point representing the character # # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence fun code_point: Int import UTF8Char.len `{ switch(UTF8Char_len(recv)){ case 1: return (long)(0x7F & (unsigned char)recv->ns[recv->pos]); case 2: return 0 | ((0x1F & (unsigned char)recv->ns[recv->pos]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+1]); case 3: return 0 | ((0x0F & (unsigned char)recv->ns[recv->pos]) << 12) | ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+2]); case 4: return 0 | ((0x07 & (unsigned char)recv->ns[recv->pos]) << 18) | ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 12) | ((0x3F & (unsigned char)recv->ns[recv->pos+2]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+3]); } `} # Returns an upper-case version of self # # NOTE : Works only on ASCII chars # TODO : Support unicode for to_upper fun to_upper: UTF8Char import UTF8Char.code_point `{ int cp = UTF8Char_code_point(recv); if(cp < 97 || cp > 122){ return recv; } char* ns = malloc(2); ns[1] = '\0'; char c = recv->ns[recv->pos]; ns[0] = c - 32; UTF8Char* ret = malloc(sizeof(UTF8Char)); ret->ns = ns; ret->pos = 0; return ret; `} # Returns an lower-case version of self # # NOTE : Works only on ASCII chars # TODO : Support unicode for to_upper fun to_lower: UTF8Char import UTF8Char.code_point `{ int cp = UTF8Char_code_point(recv); if(cp < 65 || cp > 90){ return recv; } char* ns = malloc(2); ns[1] = '\0'; char c = recv->ns[recv->pos]; ns[0] = c + 32; UTF8Char* ret = malloc(sizeof(UTF8Char)); ret->ns = ns; ret->pos = 0; return ret; `} redef fun ==(o) do if o isa Char then if len != 1 then return false if code_point == o.ascii then return true else if o isa UTF8Char then if len != o.len then return false if code_point == o.code_point then return true end return false end redef fun output import UTF8Char.code_point `{ switch(UTF8Char_len(recv)){ case 1: printf("%c", recv->ns[recv->pos]); break; case 2: printf("%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1]); break; case 3: printf("%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2]); break; case 4: printf("%c%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2], recv->ns[recv->pos + 3]); break; } `} redef fun to_s import NativeString.to_s_with_length `{ int len = utf8___UTF8Char_len___impl(recv); char* r = malloc(len + 1); r[len] = '\0'; char* src = (recv->ns + recv->pos); memcpy(r, src, len); return NativeString_to_s_with_length(r, len); `} end # A `StringIndex` is used to keep track of the position of characters in a `FlatString` object # # It becomes mandatory for UTF-8 strings since characters do not have a fixed size. private extern class StringIndex `{ UTF8Char* `} new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `} # Sets the character at `index` as `item` fun []=(index: Int, item: UTF8Char) `{ recv[index] = *item; `} # Gets the character at position `id` fun [](id: Int): UTF8Char `{ return &recv[id]; `} # Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from` fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{ UTF8Char* myfrom = recv + my_from*(sizeof(UTF8Char)); UTF8Char* itsfrom = other + its_from*(sizeof(UTF8Char)); memcpy(itsfrom, myfrom, length); `} end redef class FlatString # Index of the characters of the FlatString private var index: StringIndex # Length in bytes of the string (e.g. the length of the C string) var bytelen: Int private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int) do self.items = items length = len self.index_from = index_from self.index_to = index_to self.index = index self.bytelen = bytelen end redef fun to_cstring do if real_items != null then return real_items.as(not null) var new_items = new NativeString(bytelen + 1) self.items.copy_to(new_items, bytelen, index[index_from].pos, 0) new_items[bytelen] = '\0' self.real_items = new_items return new_items end redef fun substring(from, count) do assert count >= 0 if from < 0 then count += from if count < 0 then count = 0 from = 0 end if count == 0 then return empty var real_from = index_from + from var real_to = real_from + count - 1 if real_to > index_to then real_to = index_to var sub_bytelen = (index[real_to].pos - index[from].pos) + index[from].len return new FlatString.with_infos_index(items, count, real_from, real_to, index, sub_bytelen) end redef fun reversed do var native = new NativeString(self.bytelen + 1) var length = self.length var index = self.index var pos = 0 var i = 0 var ipos = bytelen var new_index = new StringIndex(length) var pos_index = length while i < length do var uchar = index[i] var uchar_len = uchar.len ipos -= uchar_len new_index[pos_index] = new UTF8Char(ipos, native) pos_index -= 1 items.copy_to(native, uchar_len, pos, ipos) pos += uchar_len i += 1 end return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen) end redef fun *(i) do assert i >= 0 var mylen = self.bytelen var finlen = mylen * i var my_items = self.items var my_real_len = length var my_real_fin_len = my_real_len * i var target_string = new NativeString((finlen) + 1) var my_index = index var new_index = new StringIndex(my_real_fin_len) target_string[finlen] = '\0' var current_last = 0 var curr_index = 0 for iteration in [1 .. i] do my_items.copy_to(target_string, mylen, index_from, current_last) my_index.copy_to(new_index, length, 0, curr_index) current_last += mylen end return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen) end redef fun to_upper do var outstr = new NativeString(self.bytelen + 1) var out_index = 0 var index = self.index var ipos = 0 var max = length while ipos < max do var u = index[ipos].to_upper u.ns.copy_to(outstr, u.len, u.pos, out_index) out_index += u.len ipos += 1 end outstr[self.bytelen] = '\0' return outstr.to_s_with_length(self.bytelen) end redef fun to_lower do var outstr = new NativeString(self.bytelen + 1) var out_index = 0 var index = self.index var ipos = 0 var max = length while ipos < max do var u = index[ipos].to_lower u.ns.copy_to(outstr, u.len, u.pos, out_index) out_index += u.len ipos += 1 end outstr[self.bytelen] = '\0' return outstr.to_s_with_length(self.bytelen) end redef fun output do var i = self.index_from var imax = self.index_to while i <= imax do index[i].output i += 1 end end end redef class FlatBuffer # Fix for this particular implementation # # Since the to_s of a FlatBuffer now builds using # the old String contructor, this breaks everything. # # This will disappear when UTF8 is fully-supported redef fun to_s do written = false return to_cstring.to_s_with_length(length) end end redef class NativeString # Creates the index for said NativeString # `length` is the size of the CString (in bytes, up to the first \0) # real_len is just a way to store the length (UTF-8 characters) private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UTF8Char.len `{ int pos = 0; int index_pos = 0; UTF8Char* index = malloc(length*sizeof(UTF8Char)); while(pos < length){ UTF8Char* curr = &index[index_pos]; curr->pos = pos; curr->ns = recv; pos += UTF8Char_len(curr); index_pos ++; } Container_of_Int_item__assign(real_len, index_pos); return index; `} redef fun to_s: FlatString do var len = cstring_length return to_s_with_length(len) end redef fun to_s_with_length(len: Int): FlatString do var real_len = new Container[Int](0) var x = make_index(len, real_len) return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len) end redef fun to_s_with_copy do var real_len = new Container[Int](0) var length = cstring_length var x = make_index(length, real_len) var new_self = new NativeString(length + 1) copy_to(new_self, length, 0, 0) return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length) end end redef class FileWriter redef fun write(s) do assert is_writable if s isa FlatText then if s isa FlatString then write_native(s.to_cstring, s.bytelen) else write_native(s.to_cstring, s.length) end else for i in s.substrings do write_native(i.to_cstring, i.length) end end