X-Git-Url: http://nitlanguage.org

diff --git a/lib/string_experimentations/utf8.nit b/lib/string_experimentations/utf8.nit
index b124889..b7a74ce 100644
--- a/lib/string_experimentations/utf8.nit
+++ b/lib/string_experimentations/utf8.nit
@@ -16,6 +16,7 @@
 module utf8
 
 intrude import standard::string
+intrude import standard::file
 
 in "C Header" `{
 
@@ -77,6 +78,101 @@ extern class UnicodeChar `{ UTF8Char* `}
 		return recv->ns;
 	`}
 
+	# Returns the Unicode code point representing the character
+	#
+	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
+	fun code_point: Int import UnicodeChar.len `{
+		// The lead byte holds the high bits of the code point and every
+		// following byte contributes its 6 low bits.
+		// NOTE(review): nothing is returned for a length outside 1..4 -- confirm `len` cannot yield one.
+		switch(UnicodeChar_len(recv)){
+			case 1:
+				return (long)(0x7F & (unsigned char)recv->ns[recv->pos]);
+			case 2:
+				return 0 | ((0x1F & (unsigned char)recv->ns[recv->pos]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+1]);
+			case 3:
+				return 0 | ((0x0F & (unsigned char)recv->ns[recv->pos]) << 12) |
+					((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 6) |
+					(0x3F & (unsigned char)recv->ns[recv->pos+2]);
+			case 4:
+				return 0 | ((0x07 & (unsigned char)recv->ns[recv->pos]) << 18) |
+					((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 12) |
+					((0x3F & (unsigned char)recv->ns[recv->pos+2]) << 6) |
+					(0x3F & (unsigned char)recv->ns[recv->pos+3]);
+		}
+	`}
+
+	# Returns an upper-case version of self
+	#
+	# NOTE : Works only on ASCII chars
+	# TODO : Support unicode for to_upper
+	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
+		int cp = UnicodeChar_code_point(recv);
+		// Only 'a' (97) to 'z' (122) are converted, anything else is returned as is
+		if(cp < 97 || cp > 122){ return recv; }
+		// The result lives in a fresh one-byte, NUL-terminated string
+		char* ns = malloc(2);
+		ns[1] = '\0';
+		char c = recv->ns[recv->pos];
+		ns[0] = c - 32;
+		UTF8Char* ret = malloc(sizeof(UTF8Char));
+		ret->ns = ns;
+		ret->pos = 0;
+		return ret;
+	`}
+
+	# Returns a lower-case version of self
+	#
+	# NOTE : Works only on ASCII chars
+	# TODO : Support unicode for to_lower
+	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
+		int cp = UnicodeChar_code_point(recv);
+		// Only 'A' (65) to 'Z' (90) are converted, anything else is returned as is
+		if(cp < 65 || cp > 90){ return recv; }
+		// The result lives in a fresh one-byte, NUL-terminated string
+		char* ns = malloc(2);
+		ns[1] = '\0';
+		char c = recv->ns[recv->pos];
+		ns[0] = c + 32;
+		UTF8Char* ret = malloc(sizeof(UTF8Char));
+		ret->ns = ns;
+		ret->pos = 0;
+		return ret;
+	`}
+
+	# Two characters are equal when they have the same byte length and the same code point
+	#
+	# A `Char` is compared through its `ascii` value and can only equal a one-byte character.
+	redef fun ==(o)
+	do
+		if o isa Char then
+			if len != 1 then return false
+			if code_point == o.ascii then return true
+		else if o isa UnicodeChar then
+			if len != o.len then return false
+			if code_point == o.code_point then return true
+		end
+		return false
+	end
+
+	# Prints the bytes of the character on the standard output
+	redef fun output import UnicodeChar.len `{
+		switch(UnicodeChar_len(recv)){
+			case 1:
+				printf("%c", recv->ns[recv->pos]);
+				break;
+			case 2:
+				printf("%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1]);
+				break;
+			case 3:
+				printf("%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2]);
+				break;
+			case 4:
+				printf("%c%c%c%c", recv->ns[recv->pos], recv->ns[recv->pos + 1], recv->ns[recv->pos + 2], recv->ns[recv->pos + 3]);
+				break;
+		}
+	`}
+
 	redef fun to_s import NativeString.to_s_with_length `{
 		int len = utf8___UnicodeChar_len___impl(recv);
 		char* r = malloc(len + 1);
@@ -126,6 +222,192 @@ redef class FlatString
 		self.bytelen = bytelen
 	end
 
+	# Returns a NUL-terminated copy of the bytes of `self`
+	#
+	# The copy is cached in `real_items` and reused by later calls.
+	redef fun to_cstring
+	do
+		if real_items != null then return real_items.as(not null)
+		var new_items = new NativeString(bytelen + 1)
+		self.items.copy_to(new_items, bytelen, index[index_from].pos, 0)
+		new_items[bytelen] = '\0'
+		self.real_items = new_items
+		return new_items
+	end
+
+	# Returns the substring of `count` characters starting at character `from`
+	#
+	# A negative `from` is clamped to 0 (shortening `count` accordingly) and the
+	# range is truncated at the last character of `self`.
+	redef fun substring(from, count)
+	do
+		assert count >= 0
+
+		if from < 0 then
+			count += from
+			if count < 0 then count = 0
+			from = 0
+		end
+
+		if count == 0 then return empty
+
+		var real_from = index_from + from
+		var real_to = real_from + count - 1
+
+		if real_to > index_to then real_to = index_to
+		# `from` lies past the last character: nothing to extract
+		if real_from > real_to then return empty
+
+		# Bytes from the start of the first character to the end of the last one
+		var sub_bytelen = (index[real_to].pos - index[real_from].pos) + index[real_to].len
+
+		return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen)
+	end
+
+	# Returns a copy of `self` with its characters in reverse order
+	#
+	# Multi-byte characters are moved as a whole: their bytes are not reversed.
+	redef fun reversed
+	do
+		var native = new NativeString(self.bytelen + 1)
+		var length = self.length
+		var index = self.index
+		var i = 0
+		var ipos = bytelen
+		var new_index = new StringIndex(length)
+		# Last valid slot of the new index, which is filled backwards
+		var pos_index = length - 1
+		while i < length do
+			# The characters of `self` start at `index_from` in the index
+			var uchar = index[index_from + i]
+			var uchar_len = uchar.len
+			ipos -= uchar_len
+			items.copy_to(native, uchar_len, uchar.pos, ipos)
+			new_index[pos_index] = new UnicodeChar(ipos, native)
+			pos_index -= 1
+			i += 1
+		end
+		native[bytelen] = '\0'
+		return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen)
+	end
+
+	# Returns `self` repeated `i` times
+	redef fun *(i)
+	do
+		assert i >= 0
+
+		var mylen = self.bytelen
+		var finlen = mylen * i
+
+		var my_items = self.items
+
+		var my_real_len = length
+		var my_real_fin_len = my_real_len * i
+
+		var target_string = new NativeString((finlen) + 1)
+
+		var my_index = index
+		var new_index = new StringIndex(my_real_fin_len)
+
+		target_string[finlen] = '\0'
+
+		var current_last = 0
+		var curr_index = 0
+
+		for iteration in [1 .. i] do
+			# Copy character by character so that every entry of the new index
+			# points into `target_string`, at the position of its own repetition
+			var k = index_from
+			while k <= index_to do
+				var uchar = my_index[k]
+				var uchar_len = uchar.len
+				my_items.copy_to(target_string, uchar_len, uchar.pos, current_last)
+				new_index[curr_index] = new UnicodeChar(current_last, target_string)
+				current_last += uchar_len
+				curr_index += 1
+				k += 1
+			end
+		end
+
+		return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen)
+
+	end
+
+	# Returns an upper-case copy of `self`
+	#
+	# Relies on `UnicodeChar::to_upper`, hence only ASCII letters are converted.
+	redef fun to_upper
+	do
+		var outstr = new NativeString(self.bytelen + 1)
+
+		var out_index = 0
+		var index = self.index
+		# Walk only the characters of `self`, from `index_from` to `index_to`
+		var ipos = index_from
+		var max = index_to
+
+		while ipos <= max do
+			var u = index[ipos].to_upper
+			u.ns.copy_to(outstr, u.len, u.pos, out_index)
+			out_index += u.len
+			ipos += 1
+		end
+
+		outstr[self.bytelen] = '\0'
+
+		return outstr.to_s_with_length(self.bytelen)
+	end
+
+	# Returns a lower-case copy of `self`
+	#
+	# Relies on `UnicodeChar::to_lower`, hence only ASCII letters are converted.
+	redef fun to_lower
+	do
+		var outstr = new NativeString(self.bytelen + 1)
+
+		var out_index = 0
+		var index = self.index
+		# Walk only the characters of `self`, from `index_from` to `index_to`
+		var ipos = index_from
+		var max = index_to
+
+		while ipos <= max do
+			var u = index[ipos].to_lower
+			u.ns.copy_to(outstr, u.len, u.pos, out_index)
+			out_index += u.len
+			ipos += 1
+		end
+
+		outstr[self.bytelen] = '\0'
+
+		return outstr.to_s_with_length(self.bytelen)
+	end
+
+	# Prints each character of `self`, from `index_from` to `index_to`
+	redef fun output
+	do
+		var i = self.index_from
+		var imax = self.index_to
+		while i <= imax do
+			index[i].output
+			i += 1
+		end
+	end
+
+end
+
+redef class FlatBuffer
+
+	# Fix for this particular implementation
+	#
+	# Since the to_s of a FlatBuffer now builds using
+	# the old String constructor, this breaks everything.
+	#
+	# This will disappear when UTF8 is fully-supported
+	redef fun to_s do
+		written = false
+		return to_cstring.to_s_with_length(length)
+	end
 end
 
 redef class NativeString
@@ -166,8 +448,32 @@ redef class NativeString
 		var real_len = new Container[Int](0)
 		var length = cstring_length
 		var x = make_index(length, real_len)
-		var new_self = calloc_string(length + 1)
+		var new_self = new NativeString(length + 1)
 		copy_to(new_self, length, 0, 0)
 		return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length)
 	end
 end
+
+redef class OFStream
+	# Writes `s` to the stream
+	#
+	# A `FlatString` is written with its `bytelen`, since its `length` counts
+	# characters rather than bytes; this also holds for the substrings of `s`.
+	redef fun write(s)
+	do
+		assert is_writable
+		if s isa FlatString then
+			write_native(s.to_cstring, s.bytelen)
+		else if s isa FlatText then
+			write_native(s.to_cstring, s.length)
+		else
+			for i in s.substrings do
+				if i isa FlatString then
+					write_native(i.to_cstring, i.bytelen)
+				else
+					write_native(i.to_cstring, i.length)
+				end
+			end
+		end
+	end
+end