X-Git-Url: http://nitlanguage.org diff --git a/lib/string_experimentations/utf8.nit b/lib/string_experimentations/utf8.nit index 95cd4d4..b3a2450 100644 --- a/lib/string_experimentations/utf8.nit +++ b/lib/string_experimentations/utf8.nit @@ -16,6 +16,7 @@ module utf8 intrude import standard::string +intrude import standard::file in "C Header" `{ @@ -33,7 +34,7 @@ typedef struct { # UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes # # A UTF-8 char has its bytes stored in a NativeString (char*) -extern class UnicodeChar `{ UTF8Char* `} +extern class UTF8Char `{ UTF8Char* `} new(pos: Int, ns: NativeString) `{ UTF8Char* u = malloc(sizeof(UTF8Char)); @@ -46,6 +47,7 @@ extern class UnicodeChar `{ UTF8Char* `} # # As per the specification : # + # ~~~raw # Length | UTF-8 octet sequence # | (binary) # ---------+------------------------------------------------- @@ -53,9 +55,10 @@ extern class UnicodeChar `{ UTF8Char* `} # 2 | 110xxxxx 10xxxxxx # 3 | 1110xxxx 10xxxxxx 10xxxxxx # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + # ~~~ private fun len: Int `{ - char* ns = recv->ns; - int pos = recv->pos; + char* ns = self->ns; + int pos = self->pos; char nspos = ns[pos]; if((nspos & 0x80) == 0x00){ return 1;} if((nspos & 0xE0) == 0xC0){ return 2;} @@ -67,42 +70,105 @@ extern class UnicodeChar `{ UTF8Char* `} # Position in containing NativeString private fun pos: Int `{ - return recv->pos; + return self->pos; `} - private fun pos=(p: Int) `{recv->pos = p;`} + private fun pos=(p: Int) `{self->pos = p;`} # C char* wrapping the char fun ns: NativeString `{ - return recv->ns; + return self->ns; `} # Returns the Unicode code point representing the character # # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence - fun code_point: Int import UnicodeChar.len `{ - switch(UnicodeChar_len(recv)){ + fun code_point: Int import UTF8Char.len `{ + switch(UTF8Char_len(self)){ case 1: - return (long)(0x7F & (unsigned 
char)recv->ns[recv->pos]); + return (long)(0x7F & (unsigned char)self->ns[self->pos]); case 2: - return 0 | ((0x1F & (unsigned char)recv->ns[recv->pos]) << 6) | (0x3F & (unsigned char)recv->ns[recv->pos+1]); + return 0 | ((0x1F & (unsigned char)self->ns[self->pos]) << 6) | (0x3F & (unsigned char)self->ns[self->pos+1]); case 3: - return 0 | ((0x0F & (unsigned char)recv->ns[recv->pos]) << 12) | - ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 6) | - (0x3F & (unsigned char)recv->ns[recv->pos+2]); + return 0 | ((0x0F & (unsigned char)self->ns[self->pos]) << 12) | + ((0x3F & (unsigned char)self->ns[self->pos+1]) << 6) | + (0x3F & (unsigned char)self->ns[self->pos+2]); case 4: - return 0 | ((0x07 & (unsigned char)recv->ns[recv->pos]) << 18) | - ((0x3F & (unsigned char)recv->ns[recv->pos+1]) << 12) | - ((0x3F & (unsigned char)recv->ns[recv->pos+2]) << 6) | - (0x3F & (unsigned char)recv->ns[recv->pos+3]); + return 0 | ((0x07 & (unsigned char)self->ns[self->pos]) << 18) | + ((0x3F & (unsigned char)self->ns[self->pos+1]) << 12) | + ((0x3F & (unsigned char)self->ns[self->pos+2]) << 6) | + (0x3F & (unsigned char)self->ns[self->pos+3]); + } + `} + + # Returns an upper-case version of self + # + # NOTE : Works only on ASCII chars + # TODO : Support unicode for to_upper + fun to_upper: UTF8Char import UTF8Char.code_point `{ + int cp = UTF8Char_code_point(self); + if(cp < 97 || cp > 122){ return self; } + char* ns = malloc(2); + ns[1] = '\0'; + char c = self->ns[self->pos]; + ns[0] = c - 32; + UTF8Char* ret = malloc(sizeof(UTF8Char)); + ret->ns = ns; + ret->pos = 0; + return ret; + `} + + # Returns a lower-case version of self + # + # NOTE : Works only on ASCII chars + # TODO : Support unicode for to_lower + fun to_lower: UTF8Char import UTF8Char.code_point `{ + int cp = UTF8Char_code_point(self); + if(cp < 65 || cp > 90){ return self; } + char* ns = malloc(2); + ns[1] = '\0'; + char c = self->ns[self->pos]; + ns[0] = c + 32; + UTF8Char* ret = malloc(sizeof(UTF8Char)); + 
ret->ns = ns; + ret->pos = 0; + return ret; + `} + + redef fun ==(o) + do + if o isa Char then + if len != 1 then return false + if code_point == o.ascii then return true + else if o isa UTF8Char then + if len != o.len then return false + if code_point == o.code_point then return true + end + return false + end + + redef fun output import UTF8Char.len `{ + switch(UTF8Char_len(self)){ + case 1: + printf("%c", self->ns[self->pos]); + break; + case 2: + printf("%c%c", self->ns[self->pos], self->ns[self->pos + 1]); + break; + case 3: + printf("%c%c%c", self->ns[self->pos], self->ns[self->pos + 1], self->ns[self->pos + 2]); + break; + case 4: + printf("%c%c%c%c", self->ns[self->pos], self->ns[self->pos + 1], self->ns[self->pos + 2], self->ns[self->pos + 3]); + break; } `} redef fun to_s import NativeString.to_s_with_length `{ - int len = utf8___UnicodeChar_len___impl(recv); + int len = utf8___UTF8Char_len___impl(self); char* r = malloc(len + 1); r[len] = '\0'; - char* src = (recv->ns + recv->pos); + char* src = (self->ns + self->pos); memcpy(r, src, len); return NativeString_to_s_with_length(r, len); `} @@ -116,14 +182,14 @@ private extern class StringIndex `{ UTF8Char* `} new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `} # Sets the character at `index` as `item` - fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `} + fun []=(index: Int, item: UTF8Char) `{ self[index] = *item; `} # Gets the character at position `id` - fun [](id: Int): UnicodeChar `{ return &recv[id]; `} + fun [](id: Int): UTF8Char `{ return &self[id]; `} # Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from` fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{ - UTF8Char* myfrom = recv + my_from*(sizeof(UTF8Char)); + UTF8Char* myfrom = self + my_from*(sizeof(UTF8Char)); UTF8Char* itsfrom = other + its_from*(sizeof(UTF8Char)); memcpy(itsfrom, myfrom, length); `} @@ -147,6 +213,61 @@ redef class 
FlatString self.bytelen = bytelen end + redef fun to_cstring + do + if real_items != null then return real_items.as(not null) + var new_items = new NativeString(bytelen + 1) + self.items.copy_to(new_items, bytelen, index[index_from].pos, 0) + new_items[bytelen] = '\0' + self.real_items = new_items + return new_items + end + + redef fun substring(from, count) + do + assert count >= 0 + + if from < 0 then + count += from + if count < 0 then count = 0 + from = 0 + end + + if count == 0 then return empty + + var real_from = index_from + from + var real_to = real_from + count - 1 + + if real_to > index_to then real_to = index_to + + var sub_bytelen = (index[real_to].pos - index[real_from].pos) + index[real_to].len # byte span: start of first char through the last char + + return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen) + end + + redef fun reversed + do + var native = new NativeString(self.bytelen + 1) + var length = self.length + var index = self.index + var pos = if length > 0 then index[index_from].pos else 0 + var i = 0 + var ipos = bytelen + var new_index = new StringIndex(length) + var pos_index = length - 1 # last valid slot of `new_index` + while i < length do + var uchar = index[index_from + i] + var uchar_len = uchar.len + ipos -= uchar_len + new_index[pos_index] = new UTF8Char(ipos, native) + pos_index -= 1 + items.copy_to(native, uchar_len, pos, ipos) + pos += uchar_len + i += 1 + end + return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen) + end + redef fun *(i) do assert i >= 0 @@ -159,7 +280,7 @@ redef class FlatString var my_real_len = length var my_real_fin_len = my_real_len * i - var target_string = calloc_string((finlen) + 1) + var target_string = new NativeString((finlen) + 1) var my_index = index var new_index = new StringIndex(my_real_fin_len) @@ -179,6 +300,72 @@ redef class FlatString end + redef fun to_upper + do + var outstr = new 
NativeString(self.bytelen + 1) + + var out_index = 0 + var index = self.index + var ipos = 0 + var max = length + + while ipos < max do + var u = index[index_from + ipos].to_upper + u.ns.copy_to(outstr, u.len, 
u.pos, out_index) + out_index += u.len + ipos += 1 + end + + outstr[self.bytelen] = '\0' + + return outstr.to_s_with_length(self.bytelen) + end + + redef fun to_lower + do + var outstr = new NativeString(self.bytelen + 1) + + var out_index = 0 + var index = self.index + var ipos = 0 + var max = length + + while ipos < max do + var u = index[index_from + ipos].to_lower + u.ns.copy_to(outstr, u.len, u.pos, out_index) + out_index += u.len + ipos += 1 + end + + outstr[self.bytelen] = '\0' + + return outstr.to_s_with_length(self.bytelen) + end + + redef fun output + do + var i = self.index_from + var imax = self.index_to + while i <= imax do + index[i].output + i += 1 + end + end + +end + +redef class FlatBuffer + + # Fix for this particular implementation + # + # Since the to_s of a FlatBuffer now builds using + # the old String constructor, this breaks everything. + # + # This will disappear when UTF8 is fully-supported + redef fun to_s do + written = false + return to_cstring.to_s_with_length(length) + end end redef class NativeString @@ -186,15 +373,15 @@ redef class NativeString # Creates the index for said NativeString # `length` is the size of the CString (in bytes, up to the first \0) # real_len is just a way to store the length (UTF-8 characters) - private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UnicodeChar.len `{ + private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UTF8Char.len `{ int pos = 0; int index_pos = 0; UTF8Char* index = malloc(length*sizeof(UTF8Char)); while(pos < length){ UTF8Char* curr = &index[index_pos]; curr->pos = pos; - curr->ns = recv; - pos += UnicodeChar_len(curr); + curr->ns = self; + pos += UTF8Char_len(curr); index_pos ++; } Container_of_Int_item__assign(real_len, index_pos); @@ -207,7 +394,7 @@ redef class NativeString return to_s_with_length(len) end - redef fun to_s_with_length(len: Int): FlatString + redef fun 
to_s_with_length(len) do var 
real_len = new Container[Int](0) var x = make_index(len, real_len) @@ -219,8 +406,22 @@ redef class NativeString var real_len = new Container[Int](0) var length = cstring_length var x = make_index(length, real_len) - var new_self = calloc_string(length + 1) + var new_self = new NativeString(length + 1) copy_to(new_self, length, 0, 0) return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length) end end + +redef class FileWriter + redef fun write(s) + do + assert is_writable + if s isa FlatText then + if s isa FlatString then + write_native(s.to_cstring, s.bytelen) + else + write_native(s.to_cstring, s.length) + end + else for i in s.substrings do write_native(i.to_cstring, i.length) + end +end