From 542a74f74924f1bd7c91689c95adf23f8270a798 Mon Sep 17 00:00:00 2001 From: Lucas Bajolet Date: Tue, 2 Jun 2015 14:49:19 -0400 Subject: [PATCH] lib/: removed UTF-8 related modules in anticipation of the integration in stdlib Signed-off-by: Lucas Bajolet --- lib/string_experimentations/README.md | 24 - .../string_experimentations.nit | 18 - lib/string_experimentations/utf8.nit | 427 ----------- lib/string_experimentations/utf8_noindex.nit | 742 -------------------- tests/sav/utf_test.res | 11 - tests/utf_test.nit | 42 -- 6 files changed, 1264 deletions(-) delete mode 100644 lib/string_experimentations/README.md delete mode 100644 lib/string_experimentations/string_experimentations.nit delete mode 100644 lib/string_experimentations/utf8.nit delete mode 100644 lib/string_experimentations/utf8_noindex.nit delete mode 100644 tests/sav/utf_test.res delete mode 100644 tests/utf_test.nit diff --git a/lib/string_experimentations/README.md b/lib/string_experimentations/README.md deleted file mode 100644 index 7cbce13..0000000 --- a/lib/string_experimentations/README.md +++ /dev/null @@ -1,24 +0,0 @@ -This project is a collection of modules used to experiment on different variations of Text and its subclasses. -This is only temporary as these modules will eventually be merged into standard library or discarded for those bringing no real improvements to the language. - -The modules contained here are : - - * utf8: A draft of implementation of UTF-8 as internal encoding for Strings with automatic indexing. - * utf8_no_index: Another draft of implementation of UTF-8, this time without indexing. - -TODO : - - * utf8: - * Support for the whole API of Text - * Any kind of normalization form for equality (NFC probably) - * Compatibility versions of equality test - * Locale support - * Comparisons - * to_upper/lower fully-compatible with Unicode - - * utf8_no_index: - * Add cache for the last indexed character - DONE - * Two-way iteration - DONE - * Intelligent indexed access (calculating the nearest point of insertion, i.e. begin, end, or cache) - DONE - * UnicodeChar as universal type - * UnicodeChar => Char and Char => Byte diff --git a/lib/string_experimentations/string_experimentations.nit b/lib/string_experimentations/string_experimentations.nit deleted file mode 100644 index 1eeb10d..0000000 --- a/lib/string_experimentations/string_experimentations.nit +++ /dev/null @@ -1,18 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# General module for all kinds of string experimentations -module string_experimentations - -import utf8 diff --git a/lib/string_experimentations/utf8.nit b/lib/string_experimentations/utf8.nit deleted file mode 100644 index b3a2450..0000000 --- a/lib/string_experimentations/utf8.nit +++ /dev/null @@ -1,427 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Introduces UTF-8 as internal encoding for Strings in Nit. -module utf8 - -intrude import standard::string -intrude import standard::file - -in "C Header" `{ - -#include -#include -#include - -typedef struct { - long pos; - char* ns; -} UTF8Char; - -`} - -# UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes -# -# A UTF-8 char has its bytes stored in a NativeString (char*) -extern class UTF8Char `{ UTF8Char* `} - - new(pos: Int, ns: NativeString) `{ - UTF8Char* u = malloc(sizeof(UTF8Char)); - u->pos = pos; - u->ns = ns; - return u; - `} - - # Real length of the char in UTF8 - # - # As per the specification : - # - # ~~~raw - # Length | UTF-8 octet sequence - # | (binary) - # ---------+------------------------------------------------- - # 1 | 0xxxxxxx - # 2 | 110xxxxx 10xxxxxx - # 3 | 1110xxxx 10xxxxxx 10xxxxxx - # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - # ~~~ - private fun len: Int `{ - char* ns = self->ns; - int pos = self->pos; - char nspos = ns[pos]; - if((nspos & 0x80) == 0x00){ return 1;} - if((nspos & 0xE0) == 0xC0){ return 2;} - if((nspos & 0xF0) == 0xE0){ return 3;} - if((nspos & 0xF7) == 0xF0){ return 4;} - // Invalid character - return 1; - `} - - # Position in containing NativeString - private fun pos: Int `{ - return self->pos; - `} - - private fun pos=(p: Int) `{self->pos = p;`} - - # C char* wrapping the char - fun ns: NativeString `{ - return self->ns; - `} - - # Returns the Unicode code point representing the character - # - # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence - fun code_point: Int import UTF8Char.len `{ - switch(UTF8Char_len(self)){ - case 1: - return (long)(0x7F & (unsigned char)self->ns[self->pos]); - case 2: - return 0 | ((0x1F & (unsigned char)self->ns[self->pos]) << 6) | (0x3F & (unsigned char)self->ns[self->pos+1]); - case 3: - return 0 | ((0x0F & (unsigned char)self->ns[self->pos]) << 12) | - ((0x3F & (unsigned char)self->ns[self->pos+1]) << 6) | - (0x3F & (unsigned char)self->ns[self->pos+2]); - case 4: - return 0 | ((0x07 & (unsigned char)self->ns[self->pos]) << 18) | - ((0x3F & (unsigned char)self->ns[self->pos+1]) << 12) | - ((0x3F & (unsigned char)self->ns[self->pos+2]) << 6) | - (0x3F & (unsigned char)self->ns[self->pos+3]); - } - `} - - # Returns an upper-case version of self - # - # NOTE : Works only on ASCII chars - # TODO : Support unicode for to_upper - fun to_upper: UTF8Char import UTF8Char.code_point `{ - int cp = UTF8Char_code_point(self); - if(cp < 97 || cp > 122){ return self; } - char* ns = malloc(2); - ns[1] = '\0'; - char c = self->ns[self->pos]; - ns[0] = c - 32; - UTF8Char* ret = malloc(sizeof(UTF8Char)); - ret->ns = ns; - ret->pos = 0; - return ret; - `} - - # Returns an lower-case version of self - # - # NOTE : Works only on ASCII chars - # TODO : Support unicode for to_upper - fun to_lower: UTF8Char import UTF8Char.code_point `{ - int cp = UTF8Char_code_point(self); - if(cp < 65 || cp > 90){ return self; } - char* ns = malloc(2); - ns[1] = '\0'; - char c = self->ns[self->pos]; - ns[0] = c + 32; - UTF8Char* ret = malloc(sizeof(UTF8Char)); - ret->ns = ns; - ret->pos = 0; - return ret; - `} - - redef fun ==(o) - do - if o isa Char then - if len != 1 then return false - if code_point == o.ascii then return true - else if o isa UTF8Char then - if len != o.len then return false - if code_point == o.code_point then return true - end - return false - end - - redef fun output import UTF8Char.code_point `{ - switch(UTF8Char_len(self)){ - case 1: - printf("%c", self->ns[self->pos]); - break; - case 2: - printf("%c%c", self->ns[self->pos], self->ns[self->pos + 1]); - break; - case 3: - printf("%c%c%c", self->ns[self->pos], self->ns[self->pos + 1], self->ns[self->pos + 2]); - break; - case 4: - printf("%c%c%c%c", self->ns[self->pos], self->ns[self->pos + 1], self->ns[self->pos + 2], self->ns[self->pos + 3]); - break; - } - `} - - redef fun to_s import NativeString.to_s_with_length `{ - int len = utf8___UTF8Char_len___impl(self); - char* r = malloc(len + 1); - r[len] = '\0'; - char* src = (self->ns + self->pos); - memcpy(r, src, len); - return NativeString_to_s_with_length(r, len); - `} -end - -# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object -# -# It becomes mandatory for UTF-8 strings since characters do not have a fixed size. -private extern class StringIndex `{ UTF8Char* `} - - new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `} - - # Sets the character at `index` as `item` - fun []=(index: Int, item: UTF8Char) `{ self[index] = *item; `} - - # Gets the character at position `id` - fun [](id: Int): UTF8Char `{ return &self[id]; `} - - # Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from` - fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{ - UTF8Char* myfrom = self + my_from*(sizeof(UTF8Char)); - UTF8Char* itsfrom = other + its_from*(sizeof(UTF8Char)); - memcpy(itsfrom, myfrom, length); - `} -end - -redef class FlatString - - # Index of the characters of the FlatString - private var index: StringIndex - - # Length in bytes of the string (e.g. the length of the C string) - var bytelen: Int - - private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int) - do - self.items = items - length = len - self.index_from = index_from - self.index_to = index_to - self.index = index - self.bytelen = bytelen - end - - redef fun to_cstring - do - if real_items != null then return real_items.as(not null) - var new_items = new NativeString(bytelen + 1) - self.items.copy_to(new_items, bytelen, index[index_from].pos, 0) - new_items[bytelen] = '\0' - self.real_items = new_items - return new_items - end - - redef fun substring(from, count) - do - assert count >= 0 - - if from < 0 then - count += from - if count < 0 then count = 0 - from = 0 - end - - if count == 0 then return empty - - var real_from = index_from + from - var real_to = real_from + count - 1 - - if real_to > index_to then real_to = index_to - - var sub_bytelen = (index[real_to].pos - index[from].pos) + index[from].len - - return new FlatString.with_infos_index(items, count, real_from, real_to, index, sub_bytelen) - end - - redef fun reversed - do - var native = new NativeString(self.bytelen + 1) - var length = self.length - var index = self.index - var pos = 0 - var i = 0 - var ipos = bytelen - var new_index = new StringIndex(length) - var pos_index = length - while i < length do - var uchar = index[i] - var uchar_len = uchar.len - ipos -= uchar_len - new_index[pos_index] = new UTF8Char(ipos, native) - pos_index -= 1 - items.copy_to(native, uchar_len, pos, ipos) - pos += uchar_len - i += 1 - end - return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen) - end - - redef fun *(i) - do - assert i >= 0 - - var mylen = self.bytelen - var finlen = mylen * i - - var my_items = self.items - - var my_real_len = length - var my_real_fin_len = my_real_len * i - - var target_string = new NativeString((finlen) + 1) - - var my_index = index - var new_index = new StringIndex(my_real_fin_len) - - target_string[finlen] = '\0' - - var current_last = 0 - var curr_index = 0 - - for iteration in [1 .. i] do - my_items.copy_to(target_string, mylen, index_from, current_last) - my_index.copy_to(new_index, length, 0, curr_index) - current_last += mylen - end - - return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen) - - end - - redef fun to_upper - do - var outstr = new NativeString(self.bytelen + 1) - - var out_index = 0 - var index = self.index - var ipos = 0 - var max = length - - while ipos < max do - var u = index[ipos].to_upper - u.ns.copy_to(outstr, u.len, u.pos, out_index) - out_index += u.len - ipos += 1 - end - - outstr[self.bytelen] = '\0' - - return outstr.to_s_with_length(self.bytelen) - end - - redef fun to_lower - do - var outstr = new NativeString(self.bytelen + 1) - - var out_index = 0 - var index = self.index - var ipos = 0 - var max = length - - while ipos < max do - var u = index[ipos].to_lower - u.ns.copy_to(outstr, u.len, u.pos, out_index) - out_index += u.len - ipos += 1 - end - - outstr[self.bytelen] = '\0' - - return outstr.to_s_with_length(self.bytelen) - end - - redef fun output - do - var i = self.index_from - var imax = self.index_to - while i <= imax do - index[i].output - i += 1 - end - end - -end - -redef class FlatBuffer - - # Fix for this particular implementation - # - # Since the to_s of a FlatBuffer now builds using - # the old String contructor, this breaks everything. - # - # This will disappear when UTF8 is fully-supported - redef fun to_s do - written = false - return to_cstring.to_s_with_length(length) - end -end - -redef class NativeString - - # Creates the index for said NativeString - # `length` is the size of the CString (in bytes, up to the first \0) - # real_len is just a way to store the length (UTF-8 characters) - private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UTF8Char.len `{ - int pos = 0; - int index_pos = 0; - UTF8Char* index = malloc(length*sizeof(UTF8Char)); - while(pos < length){ - UTF8Char* curr = &index[index_pos]; - curr->pos = pos; - curr->ns = self; - pos += UTF8Char_len(curr); - index_pos ++; - } - Container_of_Int_item__assign(real_len, index_pos); - return index; - `} - - redef fun to_s: FlatString - do - var len = cstring_length - return to_s_with_length(len) - end - - redef fun to_s_with_length(len) - do - var real_len = new Container[Int](0) - var x = make_index(len, real_len) - return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len) - end - - redef fun to_s_with_copy - do - var real_len = new Container[Int](0) - var length = cstring_length - var x = make_index(length, real_len) - var new_self = new NativeString(length + 1) - copy_to(new_self, length, 0, 0) - return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length) - end -end - -redef class FileWriter - redef fun write(s) - do - assert is_writable - if s isa FlatText then - if s isa FlatString then - write_native(s.to_cstring, s.bytelen) - else - write_native(s.to_cstring, s.length) - end - else for i in s.substrings do write_native(i.to_cstring, i.length) - end -end diff --git a/lib/string_experimentations/utf8_noindex.nit b/lib/string_experimentations/utf8_noindex.nit deleted file mode 100644 index 8756838..0000000 --- a/lib/string_experimentations/utf8_noindex.nit +++ /dev/null @@ -1,742 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Introduces UTF-8 as internal encoding for Strings in Nit. -module utf8_noindex - -intrude import standard::string -intrude import standard::file - -in "C Header" `{ - -#include -#include -#include - -#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100) - -`} - -# UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes -extern class UnicodeChar `{ uint32_t* `} - super Comparable - - redef type OTHER: UnicodeChar - - # Transforms a byte-variable char* character to its uint32_t equivalent - new from_ns(ns: NativeString, index: Int) `{ - unsigned char* ret = calloc(1,4); - if((ns[index] & 0x80) == 0){ memcpy(ret + 3, ns + index, 1); } - else if((ns[index] & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); } - else if((ns[index] & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); } - else if((ns[index] & 0xF7) == 0xF0) { memcpy(ret, ns + index, 4); } - else{ memcpy(ret + 3, ns + index, 1);} - if (!IS_BIG_ENDIAN) { - uint32_t tmp = ntohl(*((uint32_t*)ret)); - memcpy(ret, &tmp, 4); - } - return (uint32_t*)ret; - `} - - # Real length of the char in UTF8 - # - # As per the specification : - # - # ~~~raw - # Length | UTF-8 octet sequence - # | (binary) - # ---------+------------------------------------------------- - # 1 | 0xxxxxxx - # 2 | 110xxxxx 10xxxxxx - # 3 | 1110xxxx 10xxxxxx 10xxxxxx - # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - # ~~~ - fun len: Int `{ - uint32_t s = *self; - if(s <= 127) {return 1;} - if(s >= 49280 && s <= 57279) {return 2;} - if(s >= 14712960 && s <= 15712191) {return 3;} - if(s >= 4034953344 && s <= 4156538815) { return 4; } - // Bad character - return 1; - `} - - # Returns the Unicode code point representing the character - # - # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence - fun code_point: Int import UnicodeChar.len `{ - uint32_t val = *self; - uint32_t ret = 0; - switch(UnicodeChar_len(self)){ - case 1: - ret = *self; - break; - case 2: - ret = 0 | ((val & 0x00001F00) >> 2) | (val & 0x0000003F); - break; - case 3: - ret = 0 | ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F); - break; - case 4: - ret = 0 | ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F); - break; - } - unsigned char* rt = (unsigned char*) &ret; - return ret; - `} - - # Warning : This does not follow the Unicode specification for now - # - # TODO: Support Unicode-compliant comparison - redef fun <(o) do return self.code_point < o.code_point - - # Returns an upper-case version of self - # - # NOTE : Works only on ASCII chars - # TODO : Support unicode for to_upper - fun to_upper: UnicodeChar import UnicodeChar.code_point `{ - if(*self < 97 || *self > 122){ return self; } - uint32_t* ret = calloc(1,4); - *ret = *self - 32; - return ret; - `} - - # Returns an lower-case version of self - # - # NOTE : Works only on ASCII chars - # TODO : Support unicode for to_upper - fun to_lower: UnicodeChar import UnicodeChar.code_point `{ - if(*self < 65 || *self > 90){ return self; } - uint32_t* ret = calloc(1,4); - *ret = *self + 32; - return ret; - `} - - redef fun ==(o) - do - if not o isa UnicodeChar then return false - if o.code_point == self.code_point then return true - return false - end - - redef fun output import UnicodeChar.len `{ - uint32_t self0 = *self; - if(!IS_BIG_ENDIAN){ - uint32_t tmp = ntohl(self0); - memcpy(&self0, &tmp, 4); - } - unsigned char* s = (unsigned char*) &self0; - switch(UnicodeChar_len(self0)){ - case 1: - printf("%c", s[3]); - break; - case 2: - printf("%c%c", s[2], s[3]); - break; - case 3: - printf("%c%c%c", s[1], s[2], s[3]); - break; - case 4: - printf("%c%c%c%c", s[0], s[1], s[2], s[3]); - break; - } - `} - - redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{ - int len = UnicodeChar_len(self); - char* r = malloc(len + 1); - r[len] = '\0'; - uint32_t src = *self; - if(!IS_BIG_ENDIAN){ - uint32_t tmp = htonl(src); - memcpy(&src, &tmp, 4); - } - unsigned char* s = (unsigned char*) &src; - switch(len){ - case 1: memcpy(r, s+3, 1); break; - case 2: memcpy(r, s+2, 2); break; - case 3: memcpy(r, s+1, 3); break; - case 4: memcpy(r, s, 4); break; - } - return new_FlatString_full(r, 0, len - 1, len, 1); - `} -end - -# Used to keep track of the last accessed char in a String -class CharCache - # The position (as in char) of a String - var position: Int - # The position in the NativeString underlying the String - var bytepos: Int -end - -class FlatStringReviter - super IndexedIterator[UnicodeChar] - - # The NativeString to iterate upon - private var ns: NativeString - - # The position in the string - private var pos: Int - - # The position in the native string - private var bytepos: Int - - init(s: FlatString) do from(s, s.length - 1) - - init from(s: FlatString, position: Int) - do - ns = s.items - pos = position - bytepos = s.byte_index(position) - end - - redef fun next - do - bytepos -= 1 - while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do - bytepos -= 1 - end - pos -= 1 - end - - redef fun index do return pos - - redef fun item do return new UnicodeChar.from_ns(ns, bytepos) - - redef fun is_ok do return pos >= 0 -end - -class FlatStringIter - super IndexedIterator[UnicodeChar] - - private var ns: NativeString - - private var pos: Int - - private var bytepos: Int - - private var slen: Int - - private var it: UnicodeChar - - private var is_created = false - - init(s: FlatString) do from(s, 0) - - init from(s: FlatString, position: Int) do - ns = s.items - pos = position - bytepos = s.byte_index(position) - slen = s.length - end - - redef fun index do return pos - - redef fun is_ok do return pos < slen - - redef fun item do - if not is_created then - it = new UnicodeChar.from_ns(ns, bytepos) - is_created = true - end - return it - end - - redef fun next - do - if not is_created then - it = new UnicodeChar.from_ns(ns, bytepos) - end - is_created = false - var pace = it.len - pos += 1 - bytepos += pace - end -end - -redef class FlatString - - redef type OTHER: FlatString - - # Length in bytes of the string (e.g. the length of the C string) - redef var bytelen - - # Cache for the last accessed character in the char - var cache = new CharCache(-1,-1) - - redef var length = length_l is lazy - - private init full(items: NativeString, from, to, bytelen, len: Int) - do - self.items = items - index_from = from - index_to = to - self.bytelen = bytelen - length = len - end - - # Length implementation - private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{ - char* ns = FlatString_items(self); - int i = FlatString_index_from(self); - int max = FlatString_index_to(self); - int length = 0; - while(i <= max){ - char c = ns[i]; - if((c & 0x80) == 0) { i+= 1; } - else if((c & 0xE0) == 0xC0) { i += 2; } - else if((c & 0xF0) == 0xE0) { i += 3; } - else if((c & 0xF7) == 0xF0) { i += 4; } - else { i += 1; } - length ++; - } - return length; - `} - - redef fun <(o) - do - var o_pos = 0 - var olen = o.length - for i in [0 .. length[ do - if o_pos >= olen then return false - if char_at(i) > o.char_at(i) then return false - if char_at(i) < o.char_at(i) then return true - end - return false - end - - redef fun ==(o) do - if o == null then return false - if not o isa FlatString then return super - var mylen = length - var itslen = o.length - if mylen != itslen then return false - var mypos = 0 - var itspos = 0 - - while mypos < mylen do - if char_at(mypos) != o.char_at(itspos) then return false - mypos += 1 - itspos += 1 - end - return true - end - - private fun byte_index(index: Int): Int do - assert index >= 0 - assert index < length - - # Find best insertion point - var delta_begin = index - var delta_end = (length - 1) - index - var delta_cache = (cache.position - index).abs - var min = delta_begin - - if delta_cache < min then min = delta_cache - if delta_end < min then min = delta_end - - var ns_i: Int - var my_i: Int - var myits = items - - if min == delta_begin then - ns_i = index_from - my_i = 0 - else if min == delta_cache then - ns_i = cache.bytepos - my_i = cache.position - else - ns_i = index_to - my_i = length - end - - while my_i < index do - if myits[ns_i].ascii.bin_and(0x80) == 0 then - ns_i += 1 - else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then - ns_i += 2 - else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then - ns_i += 3 - else if myits[ns_i].ascii.bin_and(0xF7) == 0xF0 then - ns_i += 4 - else - ns_i += 1 - end - my_i += 1 - end - - while my_i > index do - if myits[ns_i].ascii.bin_and(0xC0) != 0x80 then - my_i -= 1 - if my_i == index then break - end - ns_i -= 1 - end - - cache.position = index - cache.bytepos = ns_i - - return ns_i - end - - fun char_at(pos: Int): UnicodeChar do - return new UnicodeChar.from_ns(items, byte_index(pos)) - end - - private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do - self.items = items - self.index_from = index_from - self.index_to = index_to - self.bytelen = bytelen - end - - redef fun reversed do - var new_str = new NativeString(bytelen) - var s_pos = bytelen - var my_pos = index_from - var its = items - for i in [0..length[ do - var c = char_at(i).len - s_pos -= c - its.copy_to(new_str, c, my_pos, s_pos) - my_pos += c - end - return new FlatString.full(new_str, 0, bytelen - 1, bytelen, length) - end - - redef fun to_upper do - var ns = new NativeString(bytelen) - var offset = 0 - for i in [0 .. length[ - do - var c = char_at(i) - c.to_upper.to_s.items.copy_to(ns, c.len, 0, offset) - offset += c.len - end - return new FlatString.full(ns, 0, bytelen - 1, bytelen, length) - end - - redef fun to_lower do - var ns = new NativeString(bytelen) - var offset = 0 - for i in [0 .. length[ - do - var c = char_at(i) - c.to_lower.to_s.items.copy_to(ns, c.len, 0, offset) - offset += c.len - end - return new FlatString.full(ns, 0, bytelen - 1, bytelen, length) - end - - redef fun +(o) do - if o isa Buffer then o = o.to_s - if o isa FlatString then - var new_str = new NativeString(bytelen + o.bytelen + 1) - var new_bytelen = bytelen + o.bytelen - new_str[new_bytelen] = '\0' - var newlen = length + o.length - items.copy_to(new_str, bytelen, index_from, 0) - o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen) - return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen) - else if o isa Concat then - return new Concat(self, o) - else - # If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant - abort - end - end - - redef fun *(i) do - var mybtlen = bytelen - var new_bytelen = mybtlen * i - var mylen = length - var newlen = mylen * i - var ns = new NativeString(new_bytelen + 1) - ns[new_bytelen] = '\0' - var offset = 0 - while i > 0 do - items.copy_to(ns, bytelen, index_from, offset) - offset += mybtlen - i -= 1 - end - return new FlatString.full(ns, 0, new_bytelen - 1, new_bytelen, newlen) - end - - # O(n) - redef fun substring(from, count) do - assert count >= 0 - - if from < 0 then - count += from - if count < 0 then count = 0 - from = 0 - end - - if count == 0 then return empty - - var real_from = byte_index(from) - - var lst = from + count - 1 - - if lst > length - from then - return new FlatString.with_bytelen(items, real_from, index_to, index_to - real_from) - end - - var real_to = byte_index(lst) - - return new FlatString.full(items, real_from, real_to, (real_to + char_at(lst).len) - real_from, count) - end - - redef fun to_cstring do - if real_items != null then return real_items.as(not null) - var new_items = new NativeString(bytelen + 1) - self.items.copy_to(new_items, bytelen, index_from, 0) - new_items[bytelen] = '\0' - self.real_items = new_items - return new_items - end -end - -redef class Text - - # Length of the string, in bytes - fun bytelen: Int is abstract - -end - -redef class FlatBuffer - - redef var bytelen - - redef init from(s) do - if s isa Concat then - with_capacity(50) - for i in s.substrings do self.append(i) - end - items = new NativeString(s.bytelen) - if s isa FlatString then - s.items.copy_to(items, s.bytelen, s.index_from, 0) - else - s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0) - end - length = s.length - bytelen = s.bytelen - capacity = s.bytelen - end - - # Replaces the char at `index` by `item` - fun char_at=(index: Int, item: UnicodeChar) do - is_dirty = true - if index == length then - add_unicode item - return - end - assert index >= 0 and index < length - var ip = byte_at(index) - var c = char_at_byte(ip) - var size_diff = item.len - c.len - if size_diff > 0 then - rshift_bytes(ip + c.len, size_diff) - else if size_diff < 0 then - lshift_bytes(ip + c.len, -size_diff) - end - var s = item.to_s - s.items.copy_to(items, s.bytelen, 0, ip) - end - - # Shifts the content of the buffer by `len` bytes to the right, starting at byte `from` - fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{ - long bt = FlatBuffer_bytelen(self); - char* ns = FlatBuffer_items(self); - int off = from + len; - memmove(ns + off, ns + from, bt - from); - FlatBuffer_bytelen__assign(self, bt + len); - `} - - # Shifts the content of the buffer by `len` bytes to the left, starting at `from` - fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{ - long bt = FlatBuffer_bytelen(self); - char* ns = FlatBuffer_items(self); - int off = from - len; - memmove(ns + off, ns + from, bt - from); - FlatBuffer_bytelen__assign(self, bt - len); - `} - - # Get the Unicode char stored at `index` in `self` - fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index)) - - # Get the Unicode char stored at `index` (bytewise) in `self` - fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index) - - # Add equivalent that supports Unicode - fun add_unicode(c: UnicodeChar) do - var s = c.to_s - if s.bytelen + bytelen > capacity then enlarge(s.bytelen) - s.items.copy_to(items, s.bytelen, 0, bytelen) - end - - # Gets the byte index (in NativeString) of the char stored at `i` - fun byte_at(i: Int): Int do - assert i < length and i >= 0 - var ns_i = 0 - var real_i = 0 - while real_i < i do - if items[ns_i].ascii.bin_and(0x80) == 0 then - ns_i += 1 - else if items[ns_i].ascii.bin_and(0xE0) == 0xC0 then - ns_i += 2 - else if items[ns_i].ascii.bin_and(0xF0) == 0xE0 then - ns_i += 3 - else if items[ns_i].ascii.bin_and(0xF7) == 0xF0 then - ns_i += 4 - else - ns_i += 1 - end - real_i += 1 - end - return ns_i - end - - redef fun enlarge(cap) do - var c = capacity - if cap <= c then return - while c <= cap do c = c * 2 + 2 - var a = new NativeString(c+1) - if bytelen > 0 then items.copy_to(a, bytelen, 0, 0) - items = a - capacity = c - end - - redef fun append(s) do - if s isa Concat then - for i in s.substrings do append i - end - var i = s.as(FlatString) - var blen = bytelen - var iblen = i.bytelen - var newlen = blen + iblen - if newlen > capacity then - enlarge(newlen) - end - i.items.copy_to(items, iblen, i.index_from, blen) - bytelen += iblen - length += i.length - end - - redef fun reverse - do - var nns = new NativeString(bytelen) - var ns = items - var btlen = bytelen - var myp = 0 - var itsp = btlen - while myp < btlen do - var c = char_at_byte(myp).len - itsp -= c - ns.copy_to(nns, c, myp, itsp) - myp += c - end - items = nns - end - - redef fun clear do - length = 0 - bytelen = 0 - end - - redef fun copy(s, l, d, ns) do - if not d isa FlatBuffer then - # This implementation here is only concerned by the FlatBuffer - # If you implement a new Buffer subclass, make sure to support this operation via refinement. - abort - end - var rs = byte_at(s) - var re = byte_at(s + l - 1) - var rl = re - rs - var rns = d.byte_at(ns) - items.copy_to(d.items, rl, rns, rs) - end - - redef fun times(i) do - var len = bytelen - var off = len - var newlen = len * i - if newlen > capacity then enlarge(newlen) - for j in [1 .. i[ do - items.copy_to(items, len, 0, off) - off += len - end - bytelen = newlen - length = length * i - end - - redef fun upper do - for i in [0 .. length[ do - var pos = byte_at(i) - var c = char_at_byte(pos) - var d = c.to_upper - if c == d then continue - d.to_s.items.copy_to(items, 1, 0, pos) - end - end - - redef fun lower do - for i in [0 .. length[ do - var pos = byte_at(i) - var c = char_at_byte(pos) - var d = c.to_lower - if c == d then continue - d.to_s.items.copy_to(items, 1, 0, pos) - end - end - - redef fun to_cstring do - var ns = new NativeString(bytelen) - items.copy_to(ns, bytelen, 0, 0) - return ns - end -end - -redef class NativeString - - redef fun to_s: FlatString - do - var len = cstring_length - return to_s_with_length(len) - end - - redef fun to_s_with_length(len) - do - return new FlatString.with_bytelen(self, 0, len - 1, len) - end - - redef fun to_s_with_copy - do - var length = cstring_length - var new_self = new NativeString(length + 1) - copy_to(new_self, length, 0, 0) - return new FlatString.with_bytelen(new_self, 0, length - 1, length) - end -end - -redef class FileWriter - redef fun write(s) - do - assert is_writable - if s isa FlatText then - write_native(s.to_cstring, s.bytelen) - else for i in s.substrings do write_native(i.to_cstring, i.length) - end -end diff --git a/tests/sav/utf_test.res b/tests/sav/utf_test.res deleted file mode 100644 index 4055d93..0000000 --- a/tests/sav/utf_test.res +++ /dev/null @@ -1,11 +0,0 @@ -28 -すでa語A本日a 𐍆,A ᓂ . ᓀ 界世a𐍃ーЖロaハ -ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです -ー𐍃a世 -30fc -10343 -61 -4e16 -ハAロЖー𐍃A世界 ᓀ . ᓂ A,𐍆 A日本A語Aです -ハaロЖー𐍃a世界 ᓀ . ᓂ a,𐍆 a日本a語aです -ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aですハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです diff --git a/tests/utf_test.nit b/tests/utf_test.nit deleted file mode 100644 index 88474c4..0000000 --- a/tests/utf_test.nit +++ /dev/null @@ -1,42 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# This file is free software, which comes along with NIT. This software is -# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; -# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. You can modify it is you want, provided this header -# is kept unaltered, and a notification of the changes is added. -# You are allowed to redistribute it and sell it, alone or is a part of -# another product. - -import standard -intrude import string_experimentations::utf8 - -var s = "aàハ𐍆".as(FlatString) -assert s.index[0].code_point == 97 -assert s.index[1].code_point == 224 -assert s.index[2].code_point == 12495 -assert s.index[3].code_point == 66374 - -var str = "ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです".as(FlatString) - -print str.length - -print str.reversed - -str.output - -print "" - -var x = str.substring(4,4).as(FlatString) - -print x - -for i in [0..x.length[ do - print x.index[i + x.index_from].code_point.to_hex -end - -print str.to_upper - -print str.to_lower - -print str * 2 -- 1.7.9.5