X-Git-Url: http://nitlanguage.org

diff --git a/lib/string_experimentations/utf8_noindex.nit b/lib/string_experimentations/utf8_noindex.nit
index cef60f7..8756838 100644
--- a/lib/string_experimentations/utf8_noindex.nit
+++ b/lib/string_experimentations/utf8_noindex.nit
@@ -53,6 +53,7 @@ extern class UnicodeChar `{ uint32_t* `}
 	#
 	# As per the specification :
 	#
+	# ~~~raw
 	# Length | UTF-8 octet sequence
 	# | (binary)
 	# ---------+-------------------------------------------------
@@ -60,8 +61,9 @@ extern class UnicodeChar `{ uint32_t* `}
 	# 2 | 110xxxxx 10xxxxxx
 	# 3 | 1110xxxx 10xxxxxx 10xxxxxx
 	# 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	# ~~~
 	fun len: Int `{
-		uint32_t s = *recv;
+		uint32_t s = *self;
 		if(s <= 127) {return 1;}
 		if(s >= 49280 && s <= 57279) {return 2;}
 		if(s >= 14712960 && s <= 15712191) {return 3;}
@@ -73,12 +75,12 @@ extern class UnicodeChar `{ uint32_t* `}
 	# Returns the Unicode code point representing the character
 	#
 	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
-	fun code_point: Int `{
-		uint32_t val = *recv;
+	fun code_point: Int import UnicodeChar.len `{
+		uint32_t val = *self;
 		uint32_t ret = 0;
-		switch(UnicodeChar_len(recv)){
+		switch(UnicodeChar_len(self)){
 			case 1:
-				ret = *recv;
+				ret = *self;
 				break;
 			case 2:
 				ret = 0 | ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
@@ -104,9 +106,9 @@ extern class UnicodeChar `{ uint32_t* `}
 	# NOTE : Works only on ASCII chars
 	# TODO : Support unicode for to_upper
 	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
-		if(*recv < 97 || *recv > 122){ return recv; }
+		if(*self < 97 || *self > 122){ return self; }
 		uint32_t* ret = calloc(1,4);
-		*ret = *recv - 32;
+		*ret = *self - 32;
 		return ret;
 	`}
 
@@ -115,9 +117,9 @@ extern class UnicodeChar `{ uint32_t* `}
 	# NOTE : Works only on ASCII chars
 	# TODO : Support unicode for to_upper
 	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
-		if(*recv < 65 || *recv > 90){ return recv; }
+		if(*self < 65 || *self > 90){ return self; }
 		uint32_t* ret = calloc(1,4);
-		*ret = *recv + 32;
+		*ret = *self + 32;
 		return ret;
 	`}
 
@@ -129,13 +131,13 @@ extern class UnicodeChar `{ uint32_t* `}
 	end
 
 	redef fun output import UnicodeChar.len `{
-		uint32_t self = *recv;
+		uint32_t self0 = *self;
 		if(!IS_BIG_ENDIAN){
-			uint32_t tmp = ntohl(self);
-			memcpy(&self, &tmp, 4);
+			uint32_t tmp = ntohl(self0);
+			memcpy(&self0, &tmp, 4);
 		}
-		unsigned char* s = (unsigned char*) &self;
-		switch(UnicodeChar_len(recv)){
+		unsigned char* s = (unsigned char*) &self0;
+		switch(UnicodeChar_len(self)){
 			case 1:
 				printf("%c", s[3]);
 				break;
@@ -152,10 +154,10 @@ extern class UnicodeChar `{ uint32_t* `}
 	`}
 
 	redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
-		int len = UnicodeChar_len(recv);
+		int len = UnicodeChar_len(self);
 		char* r = malloc(len + 1);
 		r[len] = '\0';
-		uint32_t src = *recv;
+		uint32_t src = *self;
 		if(!IS_BIG_ENDIAN){
 			uint32_t tmp = htonl(src);
 			memcpy(&src, &tmp, 4);
@@ -171,6 +173,14 @@ extern class UnicodeChar `{ uint32_t* `}
 	`}
 end
 
+# Used to keep track of the last accessed char in a String
+class CharCache
+	# The position (as in char) of a String
+	var position: Int
+	# The position in the NativeString underlying the String
+	var bytepos: Int
+end
+
 class FlatStringReviter
 	super IndexedIterator[UnicodeChar]
 
@@ -221,7 +231,7 @@ class FlatStringIter
 
 	private var it: UnicodeChar
 
-	private var is_created: Bool
+	private var is_created = false
 
 	init(s: FlatString) do from(s, 0)
 
@@ -261,7 +271,10 @@ redef class FlatString
 	redef type OTHER: FlatString
 
 	# Length in bytes of the string (e.g. the length of the C string)
-	redef var bytelen: Int
+	redef var bytelen
+
+	# Cache for the last accessed character in the string
+	var cache = new CharCache(-1,-1)
 
 	redef var length = length_l is lazy
 
@@ -276,9 +289,9 @@ redef class FlatString
 
 	# Length implementation
 	private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
-		char* ns = FlatString_items(recv);
-		int i = FlatString_index_from(recv);
-		int max = FlatString_index_to(recv);
+		char* ns = FlatString_items(self);
+		int i = FlatString_index_from(self);
+		int max = FlatString_index_to(self);
 		int length = 0;
 		while(i <= max){
 			char c = ns[i];
@@ -324,22 +337,57 @@ redef class FlatString
 	private fun byte_index(index: Int): Int do
 		assert index >= 0
 		assert index < length
-		var ns_i = index_from
-		var my_i = 0
-		while my_i != index do
-			if items[ns_i].ascii.bin_and(0x80) == 0 then
+
+		# Find best insertion point
+		var delta_begin = index
+		var delta_end = (length - 1) - index
+		var delta_cache = (cache.position - index).abs
+		var min = delta_begin
+
+		if delta_cache < min then min = delta_cache
+		if delta_end < min then min = delta_end
+
+		var ns_i: Int
+		var my_i: Int
+		var myits = items
+
+		if min == delta_begin then
+			ns_i = index_from
+			my_i = 0
+		else if min == delta_cache then
+			ns_i = cache.bytepos
+			my_i = cache.position
+		else
+			# One past the last byte: the backward scan below pre-decrements
+			ns_i = index_to + 1
+			my_i = length
+		end
+
+		while my_i < index do
+			if myits[ns_i].ascii.bin_and(0x80) == 0 then
 				ns_i += 1
-			else if items[ns_i].ascii.bin_and(0xE0) == 0xC0 then
+			else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then
 				ns_i += 2
-			else if items[ns_i].ascii.bin_and(0xF0) == 0xE0 then
+			else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then
 				ns_i += 3
-			else if items[ns_i].ascii.bin_and(0xF7) == 0xF0 then
+			else if myits[ns_i].ascii.bin_and(0xF8) == 0xF0 then
 				ns_i += 4
 			else
 				ns_i += 1
 			end
 			my_i += 1
 		end
+
+		# Backward scan. Invariant: `ns_i` is the first byte of char `my_i`
+		# (or one past the end of the string when `my_i == length`).
+		while my_i > index do
+			ns_i -= 1
+			# A byte that is not a continuation byte (10xxxxxx) starts a char
+			if myits[ns_i].ascii.bin_and(0xC0) != 0x80 then my_i -= 1
+		end
+		cache.position = index
+		cache.bytepos = ns_i
+
 		return ns_i
 	end
 
@@ -355,7 +403,7 @@ redef class FlatString
 	end
 
 	redef fun reversed do
-		var new_str = calloc_string(bytelen)
+		var new_str = new NativeString(bytelen)
 		var s_pos = bytelen
 		var my_pos = index_from
 		var its = items
@@ -369,7 +417,7 @@ redef class FlatString
 	end
 
 	redef fun to_upper do
-		var ns = calloc_string(bytelen)
+		var ns = new NativeString(bytelen)
 		var offset = 0
 
 		for i in [0 .. length[ do
@@ -381,7 +429,7 @@ redef class FlatString
 	end
 
 	redef fun to_lower do
-		var ns = calloc_string(bytelen)
+		var ns = new NativeString(bytelen)
 		var offset = 0
 
 		for i in [0 .. length[ do
@@ -395,15 +443,15 @@ redef class FlatString
 	redef fun +(o) do
 		if o isa Buffer then o = o.to_s
 		if o isa FlatString then
-			var new_str = calloc_string(bytelen + o.bytelen + 1)
+			var new_str = new NativeString(bytelen + o.bytelen + 1)
 			var new_bytelen = bytelen + o.bytelen
 			new_str[new_bytelen] = '\0'
 			var newlen = length + o.length
 			items.copy_to(new_str, bytelen, index_from, 0)
 			o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen)
 			return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen)
-		else if o isa RopeString then
-			return new RopeString.from(self) + o
+		else if o isa Concat then
+			return new Concat(self, o)
 		else
 			# If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant
 			abort
@@ -415,7 +463,7 @@ redef class FlatString
 		var new_bytelen = mybtlen * i
 		var mylen = length
 		var newlen = mylen * i
-		var ns = calloc_string(new_bytelen + 1)
+		var ns = new NativeString(new_bytelen + 1)
 		ns[new_bytelen] = '\0'
 		var offset = 0
 		while i > 0 do
@@ -427,7 +475,7 @@ redef class FlatString
 	end
 
 	# O(n)
-	redef fun substring(from: Int, count: Int) do
+	redef fun substring(from, count) do
 		assert count >= 0
 
 		if from < 0 then
@@ -453,7 +501,7 @@ redef class FlatString
 
 	redef fun to_cstring do
 		if real_items != null then return real_items.as(not null)
-		var new_items = calloc_string(bytelen + 1)
+		var new_items = new NativeString(bytelen + 1)
 		self.items.copy_to(new_items, bytelen, index_from, 0)
 		new_items[bytelen] = '\0'
 		self.real_items = new_items
@@ -470,14 +518,14 @@ end
 
 redef class FlatBuffer
 
-	redef var bytelen: Int
+	redef var bytelen
 
 	redef init from(s) do
-		if s isa RopeString then
+		if s isa Concat then
 			with_capacity(50)
 			for i in s.substrings do self.append(i)
 		end
-		items = calloc_string(s.bytelen)
+		items = new NativeString(s.bytelen)
 		if s isa FlatString then
 			s.items.copy_to(items, s.bytelen, s.index_from, 0)
 		else
@@ -510,20 +558,20 @@ redef class FlatBuffer
 
 	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
 	fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-		long bt = FlatBuffer_bytelen(recv);
-		char* ns = FlatBuffer_items(recv);
+		long bt = FlatBuffer_bytelen(self);
+		char* ns = FlatBuffer_items(self);
 		int off = from + len;
 		memmove(ns + off, ns + from, bt - from);
-		FlatBuffer_bytelen__assign(recv, bt + len);
+		FlatBuffer_bytelen__assign(self, bt + len);
 	`}
 
 	# Shifts the content of the buffer by `len` bytes to the left, starting at `from`
 	fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-		long bt = FlatBuffer_bytelen(recv);
-		char* ns = FlatBuffer_items(recv);
+		long bt = FlatBuffer_bytelen(self);
+		char* ns = FlatBuffer_items(self);
 		int off = from - len;
 		memmove(ns + off, ns + from, bt - from);
-		FlatBuffer_bytelen__assign(recv, bt - len);
+		FlatBuffer_bytelen__assign(self, bt - len);
 	`}
 
 	# Get the Unicode char stored at `index` in `self`
@@ -565,14 +613,14 @@ redef class FlatBuffer
 		var c = capacity
 		if cap <= c then return
 		while c <= cap do c = c * 2 + 2
-		var a = calloc_string(c+1)
+		var a = new NativeString(c+1)
 		if bytelen > 0 then items.copy_to(a, bytelen, 0, 0)
 		items = a
 		capacity = c
 	end
 
 	redef fun append(s) do
-		if s isa RopeString then
+		if s isa Concat then
 			for i in s.substrings do append i
 		end
 		var i = s.as(FlatString)
@@ -589,7 +637,7 @@ redef class FlatBuffer
 
 	redef fun reverse
 	do
-		var nns = calloc_string(bytelen)
+		var nns = new NativeString(bytelen)
 		var ns = items
 		var btlen = bytelen
 		var myp = 0
@@ -655,7 +703,7 @@ redef class FlatBuffer
 	end
 
 	redef fun to_cstring do
-		var ns = calloc_string(bytelen)
+		var ns = new NativeString(bytelen)
 		items.copy_to(ns, bytelen, 0, 0)
 		return ns
 	end
@@ -669,7 +717,7 @@ redef class NativeString
 		return to_s_with_length(len)
 	end
 
-	redef fun to_s_with_length(len: Int): FlatString
+	redef fun to_s_with_length(len)
 	do
 		return new FlatString.with_bytelen(self, 0, len - 1, len)
 	end
@@ -677,16 +725,16 @@ redef class NativeString
 	redef fun to_s_with_copy
 	do
 		var length = cstring_length
-		var new_self = calloc_string(length + 1)
+		var new_self = new NativeString(length + 1)
 		copy_to(new_self, length, 0, 0)
 		return new FlatString.with_bytelen(new_self, 0, length - 1, length)
 	end
 end
 
-redef class OFStream
+redef class FileWriter
 	redef fun write(s)
 	do
-		assert _writable
+		assert is_writable
 		if s isa FlatText then
 			write_native(s.to_cstring, s.bytelen)
 		else for i in s.substrings do write_native(i.to_cstring, i.length)