# This file is part of NIT ( http://www.nitlanguage.org ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Introduces UTF-8 as internal encoding for Strings in Nit. module utf8_noindex intrude import standard::string intrude import standard::file in "C Header" `{ #include #include #include #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100) `} # UTF-8 char as defined in RFC-3629, e.g. 1-4 Bytes extern class UnicodeChar `{ uint32_t* `} super Comparable redef type OTHER: UnicodeChar # Transforms a byte-variable char* character to its uint32_t equivalent new from_ns(ns: NativeString, index: Int) `{ unsigned char* ret = calloc(1,4); if((ns[index] & 0x80) == 0){ memcpy(ret + 3, ns + index, 1); } else if((ns[index] & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); } else if((ns[index] & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); } else if((ns[index] & 0xF7) == 0xF0) { memcpy(ret, ns + index, 4); } else{ memcpy(ret + 3, ns + index, 1);} if (!IS_BIG_ENDIAN) { uint32_t tmp = ntohl(*((uint32_t*)ret)); memcpy(ret, &tmp, 4); } return (uint32_t*)ret; `} # Real length of the char in UTF8 # # As per the specification : # # ~~~raw # Length | UTF-8 octet sequence # | (binary) # ---------+------------------------------------------------- # 1 | 0xxxxxxx # 2 | 110xxxxx 10xxxxxx # 3 | 1110xxxx 10xxxxxx 10xxxxxx # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx # ~~~ fun len: Int `{ uint32_t s = *recv; if(s <= 127) {return 1;} if(s >= 49280 && s <= 57279) {return 2;} if(s >= 14712960 && s <= 15712191) {return 3;} if(s >= 4034953344 && s <= 4156538815) { return 4; } // Bad character return 1; `} # Returns the Unicode code point representing the character # # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence fun code_point: Int import UnicodeChar.len `{ uint32_t val = *recv; uint32_t ret = 0; switch(UnicodeChar_len(recv)){ case 1: ret = *recv; break; case 2: ret = 0 | ((val & 0x00001F00) >> 2) | (val & 0x0000003F); break; case 3: ret = 0 | ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F); break; case 4: ret = 0 | ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F); break; } unsigned char* rt = (unsigned char*) &ret; return ret; `} # Warning : This does not follow the Unicode specification for now # # TODO: Support Unicode-compliant comparison redef fun <(o) do return self.code_point < o.code_point # Returns an upper-case version of self # # NOTE : Works only on ASCII chars # TODO : Support unicode for to_upper fun to_upper: UnicodeChar import UnicodeChar.code_point `{ if(*recv < 97 || *recv > 122){ return recv; } uint32_t* ret = calloc(1,4); *ret = *recv - 32; return ret; `} # Returns an lower-case version of self # # NOTE : Works only on ASCII chars # TODO : Support unicode for to_upper fun to_lower: UnicodeChar import UnicodeChar.code_point `{ if(*recv < 65 || *recv > 90){ return recv; } uint32_t* ret = calloc(1,4); *ret = *recv + 32; return ret; `} redef fun ==(o) do if not o isa UnicodeChar then return false if o.code_point == self.code_point then return true return false end redef fun output import UnicodeChar.len `{ uint32_t self = *recv; if(!IS_BIG_ENDIAN){ uint32_t tmp = ntohl(self); memcpy(&self, &tmp, 4); } unsigned char* s = (unsigned char*) &self; switch(UnicodeChar_len(recv)){ case 1: printf("%c", s[3]); break; case 2: printf("%c%c", s[2], s[3]); break; case 3: printf("%c%c%c", s[1], s[2], s[3]); break; case 4: printf("%c%c%c%c", s[0], s[1], s[2], s[3]); break; } `} redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{ int len = UnicodeChar_len(recv); char* r = malloc(len + 1); r[len] = '\0'; uint32_t src = *recv; if(!IS_BIG_ENDIAN){ uint32_t tmp = htonl(src); memcpy(&src, &tmp, 4); } unsigned char* s = (unsigned char*) &src; switch(len){ case 1: memcpy(r, s+3, 1); break; case 2: memcpy(r, s+2, 2); break; case 3: memcpy(r, s+1, 3); break; case 4: memcpy(r, s, 4); break; } return new_FlatString_full(r, 0, len - 1, len, 1); `} end # Used to keep track of the last accessed char in a String class CharCache # The position (as in char) of a String var position: Int # The position in the NativeString underlying the String var bytepos: Int end class FlatStringReviter super IndexedIterator[UnicodeChar] # The NativeString to iterate upon private var ns: NativeString # The position in the string private var pos: Int # The position in the native string private var bytepos: Int init(s: FlatString) do from(s, s.length - 1) init from(s: FlatString, position: Int) do ns = s.items pos = position bytepos = s.byte_index(position) end redef fun next do bytepos -= 1 while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do bytepos -= 1 end pos -= 1 end redef fun index do return pos redef fun item do return new UnicodeChar.from_ns(ns, bytepos) redef fun is_ok do return pos >= 0 end class FlatStringIter super IndexedIterator[UnicodeChar] private var ns: NativeString private var pos: Int private var bytepos: Int private var slen: Int private var it: UnicodeChar private var is_created: Bool init(s: FlatString) do from(s, 0) init from(s: FlatString, position: Int) do ns = s.items pos = position bytepos = s.byte_index(position) slen = s.length end redef fun index do return pos redef fun is_ok do return pos < slen redef fun item do if not is_created then it = new UnicodeChar.from_ns(ns, bytepos) is_created = true end return it end redef fun next do if not is_created then it = new UnicodeChar.from_ns(ns, bytepos) end is_created = false var pace = it.len pos += 1 bytepos += pace end end redef class FlatString redef type OTHER: FlatString # Length in bytes of the string (e.g. the length of the C string) redef var bytelen: Int # Cache for the last accessed character in the char var cache = new CharCache(-1,-1) redef var length = length_l is lazy private init full(items: NativeString, from, to, bytelen, len: Int) do self.items = items index_from = from index_to = to self.bytelen = bytelen length = len end # Length implementation private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{ char* ns = FlatString_items(recv); int i = FlatString_index_from(recv); int max = FlatString_index_to(recv); int length = 0; while(i <= max){ char c = ns[i]; if((c & 0x80) == 0) { i+= 1; } else if((c & 0xE0) == 0xC0) { i += 2; } else if((c & 0xF0) == 0xE0) { i += 3; } else if((c & 0xF7) == 0xF0) { i += 4; } else { i += 1; } length ++; } return length; `} redef fun <(o) do var o_pos = 0 var olen = o.length for i in [0 .. length[ do if o_pos >= olen then return false if char_at(i) > o.char_at(i) then return false if char_at(i) < o.char_at(i) then return true end return false end redef fun ==(o) do if o == null then return false if not o isa FlatString then return super var mylen = length var itslen = o.length if mylen != itslen then return false var mypos = 0 var itspos = 0 while mypos < mylen do if char_at(mypos) != o.char_at(itspos) then return false mypos += 1 itspos += 1 end return true end private fun byte_index(index: Int): Int do assert index >= 0 assert index < length # Find best insertion point var delta_begin = index var delta_end = (length - 1) - index var delta_cache = (cache.position - index).abs var min = delta_begin if delta_cache < min then min = delta_cache if delta_end < min then min = delta_end var ns_i: Int var my_i: Int var myits = items if min == delta_begin then ns_i = index_from my_i = 0 else if min == delta_cache then ns_i = cache.bytepos my_i = cache.position else ns_i = index_to my_i = length end while my_i < index do if myits[ns_i].ascii.bin_and(0x80) == 0 then ns_i += 1 else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then ns_i += 2 else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then ns_i += 3 else if myits[ns_i].ascii.bin_and(0xF7) == 0xF0 then ns_i += 4 else ns_i += 1 end my_i += 1 end while my_i > index do if myits[ns_i].ascii.bin_and(0xC0) != 0x80 then my_i -= 1 if my_i == index then break end ns_i -= 1 end cache.position = index cache.bytepos = ns_i return ns_i end fun char_at(pos: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_index(pos)) end private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do self.items = items self.index_from = index_from self.index_to = index_to self.bytelen = bytelen end redef fun reversed do var new_str = new NativeString(bytelen) var s_pos = bytelen var my_pos = index_from var its = items for i in [0..length[ do var c = char_at(i).len s_pos -= c its.copy_to(new_str, c, my_pos, s_pos) my_pos += c end return new FlatString.full(new_str, 0, bytelen - 1, bytelen, length) end redef fun to_upper do var ns = new NativeString(bytelen) var offset = 0 for i in [0 .. length[ do var c = char_at(i) c.to_upper.to_s.items.copy_to(ns, c.len, 0, offset) offset += c.len end return new FlatString.full(ns, 0, bytelen - 1, bytelen, length) end redef fun to_lower do var ns = new NativeString(bytelen) var offset = 0 for i in [0 .. length[ do var c = char_at(i) c.to_lower.to_s.items.copy_to(ns, c.len, 0, offset) offset += c.len end return new FlatString.full(ns, 0, bytelen - 1, bytelen, length) end redef fun +(o) do if o isa Buffer then o = o.to_s if o isa FlatString then var new_str = new NativeString(bytelen + o.bytelen + 1) var new_bytelen = bytelen + o.bytelen new_str[new_bytelen] = '\0' var newlen = length + o.length items.copy_to(new_str, bytelen, index_from, 0) o.items.copy_to(new_str, o.bytelen, o.index_from, bytelen) return new FlatString.full(new_str, 0, new_bytelen - 1, new_bytelen, newlen) else if o isa Concat then return new Concat(self, o) else # If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant abort end end redef fun *(i) do var mybtlen = bytelen var new_bytelen = mybtlen * i var mylen = length var newlen = mylen * i var ns = new NativeString(new_bytelen + 1) ns[new_bytelen] = '\0' var offset = 0 while i > 0 do items.copy_to(ns, bytelen, index_from, offset) offset += mybtlen i -= 1 end return new FlatString.full(ns, 0, new_bytelen - 1, new_bytelen, newlen) end # O(n) redef fun substring(from: Int, count: Int) do assert count >= 0 if from < 0 then count += from if count < 0 then count = 0 from = 0 end if count == 0 then return empty var real_from = byte_index(from) var lst = from + count - 1 if lst > length - from then return new FlatString.with_bytelen(items, real_from, index_to, index_to - real_from) end var real_to = byte_index(lst) return new FlatString.full(items, real_from, real_to, (real_to + char_at(lst).len) - real_from, count) end redef fun to_cstring do if real_items != null then return real_items.as(not null) var new_items = new NativeString(bytelen + 1) self.items.copy_to(new_items, bytelen, index_from, 0) new_items[bytelen] = '\0' self.real_items = new_items return new_items end end redef class Text # Length of the string, in bytes fun bytelen: Int is abstract end redef class FlatBuffer redef var bytelen: Int redef init from(s) do if s isa Concat then with_capacity(50) for i in s.substrings do self.append(i) end items = new NativeString(s.bytelen) if s isa FlatString then s.items.copy_to(items, s.bytelen, s.index_from, 0) else s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0) end length = s.length bytelen = s.bytelen capacity = s.bytelen end # Replaces the char at `index` by `item` fun char_at=(index: Int, item: UnicodeChar) do is_dirty = true if index == length then add_unicode item return end assert index >= 0 and index < length var ip = byte_at(index) var c = char_at_byte(ip) var size_diff = item.len - c.len if size_diff > 0 then rshift_bytes(ip + c.len, size_diff) else if size_diff < 0 then lshift_bytes(ip + c.len, -size_diff) end var s = item.to_s s.items.copy_to(items, s.bytelen, 0, ip) end # Shifts the content of the buffer by `len` bytes to the right, starting at byte `from` fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{ long bt = FlatBuffer_bytelen(recv); char* ns = FlatBuffer_items(recv); int off = from + len; memmove(ns + off, ns + from, bt - from); FlatBuffer_bytelen__assign(recv, bt + len); `} # Shifts the content of the buffer by `len` bytes to the left, starting at `from` fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{ long bt = FlatBuffer_bytelen(recv); char* ns = FlatBuffer_items(recv); int off = from - len; memmove(ns + off, ns + from, bt - from); FlatBuffer_bytelen__assign(recv, bt - len); `} # Get the Unicode char stored at `index` in `self` fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index)) # Get the Unicode char stored at `index` (bytewise) in `self` fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index) # Add equivalent that supports Unicode fun add_unicode(c: UnicodeChar) do var s = c.to_s if s.bytelen + bytelen > capacity then enlarge(s.bytelen) s.items.copy_to(items, s.bytelen, 0, bytelen) end # Gets the byte index (in NativeString) of the char stored at `i` fun byte_at(i: Int): Int do assert i < length and i >= 0 var ns_i = 0 var real_i = 0 while real_i < i do if items[ns_i].ascii.bin_and(0x80) == 0 then ns_i += 1 else if items[ns_i].ascii.bin_and(0xE0) == 0xC0 then ns_i += 2 else if items[ns_i].ascii.bin_and(0xF0) == 0xE0 then ns_i += 3 else if items[ns_i].ascii.bin_and(0xF7) == 0xF0 then ns_i += 4 else ns_i += 1 end real_i += 1 end return ns_i end redef fun enlarge(cap) do var c = capacity if cap <= c then return while c <= cap do c = c * 2 + 2 var a = new NativeString(c+1) if bytelen > 0 then items.copy_to(a, bytelen, 0, 0) items = a capacity = c end redef fun append(s) do if s isa Concat then for i in s.substrings do append i end var i = s.as(FlatString) var blen = bytelen var iblen = i.bytelen var newlen = blen + iblen if newlen > capacity then enlarge(newlen) end i.items.copy_to(items, iblen, i.index_from, blen) bytelen += iblen length += i.length end redef fun reverse do var nns = new NativeString(bytelen) var ns = items var btlen = bytelen var myp = 0 var itsp = btlen while myp < btlen do var c = char_at_byte(myp).len itsp -= c ns.copy_to(nns, c, myp, itsp) myp += c end items = nns end redef fun clear do length = 0 bytelen = 0 end redef fun copy(s, l, d, ns) do if not d isa FlatBuffer then # This implementation here is only concerned by the FlatBuffer # If you implement a new Buffer subclass, make sure to support this operation via refinement. abort end var rs = byte_at(s) var re = byte_at(s + l - 1) var rl = re - rs var rns = d.byte_at(ns) items.copy_to(d.items, rl, rns, rs) end redef fun times(i) do var len = bytelen var off = len var newlen = len * i if newlen > capacity then enlarge(newlen) for j in [1 .. i[ do items.copy_to(items, len, 0, off) off += len end bytelen = newlen length = length * i end redef fun upper do for i in [0 .. length[ do var pos = byte_at(i) var c = char_at_byte(pos) var d = c.to_upper if c == d then continue d.to_s.items.copy_to(items, 1, 0, pos) end end redef fun lower do for i in [0 .. length[ do var pos = byte_at(i) var c = char_at_byte(pos) var d = c.to_lower if c == d then continue d.to_s.items.copy_to(items, 1, 0, pos) end end redef fun to_cstring do var ns = new NativeString(bytelen) items.copy_to(ns, bytelen, 0, 0) return ns end end redef class NativeString redef fun to_s: FlatString do var len = cstring_length return to_s_with_length(len) end redef fun to_s_with_length(len: Int): FlatString do return new FlatString.with_bytelen(self, 0, len - 1, len) end redef fun to_s_with_copy do var length = cstring_length var new_self = new NativeString(length + 1) copy_to(new_self, length, 0, 0) return new FlatString.with_bytelen(new_self, 0, length - 1, length) end end redef class OFStream redef fun write(s) do assert is_writable if s isa FlatText then write_native(s.to_cstring, s.bytelen) else for i in s.substrings do write_native(i.to_cstring, i.length) end end