# UTF-8 character as defined in RFC 3629, i.e. encoded on 1 to 4 bytes
extern class UnicodeChar `{ uint32_t* `}
+ super Comparable
+
+ redef type OTHER: UnicodeChar
# Transforms a byte-variable char* character to its uint32_t equivalent
new from_ns(ns: NativeString, index: Int) `{
#
# As per the specification :
#
+ # ~~~raw
# Length | UTF-8 octet sequence
# | (binary)
# ---------+-------------------------------------------------
# 2 | 110xxxxx 10xxxxxx
# 3 | 1110xxxx 10xxxxxx 10xxxxxx
# 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ # ~~~
fun len: Int `{
- uint32_t s = *recv;
+ uint32_t s = *self;
if(s <= 127) {return 1;}
if(s >= 49280 && s <= 57279) {return 2;}
if(s >= 14712960 && s <= 15712191) {return 3;}
# Returns the Unicode code point representing the character
#
# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
- fun code_point: Int `{
- uint32_t val = *recv;
+ fun code_point: Int import UnicodeChar.len `{
+ uint32_t val = *self;
uint32_t ret = 0;
- switch(UnicodeChar_len(recv)){
+ switch(UnicodeChar_len(self)){
case 1:
- ret = *recv;
+ ret = *self;
break;
case 2:
ret = 0 | ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
return ret;
`}
+	# Orders two characters according to their code points only.
+	#
+	# Warning : This does not follow the Unicode specification for now
+	#
+	# TODO: Support Unicode-compliant comparison
+	redef fun <(o)
+	do
+		return code_point < o.code_point
+	end
+
# Returns an upper-case version of self
#
# Only the values 97 to 122 (ASCII `a` to `z`) are converted, by subtracting 32;
# the result is then stored in a newly allocated character.
# Any other value is returned as-is: the receiver itself, not a copy.
#
# NOTE : Works only on ASCII chars
# TODO : Support unicode for to_upper
fun to_upper: UnicodeChar import UnicodeChar.code_point `{
- if(*recv < 97 || *recv > 122){ return recv; }
+ if(*self < 97 || *self > 122){ return self; }
uint32_t* ret = calloc(1,4);
- *ret = *recv - 32;
+ *ret = *self - 32;
return ret;
`}
# Returns a lower-case version of self
#
# Only the values 65 to 90 (ASCII `A` to `Z`) are converted, by adding 32;
# the result is then stored in a newly allocated character.
# Any other value is returned as-is: the receiver itself, not a copy.
#
# NOTE : Works only on ASCII chars
# TODO : Support unicode for to_lower
fun to_lower: UnicodeChar import UnicodeChar.code_point `{
- if(*recv < 65 || *recv > 90){ return recv; }
+ if(*self < 65 || *self > 90){ return self; }
uint32_t* ret = calloc(1,4);
- *ret = *recv + 32;
+ *ret = *self + 32;
return ret;
`}
end
# Prints `self` on the standard output
#
# The value is copied in a local (`self0`) and byte-swapped on little-endian hosts
# so that its bytes can be addressed one by one through `s`.
redef fun output import UnicodeChar.len `{
- uint32_t self = *recv;
+ uint32_t self0 = *self;
if(!IS_BIG_ENDIAN){
- uint32_t tmp = ntohl(self);
- memcpy(&self, &tmp, 4);
+ uint32_t tmp = ntohl(self0);
+ memcpy(&self0, &tmp, 4);
}
- unsigned char* s = (unsigned char*) &self;
- switch(UnicodeChar_len(recv)){
+ unsigned char* s = (unsigned char*) &self0;
+ /* `UnicodeChar_len` takes the receiver (a `uint32_t*`), not the local byte-swapped copy */
+ switch(UnicodeChar_len(self)){
case 1:
printf("%c", s[3]);
break;
`}
redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
- int len = UnicodeChar_len(recv);
+ int len = UnicodeChar_len(self);
char* r = malloc(len + 1);
r[len] = '\0';
- uint32_t src = *recv;
+ uint32_t src = *self;
if(!IS_BIG_ENDIAN){
uint32_t tmp = htonl(src);
memcpy(&src, &tmp, 4);
`}
end
+# Used to keep track of the last accessed char in a String
+#
+# Pairs a position counted in characters with the matching position counted in bytes.
+# NOTE(review): the declaration order of the two attributes is presumably what
+# `new CharCache(-1,-1)` relies on; keep `position` first.
+class CharCache
+ # The position (as in char) of a String
+ var position: Int
+ # The position in the NativeString underlying the String
+ var bytepos: Int
+end
+
+# Iterator going backwards over the characters of a `FlatString`
+class FlatStringReviter
+	super IndexedIterator[UnicodeChar]
+
+	# The NativeString to iterate upon
+	private var ns: NativeString
+
+	# The position in the string
+	private var pos: Int
+
+	# The position in the native string
+	private var bytepos: Int
+
+	# Iterates on `s`, from its last character down to the first one
+	init(s: FlatString) do from(s, s.length - 1)
+
+	# Iterates on `s`, from the character at `position` down to the first one
+	#
+	# A negative `position` (e.g. the last position of an empty string)
+	# gives an iterator that is already exhausted.
+	init from(s: FlatString, position: Int)
+	do
+		ns = s.items
+		pos = position
+		if position < 0 then
+			# Nothing to iterate on: `byte_index` would reject the position,
+			# and `bytepos` is never read since `is_ok` is false.
+			bytepos = s.index_from
+		else
+			bytepos = s.byte_index(position)
+		end
+	end
+
+	redef fun next
+	do
+		pos -= 1
+		# Past the first character: do not walk (and read) before the string
+		if pos < 0 then return
+		# Step back to the first byte of the previous character,
+		# skipping the continuation bytes (`10xxxxxx`)
+		bytepos -= 1
+		while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
+			bytepos -= 1
+		end
+	end
+
+	redef fun index do return pos
+
+	redef fun item do return new UnicodeChar.from_ns(ns, bytepos)
+
+	redef fun is_ok do return pos >= 0
+end
+
+# Iterator going forward over the characters of a `FlatString`
+class FlatStringIter
+	super IndexedIterator[UnicodeChar]
+
+	# The NativeString to iterate upon
+	private var ns: NativeString
+
+	# The position in the string, in characters
+	private var pos: Int
+
+	# The position in the native string, in bytes
+	private var bytepos: Int
+
+	# Length of the iterated string, in characters
+	private var slen: Int
+
+	# Character at the current position, meaningful only when `is_created`
+	private var it: UnicodeChar
+
+	# Has `it` been built for the current position?
+	private var is_created = false
+
+	# Iterates on `s` from its first character
+	init(s: FlatString) do from(s, 0)
+
+	# Iterates on `s` from the character at `position`
+	#
+	# A `position` equal to (or past) the length of `s`, which includes position 0
+	# of an empty string, gives an iterator that is already exhausted.
+	init from(s: FlatString, position: Int) do
+		ns = s.items
+		pos = position
+		slen = s.length
+		if position < slen then
+			bytepos = s.byte_index(position)
+		else
+			# Nothing to iterate on: `byte_index` would reject the position,
+			# and `bytepos` is not read while `is_ok` is false.
+			bytepos = s.index_to + 1
+		end
+	end
+
+	redef fun index do return pos
+
+	redef fun is_ok do return pos < slen
+
+	redef fun item do
+		# Build the character lazily, at most once per position
+		if not is_created then
+			it = new UnicodeChar.from_ns(ns, bytepos)
+			is_created = true
+		end
+		return it
+	end
+
+	redef fun next
+	do
+		# The current character is needed to know how many bytes to skip
+		if not is_created then
+			it = new UnicodeChar.from_ns(ns, bytepos)
+		end
+		is_created = false
+		var pace = it.len
+		pos += 1
+		bytepos += pace
+	end
+end
+
redef class FlatString
+ redef type OTHER: FlatString
+
# Length in bytes of the string (e.g. the length of the C string)
- var bytelen: Int
+ redef var bytelen
+
+ # Cache for the last accessed character in the string
+ var cache = new CharCache(-1,-1)
redef var length = length_l is lazy
# Length implementation
private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
- char* ns = FlatString_items(recv);
- int i = FlatString_index_from(recv);
- int max = FlatString_index_to(recv);
+ char* ns = FlatString_items(self);
+ int i = FlatString_index_from(self);
+ int max = FlatString_index_to(self);
int length = 0;
while(i <= max){
char c = ns[i];
return length;
`}
- private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int)
+ # Is `self` strictly before `o`, comparing the strings character by character?
+ #
+ # The first differing character decides.
+ # When one string is a prefix of the other, the shorter one comes first.
+ redef fun <(o)
do
+ var olen = o.length
+ for i in [0 .. length[ do
+ # `o` is exhausted first: it is a strict prefix of `self`
+ if i >= olen then return false
+ var mine = char_at(i)
+ var its = o.char_at(i)
+ if mine > its then return false
+ if mine < its then return true
+ end
+ # All the chars of `self` matched: `self` is smaller only if `o` is longer
+ return length < olen
+ end
+
+	# Is `o` a `FlatString` made of the same characters, in the same order?
+	#
+	# `null` is never equal to `self`.
+	# Objects that are not `FlatString` are left to the inherited implementation.
+	redef fun ==(o) do
+		if o == null then return false
+		if not o isa FlatString then return super
+		var char_count = length
+		if o.length != char_count then return false
+		# Same length on both sides: one index walks both strings
+		for i in [0 .. char_count[ do
+			if char_at(i) != o.char_at(i) then return false
+		end
+		return true
+	end
+
+	# Index in `items` of the first byte of the character at position `index`
+	#
+	# The walk starts from the closest known position: the beginning of the string,
+	# its end, or the position recorded in `cache` by the previous call.
+	# `cache` is updated with the result before returning.
+	#
+	# Fails an assertion unless `index >= 0 and index < length`.
+	private fun byte_index(index: Int): Int do
+		assert index >= 0
+		assert index < length
+
+		# Distance, in chars, between `index` and each possible starting point
+		var delta_begin = index
+		var delta_end = (length - 1) - index
+		var delta_cache = (cache.position - index).abs
+		var min = delta_begin
+
+		if delta_cache < min then min = delta_cache
+		if delta_end < min then min = delta_end
+
+		# Invariant of both walks: `ns_i` is the index of the first byte of char `my_i`
+		# (the byte right after `index_to` standing for the one-past-the-end char `length`)
+		var ns_i: Int
+		var my_i: Int
+		var myits = items
+
+		if min == delta_begin then
+			ns_i = index_from
+			my_i = 0
+		else if min == delta_cache then
+			ns_i = cache.bytepos
+			my_i = cache.position
+		else
+			ns_i = index_to + 1
+			my_i = length
+		end
+
+		# Forward walk: the lead byte gives the number of bytes of each char
+		while my_i < index do
+			var lead = myits[ns_i].ascii
+			if lead.bin_and(0x80) == 0 then
+				ns_i += 1
+			else if lead.bin_and(0xE0) == 0xC0 then
+				ns_i += 2
+			else if lead.bin_and(0xF0) == 0xE0 then
+				ns_i += 3
+			else if lead.bin_and(0xF8) == 0xF0 then
+				# `11110xxx`: the three low bits belong to the code point, hence the 0xF8 mask
+				ns_i += 4
+			else
+				ns_i += 1
+			end
+			my_i += 1
+		end
+
+		# Backward walk: go to the previous byte, then skip the continuation
+		# bytes (`10xxxxxx`) down to the first byte of the previous char
+		while my_i > index do
+			ns_i -= 1
+			while myits[ns_i].ascii.bin_and(0xC0) == 0x80 do
+				ns_i -= 1
+			end
+			my_i -= 1
+		end
+
+		cache.position = index
+		cache.bytepos = ns_i
+
+		return ns_i
+	end
+
+	# The character at position `pos`, counted in characters (not in bytes)
+	fun char_at(pos: Int): UnicodeChar do
+		var byte_pos = byte_index(pos)
+		return new UnicodeChar.from_ns(items, byte_pos)
+	end
+
+ # Builds a string on the bytes `index_from` to `index_to` of `items`, `bytelen` being its size in bytes
+ #
+ # Only these four attributes are assigned here.
+ private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
self.items = items
self.index_from = index_from
self.index_to = index_to
self.bytelen = bytelen
end
- redef fun to_cstring
- do
+	# A new string with the characters of `self` in the opposite order
+	#
+	# Characters are moved as whole byte groups, from the front of `items`
+	# to the back of the new native string.
+	redef fun reversed do
+		var total = bytelen
+		var reversed_ns = new NativeString(total)
+		var src = items
+		var read_at = index_from
+		var write_at = total
+		var nchars = length
+		var i = 0
+		while i < nchars do
+			var width = char_at(i).len
+			write_at -= width
+			src.copy_to(reversed_ns, width, read_at, write_at)
+			read_at += width
+			i += 1
+		end
+		return new FlatString.full(reversed_ns, 0, total - 1, total, nchars)
+	end
+
+	# A new string where each character is replaced by its `UnicodeChar::to_upper`
+	#
+	# Each converted character is written on as many bytes as the original one.
+	redef fun to_upper do
+		var total = bytelen
+		var ns = new NativeString(total)
+		var nchars = length
+		var offset = 0
+		var i = 0
+		while i < nchars do
+			var c = char_at(i)
+			var converted = c.to_upper.to_s
+			var width = c.len
+			converted.items.copy_to(ns, width, 0, offset)
+			offset += width
+			i += 1
+		end
+		return new FlatString.full(ns, 0, total - 1, total, nchars)
+	end
+
+	# A new string where each character is replaced by its `UnicodeChar::to_lower`
+	#
+	# Each converted character is written on as many bytes as the original one.
+	redef fun to_lower do
+		var total = bytelen
+		var ns = new NativeString(total)
+		var nchars = length
+		var offset = 0
+		var i = 0
+		while i < nchars do
+			var c = char_at(i)
+			var converted = c.to_lower.to_s
+			var width = c.len
+			converted.items.copy_to(ns, width, 0, offset)
+			offset += width
+			i += 1
+		end
+		return new FlatString.full(ns, 0, total - 1, total, nchars)
+	end
+
+	# Concatenation of `self` and `o`
+	#
+	# A `Buffer` is first converted with `to_s`.
+	# Two flat strings are copied side by side in a new null-terminated native string;
+	# a `Concat` operand gives a new `Concat`. Anything else aborts.
+	redef fun +(o) do
+		if o isa Buffer then o = o.to_s
+		if o isa FlatString then
+			var my_bytes = bytelen
+			var its_bytes = o.bytelen
+			var total_bytes = my_bytes + its_bytes
+			var joined = new NativeString(total_bytes + 1)
+			items.copy_to(joined, my_bytes, index_from, 0)
+			o.items.copy_to(joined, its_bytes, o.index_from, my_bytes)
+			joined[total_bytes] = '\0'
+			var total_chars = length + o.length
+			return new FlatString.full(joined, 0, total_bytes - 1, total_bytes, total_chars)
+		else if o isa Concat then
+			return new Concat(self, o)
+		else
+			# Reaching this point means that another String implementation is involved:
+			# the + operation has to be supported for this variant
+			abort
+		end
+	end
+
+	# A new string made of `i` copies of `self`, one after the other
+	#
+	# The bytes are copied in a new null-terminated native string.
+	redef fun *(i) do
+		var unit_bytes = bytelen
+		var total_bytes = unit_bytes * i
+		var total_chars = length * i
+		var ns = new NativeString(total_bytes + 1)
+		ns[total_bytes] = '\0'
+		var offset = 0
+		for n in [0 .. i[ do
+			items.copy_to(ns, unit_bytes, index_from, offset)
+			offset += unit_bytes
+		end
+		return new FlatString.full(ns, 0, total_bytes - 1, total_bytes, total_chars)
+	end
+
+	# The `count` characters of `self` starting at position `from` (in characters)
+	#
+	# A negative `from` is clamped to 0, `count` being shortened accordingly.
+	# A range going past the end of `self` is truncated to the end;
+	# a range starting past the end gives the empty string.
+	# The result shares `items` with `self`, no byte is copied.
+	#
+	# O(n)
+	redef fun substring(from, count) do
+		assert count >= 0
+
+		if from < 0 then
+			count += from
+			if count < 0 then count = 0
+			from = 0
+		end
+
+		var len = length
+
+		# Nothing to take: `byte_index` only accepts positions inside the string
+		if count == 0 or from >= len then return empty
+
+		var real_from = byte_index(from)
+
+		# Position (in chars) of the last requested character
+		var lst = from + count - 1
+
+		if lst >= len then
+			# The request runs past the end: keep everything from `real_from`.
+			# `index_to` is the last byte of the string, hence the `+ 1`.
+			return new FlatString.with_bytelen(items, real_from, index_to, index_to - real_from + 1)
+		end
+
+		# `byte_index` gives the first byte of the last char;
+		# the substring ends on the last byte of that char.
+		var real_to = byte_index(lst) + char_at(lst).len - 1
+
+		return new FlatString.full(items, real_from, real_to, real_to - real_from + 1, count)
+	end
+
+ # Null-terminated copy of the bytes of `self`
+ #
+ # The copy is built on the first call and kept in `real_items` for the next ones.
+ redef fun to_cstring do
if real_items != null then return real_items.as(not null)
- var new_items = calloc_string(bytelen + 1)
+ var new_items = new NativeString(bytelen + 1)
self.items.copy_to(new_items, bytelen, index_from, 0)
new_items[bytelen] = '\0'
self.real_items = new_items
+ return new_items
end
end
+redef class Text
+
+ # Length of the string, in bytes
+ #
+ # Declared abstract here; `FlatString` and `FlatBuffer` redefine it as an attribute.
+ fun bytelen: Int is abstract
+
+end
+
+redef class FlatBuffer
+
+ redef var bytelen
+
+	# Builds a buffer holding a copy of the content of `s`
+	#
+	# A `Concat` is appended substring by substring to an empty buffer.
+	# A `FlatString` or a `FlatBuffer` has its bytes copied in one go,
+	# the capacity being then exactly `s.bytelen`.
+	redef init from(s) do
+		if s isa Concat then
+			with_capacity(50)
+			# The buffer is empty at this point, `append` counts from there
+			bytelen = 0
+			for i in s.substrings do self.append(i)
+			# Everything is copied: the code below only applies to flat texts
+			return
+		end
+		items = new NativeString(s.bytelen)
+		if s isa FlatString then
+			s.items.copy_to(items, s.bytelen, s.index_from, 0)
+		else
+			s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0)
+		end
+		length = s.length
+		bytelen = s.bytelen
+		capacity = s.bytelen
+	end
+
+	# Replaces the char at `index` by `item`
+	#
+	# `index` is counted in characters. When it is equal to `length`, `item` is added at the end.
+	# When `item` and the replaced char do not use the same number of bytes,
+	# the bytes located after the replaced char are shifted accordingly.
+	fun char_at=(index: Int, item: UnicodeChar) do
+		is_dirty = true
+		if index == length then
+			add_unicode item
+			return
+		end
+		assert index >= 0 and index < length
+		var ip = byte_at(index)
+		var c = char_at_byte(ip)
+		var size_diff = item.len - c.len
+		if size_diff > 0 then
+			# The content grows: make room before shifting the tail to the right
+			var needed = bytelen + size_diff
+			if needed > capacity then enlarge(needed)
+			rshift_bytes(ip + c.len, size_diff)
+		else if size_diff < 0 then
+			lshift_bytes(ip + c.len, -size_diff)
+		end
+		var s = item.to_s
+		s.items.copy_to(items, s.bytelen, 0, ip)
+	end
+
+	# Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
+	#
+	# `bytelen` is increased by `len`.
+	# The capacity is not checked here: `items` must already have room for `len` more bytes.
+	fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
+		long bt = FlatBuffer_bytelen(self);
+		char* ns = FlatBuffer_items(self);
+		/* Same integer type as `from`, `len` and `bt`: no truncation of the offset */
+		long off = from + len;
+		/* Source and destination overlap, hence memmove */
+		memmove(ns + off, ns + from, bt - from);
+		FlatBuffer_bytelen__assign(self, bt + len);
+	`}
+
+	# Shifts the content of the buffer by `len` bytes to the left, starting at byte `from`
+	#
+	# The `len` bytes located before `from` are overwritten and `bytelen` is decreased by `len`.
+	fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
+		long bt = FlatBuffer_bytelen(self);
+		char* ns = FlatBuffer_items(self);
+		/* Same integer type as `from`, `len` and `bt`: no truncation of the offset */
+		long off = from - len;
+		/* Source and destination overlap, hence memmove */
+		memmove(ns + off, ns + from, bt - from);
+		FlatBuffer_bytelen__assign(self, bt - len);
+	`}
+
+	# The Unicode char at position `index` of `self`, `index` being counted in characters
+	fun char_at(index: Int): UnicodeChar do
+		var byte_pos = byte_at(index)
+		return new UnicodeChar.from_ns(items, byte_pos)
+	end
+
+	# The Unicode char starting at byte `index` of `items`
+	fun char_at_byte(index: Int): UnicodeChar do
+		var native = items
+		return new UnicodeChar.from_ns(native, index)
+	end
+
+	# Add equivalent that supports Unicode
+	#
+	# Copies the bytes of `c` after the current content, enlarging the buffer when needed,
+	# then counts the new char in `length` and its bytes in `bytelen`.
+	fun add_unicode(c: UnicodeChar) do
+		var s = c.to_s
+		# `enlarge` takes the capacity to reach, not the number of bytes to add
+		var needed = bytelen + s.bytelen
+		if needed > capacity then enlarge(needed)
+		s.items.copy_to(items, s.bytelen, 0, bytelen)
+		bytelen = needed
+		length += 1
+	end
+
+	# Gets the byte index (in NativeString) of the char stored at `i`
+	#
+	# Walks `items` from its first byte, the lead byte of each char
+	# giving the number of bytes to skip.
+	#
+	# Fails an assertion unless `i >= 0 and i < length`.
+	fun byte_at(i: Int): Int do
+		assert i < length and i >= 0
+		var ns_i = 0
+		var real_i = 0
+		while real_i < i do
+			var lead = items[ns_i].ascii
+			if lead.bin_and(0x80) == 0 then
+				ns_i += 1
+			else if lead.bin_and(0xE0) == 0xC0 then
+				ns_i += 2
+			else if lead.bin_and(0xF0) == 0xE0 then
+				ns_i += 3
+			else if lead.bin_and(0xF8) == 0xF0 then
+				# `11110xxx`: the three low bits belong to the code point, hence the 0xF8 mask
+				ns_i += 4
+			else
+				ns_i += 1
+			end
+			real_i += 1
+		end
+		return ns_i
+	end
+
+	# Grows the buffer so that its capacity becomes greater than `cap`
+	#
+	# Does nothing when the capacity is already at least `cap`.
+	# Otherwise the capacity is repeatedly doubled (plus 2), and the current
+	# bytes are copied in a new native string of that capacity plus one byte.
+	redef fun enlarge(cap) do
+		var new_cap = capacity
+		if new_cap >= cap then return
+		while new_cap <= cap do new_cap = new_cap * 2 + 2
+		var grown = new NativeString(new_cap + 1)
+		if bytelen > 0 then items.copy_to(grown, bytelen, 0, 0)
+		items = grown
+		capacity = new_cap
+	end
+
+	# Adds the content of `s` at the end of `self`
+	#
+	# A `Concat` is appended substring by substring.
+	# Any other `s` must be a `FlatString` (the cast fails otherwise):
+	# its bytes are copied after the current content, enlarging the buffer when needed.
+	redef fun append(s) do
+		if s isa Concat then
+			for i in s.substrings do append i
+			# Everything is appended: the cast below only applies to flat strings
+			return
+		end
+		var i = s.as(FlatString)
+		var blen = bytelen
+		var iblen = i.bytelen
+		var newlen = blen + iblen
+		if newlen > capacity then
+			enlarge(newlen)
+		end
+		i.items.copy_to(items, iblen, i.index_from, blen)
+		bytelen += iblen
+		length += i.length
+	end
+
+	# Reverses the order of the characters of `self`
+	#
+	# Characters are moved as whole byte groups to a new native string that replaces `items`.
+	# That native string holds exactly `bytelen` bytes, so `capacity` is updated to match.
+	redef fun reverse
+	do
+		var btlen = bytelen
+		var nns = new NativeString(btlen)
+		var ns = items
+		var myp = 0
+		var itsp = btlen
+		while myp < btlen do
+			var c = char_at_byte(myp).len
+			itsp -= c
+			ns.copy_to(nns, c, myp, itsp)
+			myp += c
+		end
+		items = nns
+		# The previous capacity described the replaced native string
+		capacity = btlen
+	end
+
+	# Empties the buffer: both the byte count and the character count are reset to 0
+	#
+	# `items` and `capacity` are left untouched.
+	redef fun clear do
+		bytelen = 0
+		length = 0
+	end
+
+	# Copies the `l` characters of `self` starting at position `s` into `d`, at position `ns`
+	#
+	# Positions and `l` are counted in characters; the matching bytes are copied in one go.
+	# Nothing is copied when `l` is not positive.
+	# NOTE(review): the bytes of `d` are overwritten in place and neither its `bytelen`
+	# nor its `length` is updated; confirm that callers only overwrite existing content.
+	redef fun copy(s, l, d, ns) do
+		if not d isa FlatBuffer then
+			# This implementation here is only concerned by the FlatBuffer
+			# If you implement a new Buffer subclass, make sure to support this operation via refinement.
+			abort
+		end
+		if l <= 0 then return
+		# First byte of the first char to copy
+		var rs = byte_at(s)
+		# First byte of the last char to copy: its own bytes are part of the copy
+		var re = byte_at(s + l - 1)
+		var rl = re + char_at_byte(re).len - rs
+		var rns = d.byte_at(ns)
+		# `copy_to(dest, length, from, to)`: read from `rs` in `self`, write at `rns` in `d`
+		items.copy_to(d.items, rl, rs, rns)
+	end
+
+	# Repeats the content of `self` so that it appears `i` times
+	#
+	# The buffer is enlarged when needed, then the original bytes are copied
+	# `i - 1` times after themselves. `bytelen` and `length` are multiplied by `i`.
+	redef fun times(i) do
+		var unit = bytelen
+		var total = unit * i
+		if total > capacity then enlarge(total)
+		var write_at = unit
+		var copies = 1
+		while copies < i do
+			items.copy_to(items, unit, 0, write_at)
+			write_at += unit
+			copies += 1
+		end
+		bytelen = total
+		length = length * i
+	end
+
+	# Replaces in place each character of `self` by its `UnicodeChar::to_upper`
+	#
+	# Only the first byte of a converted character is overwritten.
+	# The byte position is carried from one char to the next instead of being
+	# searched again from the start of the buffer for each char.
+	redef fun upper do
+		var pos = 0
+		for i in [0 .. length[ do
+			var c = char_at_byte(pos)
+			var d = c.to_upper
+			if c != d then d.to_s.items.copy_to(items, 1, 0, pos)
+			pos += c.len
+		end
+	end
+
+	# Replaces in place each character of `self` by its `UnicodeChar::to_lower`
+	#
+	# Only the first byte of a converted character is overwritten.
+	# The byte position is carried from one char to the next instead of being
+	# searched again from the start of the buffer for each char.
+	redef fun lower do
+		var pos = 0
+		for i in [0 .. length[ do
+			var c = char_at_byte(pos)
+			var d = c.to_lower
+			if c != d then d.to_s.items.copy_to(items, 1, 0, pos)
+			pos += c.len
+		end
+	end
+
+	# Null-terminated copy of the bytes of `self`
+	#
+	# A new native string is built on each call.
+	redef fun to_cstring do
+		# One more byte than the content, for the terminator
+		var ns = new NativeString(bytelen + 1)
+		items.copy_to(ns, bytelen, 0, 0)
+		ns[bytelen] = '\0'
+		return ns
+	end
+end
+
redef class NativeString
redef fun to_s: FlatString
return to_s_with_length(len)
end
# Wraps `self` in a `FlatString` covering its `len` first bytes (bytes 0 to `len - 1`)
#
# `self` is used as-is: no byte is copied.
- redef fun to_s_with_length(len: Int): FlatString
+ redef fun to_s_with_length(len)
do
return new FlatString.with_bytelen(self, 0, len - 1, len)
end
# Builds a `FlatString` on a copy of the `cstring_length` first bytes of `self`
#
# The copy is explicitly null-terminated.
redef fun to_s_with_copy
do
var length = cstring_length
- var new_self = calloc_string(length + 1)
+ var new_self = new NativeString(length + 1)
copy_to(new_self, length, 0, 0)
+ new_self[length] = '\0'
return new FlatString.with_bytelen(new_self, 0, length - 1, length)
end
end
-redef class OFStream
+redef class FileWriter
# Writes the bytes of `s` to the file
#
# Sizes given to `write_native` are byte counts (`bytelen`), not character counts,
# so that multi-byte characters are written in full.
redef fun write(s)
do
- assert _writable
+ assert is_writable
if s isa FlatText then
- if s isa FlatString then
- write_native(s.to_cstring, s.bytelen)
- else
- write_native(s.to_cstring, s.length)
- end
+ write_native(s.to_cstring, s.bytelen)
else for i in s.substrings do write_native(i.to_cstring, i.bytelen)
end
end