+++ /dev/null
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Introduces UTF-8 as internal encoding for Strings in Nit.
-module utf8
-
-intrude import standard::string
-intrude import standard::file
-
-in "C Header" `{
-
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-typedef struct {
- long pos;
- char* ns;
-} UTF8Char;
-
-`}
-
-# UTF-8 character as defined in RFC 3629, i.e. a sequence of 1 to 4 bytes
-#
-# A UTF-8 char has its bytes stored in a NativeString (char*)
-extern class UTF8Char `{ UTF8Char* `}
-
- # Wraps the character that starts at byte `pos` of `ns`
- #
- # Only the pointer is stored; the bytes of `ns` are not copied.
- new(pos: Int, ns: NativeString) `{
-   UTF8Char* created = malloc(sizeof(*created));
-   created->ns = ns;
-   created->pos = pos;
-   return created;
- `}
-
- # Length in bytes of the UTF-8 sequence, derived from its lead byte
- #
- # As per the specification :
- #
- # ~~~raw
- # Length | UTF-8 octet sequence
- # | (binary)
- # ---------+-------------------------------------------------
- # 1 | 0xxxxxxx
- # 2 | 110xxxxx 10xxxxxx
- # 3 | 1110xxxx 10xxxxxx 10xxxxxx
- # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- # ~~~
- #
- # Any byte that is not a valid lead byte is reported as 1 byte long,
- # so that callers stepping through a string always make progress.
- private fun len: Int `{
-   /* Work on an unsigned byte so the masks never see sign-extension. */
-   unsigned char lead = (unsigned char)self->ns[self->pos];
-   if((lead & 0x80) == 0x00){ return 1; }
-   if((lead & 0xE0) == 0xC0){ return 2; }
-   if((lead & 0xF0) == 0xE0){ return 3; }
-   /* The mask must be 0xF8: with 0xF7 the invalid byte 0xF8 was
-    * accepted as the lead of a 4-byte sequence. */
-   if((lead & 0xF8) == 0xF0){ return 4; }
-   // Invalid character
-   return 1;
- `}
-
- # Byte offset of the first byte of this character in the wrapped `ns`
- private fun pos: Int `{
- return self->pos;
- `}
-
- # Sets the byte offset of this character in the wrapped `ns`
- private fun pos=(p: Int) `{self->pos = p;`}
-
- # The C `char*` this character points into (the whole buffer, not only this character)
- fun ns: NativeString `{
- return self->ns;
- `}
-
- # Returns the Unicode code point representing the character
- #
- # The payload bits of the lead byte and of each continuation byte are
- # concatenated, most significant first.
- #
- # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
- fun code_point: Int import UTF8Char.len `{
-   const unsigned char* b = (const unsigned char*)(self->ns + self->pos);
-   switch(UTF8Char_len(self)){
-     case 2:
-       return ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
-     case 3:
-       return ((b[0] & 0x0F) << 12) | ((b[1] & 0x3F) << 6) | (b[2] & 0x3F);
-     case 4:
-       return ((b[0] & 0x07) << 18) | ((b[1] & 0x3F) << 12) |
-              ((b[2] & 0x3F) << 6) | (b[3] & 0x3F);
-     default:
-       /* Length 1; the default label also guarantees that every
-        * path through this non-void function returns a value. */
-       return b[0] & 0x7F;
-   }
- `}
-
- # Returns an upper-case version of self
- #
- # Characters outside `a`..`z` are returned unchanged (`self` itself);
- # otherwise a new character backed by a fresh 2-byte buffer is returned.
- #
- # NOTE : Works only on ASCII chars
- # TODO : Support unicode for to_upper
- fun to_upper: UTF8Char import UTF8Char.code_point `{
-   long cp = UTF8Char_code_point(self);
-   if(cp >= 97 && cp <= 122){
-     UTF8Char* upper = malloc(sizeof(UTF8Char));
-     char* buf = malloc(2);
-     buf[0] = self->ns[self->pos] - 32;
-     buf[1] = '\0';
-     upper->ns = buf;
-     upper->pos = 0;
-     return upper;
-   }
-   return self;
- `}
-
- # Returns a lower-case version of self
- #
- # Characters outside `A`..`Z` are returned unchanged (`self` itself);
- # otherwise a new character backed by a fresh 2-byte buffer is returned.
- #
- # NOTE : Works only on ASCII chars
- # TODO : Support unicode for to_lower
- fun to_lower: UTF8Char import UTF8Char.code_point `{
-   long cp = UTF8Char_code_point(self);
-   if(cp >= 65 && cp <= 90){
-     UTF8Char* lower = malloc(sizeof(UTF8Char));
-     char* buf = malloc(2);
-     buf[0] = self->ns[self->pos] + 32;
-     buf[1] = '\0';
-     lower->ns = buf;
-     lower->pos = 0;
-     return lower;
-   }
-   return self;
- `}
-
- # Equality against a `Char` (single-byte characters only) or another `UTF8Char`
- #
- # Two characters are equal when they have the same byte length and code point.
- redef fun ==(o)
- do
-   if o isa Char then
-     if len == 1 then return code_point == o.ascii
-     return false
-   end
-   if o isa UTF8Char then
-     if len == o.len then return code_point == o.code_point
-     return false
-   end
-   return false
- end
-
- # Writes the raw bytes of the character on the standard output
- #
- # The callback actually used by the C body (`UTF8Char.len`) is the one
- # declared in the `import` clause.
- redef fun output import UTF8Char.len `{
-   /* The 1 to 4 bytes of the character are contiguous in `ns`. */
-   fwrite(self->ns + self->pos, 1, (size_t)UTF8Char_len(self), stdout);
- `}
-
- # Builds a one-character String from a private copy of the bytes of `self`
- #
- # `UTF8Char.len` is reached through its declared callback rather than
- # through the compiler-internal mangled symbol.
- redef fun to_s import NativeString.to_s_with_length, UTF8Char.len `{
-   long len = UTF8Char_len(self);
-   char* copy = malloc(len + 1);
-   memcpy(copy, self->ns + self->pos, len);
-   copy[len] = '\0';
-   return NativeString_to_s_with_length(copy, len);
- `}
-end
-
-# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
-#
-# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
-# It is a plain C array of `UTF8Char` structures, one entry per character.
-private extern class StringIndex `{ UTF8Char* `}
-
- # Allocates an uninitialized index with room for `size` characters
- new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}
-
- # Sets the character at `index` as `item` (the structure is copied by value)
- fun []=(index: Int, item: UTF8Char) `{ self[index] = *item; `}
-
- # Gets the character at position `id` (a pointer into the index, not a copy)
- fun [](id: Int): UTF8Char `{ return &self[id]; `}
-
- # Copies `length` entries of self, starting at entry `my_from`, into `other`, starting at entry `its_from`
- fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
-   /* `self` and `other` are UTF8Char*, so pointer arithmetic already
-    * scales by sizeof(UTF8Char); only the byte count given to memcpy
-    * has to be scaled explicitly. */
-   memcpy(other + its_from, self + my_from, length * sizeof(UTF8Char));
- `}
-end
-
-redef class FlatString
-
- # Index of the characters of the FlatString
- private var index: StringIndex
-
- # Length in bytes of the string (e.g. the length of the C string)
- var bytelen: Int
-
- # Builds a FlatString over `items` from precomputed data
- #
- # `len` is the length in characters, `index_from` and `index_to` are
- # positions in `index`, and `bytelen` is the length in bytes.
- private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int)
- do
-   self.items = items
-   self.index = index
-   self.index_from = index_from
-   self.index_to = index_to
-   self.bytelen = bytelen
-   length = len
- end
-
- # NUL-terminated copy of the bytes of `self`, computed once and cached in `real_items`
- redef fun to_cstring
- do
-   var cached = real_items
-   if cached != null then return cached
-   var cstr = new NativeString(bytelen + 1)
-   # The bytes of `self` start at the byte position of its first character
-   items.copy_to(cstr, bytelen, index[index_from].pos, 0)
-   cstr[bytelen] = '\0'
-   real_items = cstr
-   return cstr
- end
-
- # Substring of `count` characters starting at character `from`
- #
- # A negative `from` shortens `count` accordingly; a range running past
- # the end is truncated; a start past the end yields the empty string.
- # The result shares `items` and `index` with `self`.
- redef fun substring(from, count)
- do
-   assert count >= 0
-
-   if from < 0 then
-     count += from
-     if count < 0 then count = 0
-     from = 0
-   end
-
-   if count == 0 then return empty
-
-   var real_from = index_from + from
-   # Nothing left when starting after the last character
-   if real_from > index_to then return empty
-
-   var real_to = real_from + count - 1
-   if real_to > index_to then real_to = index_to
-
-   # Byte length = distance between the first bytes of the first and last
-   # characters, plus the size of the *last* character.
-   var first = index[real_from]
-   var last = index[real_to]
-   var sub_bytelen = (last.pos - first.pos) + last.len
-
-   # The character count follows the (possibly truncated) range
-   return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen)
- end
-
- # Returns a new string with the characters of `self` in reverse order
- #
- # Characters are moved as whole byte sequences, so multi-byte
- # characters stay intact. A fresh index is built for the result.
- redef fun reversed
- do
-   var native = new NativeString(bytelen + 1)
-   native[bytelen] = '\0'
-   var new_index = new StringIndex(length)
-   # Write cursor in `native`, walking from the end towards the start
-   var ipos = bytelen
-   var i = 0
-   while i < length do
-     # Honour `index_from`: `self` may be a substring of a longer index
-     var uchar = index[index_from + i]
-     var uchar_len = uchar.len
-     ipos -= uchar_len
-     items.copy_to(native, uchar_len, uchar.pos, ipos)
-     # Character `i` of self becomes character `length - 1 - i`
-     new_index[length - 1 - i] = new UTF8Char(ipos, native)
-     i += 1
-   end
-   return new FlatString.with_infos_index(native, length, 0, length - 1, new_index, bytelen)
- end
-
- # Returns `self` repeated `i` times
- #
- # The bytes are copied `i` times in a new buffer and the character
- # index of the result is rebuilt from that buffer, so that every index
- # entry points into the new string.
- redef fun *(i)
- do
-   assert i >= 0
-   if i == 0 or length == 0 then return empty
-
-   var mylen = bytelen
-   var finlen = mylen * i
-   var target_string = new NativeString(finlen + 1)
-   target_string[finlen] = '\0'
-
-   # Byte offset of the first character of self in `items`
-   var src_from = index[index_from].pos
-   var current_last = 0
-   for iteration in [1 .. i] do
-     items.copy_to(target_string, mylen, src_from, current_last)
-     current_last += mylen
-   end
-
-   return target_string.to_s_with_length(finlen)
- end
-
- # Returns an upper-case copy of `self` (ASCII letters only, see `UTF8Char.to_upper`)
- redef fun to_upper
- do
-   var outstr = new NativeString(bytelen + 1)
-   var out_index = 0
-   # Walk only the characters that belong to self: [index_from, index_to]
-   var ipos = index_from
-   while ipos <= index_to do
-     var u = index[ipos].to_upper
-     var ulen = u.len
-     u.ns.copy_to(outstr, ulen, u.pos, out_index)
-     out_index += ulen
-     ipos += 1
-   end
-   outstr[bytelen] = '\0'
-   return outstr.to_s_with_length(bytelen)
- end
-
- # Returns a lower-case copy of `self` (ASCII letters only, see `UTF8Char.to_lower`)
- redef fun to_lower
- do
-   var outstr = new NativeString(bytelen + 1)
-   var out_index = 0
-   # Walk only the characters that belong to self: [index_from, index_to]
-   var ipos = index_from
-   while ipos <= index_to do
-     var u = index[ipos].to_lower
-     var ulen = u.len
-     u.ns.copy_to(outstr, ulen, u.pos, out_index)
-     out_index += ulen
-     ipos += 1
-   end
-   outstr[bytelen] = '\0'
-   return outstr.to_s_with_length(bytelen)
- end
-
- # Outputs every character of `self`, from `index_from` to `index_to` inclusive
- redef fun output
- do
-   for i in [index_from .. index_to] do index[i].output
- end
-
-end
-
-redef class FlatBuffer
-
- # Fix for this particular implementation
- #
- # Since the to_s of a FlatBuffer now builds using
- # the old String constructor, this breaks everything.
- # The string is therefore rebuilt from the C string of the buffer.
- #
- # This will disappear when UTF8 is fully-supported
- redef fun to_s do
-   written = false
-   var native = to_cstring
-   return native.to_s_with_length(length)
- end
-end
-
-redef class NativeString
-
- # Creates the index for said NativeString
- #
- # `length` is the size of the CString (in bytes, up to the first \0).
- # `real_len` is an out-parameter: its `item` receives the number of
- # UTF-8 characters found.
- # The index is allocated for the worst case of one character per byte.
- private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UTF8Char.len `{
-   long pos = 0;
-   long index_pos = 0;
-   UTF8Char* index = malloc(length*sizeof(UTF8Char));
-   while(pos < length){
-     UTF8Char* curr = &index[index_pos];
-     curr->pos = pos;
-     curr->ns = self;
-     /* Jump to the lead byte of the next character */
-     pos += UTF8Char_len(curr);
-     index_pos ++;
-   }
-   Container_of_Int_item__assign(real_len, index_pos);
-   return index;
- `}
-
- # String over the bytes of self, up to the first `\0` (no copy)
- redef fun to_s: FlatString
- do
-   return to_s_with_length(cstring_length)
- end
-
- # String over the first `len` bytes of self (no copy)
- redef fun to_s_with_length(len)
- do
-   var real_len = new Container[Int](0)
-   var x = make_index(len, real_len)
-   return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len)
- end
-
- # String over a private, NUL-terminated copy of the bytes of self
- redef fun to_s_with_copy
- do
-   var length = cstring_length
-   var new_self = new NativeString(length + 1)
-   copy_to(new_self, length, 0, 0)
-   new_self[length] = '\0'
-   # The index must reference the copy, not the original buffer, since
-   # every index entry stores a pointer to the bytes it describes.
-   var real_len = new Container[Int](0)
-   var x = new_self.make_index(length, real_len)
-   return new FlatString.with_infos_index(new_self, real_len.item, 0, real_len.item - 1, x, length)
- end
-end
-
-redef class FileWriter
- # Writes `s` to the file
- #
- # `write_native` expects a size in bytes: for a `FlatString` this is
- # `bytelen`, which differs from `length` as soon as the string holds a
- # multi-byte character. This also holds for the flat substrings of a
- # composite string.
- redef fun write(s)
- do
-   assert is_writable
-   if s isa FlatText then
-     if s isa FlatString then
-       write_native(s.to_cstring, s.bytelen)
-     else
-       write_native(s.to_cstring, s.length)
-     end
-   else
-     for i in s.substrings do
-       if i isa FlatString then
-         write_native(i.to_cstring, i.bytelen)
-       else
-         write_native(i.to_cstring, i.length)
-       end
-     end
-   end
- end
-end
+++ /dev/null
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Introduces UTF-8 as internal encoding for Strings in Nit.
-module utf8_noindex
-
-intrude import standard::string
-intrude import standard::file
-
-in "C Header" `{
-
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
-
-`}
-
-# UTF-8 character as defined in RFC 3629, i.e. a sequence of 1 to 4 bytes
-extern class UnicodeChar `{ uint32_t* `}
- super Comparable
-
- redef type OTHER: UnicodeChar
-
- # Transforms a byte-variable char* character to its uint32_t equivalent
- #
- # The 1 to 4 bytes of the sequence starting at `ns[index]` are packed,
- # first byte most significant, in the low-order bytes of the result.
- # A byte that is not a valid lead byte is packed alone.
- new from_ns(ns: NativeString, index: Int) `{
-   const unsigned char* src = (const unsigned char*)ns + index;
-   int len = 1;
-   if((src[0] & 0xE0) == 0xC0){ len = 2; }
-   else if((src[0] & 0xF0) == 0xE0){ len = 3; }
-   /* 0xF8, not 0xF7: only 11110xxx is the lead of a 4-byte sequence */
-   else if((src[0] & 0xF8) == 0xF0){ len = 4; }
-   uint32_t* ret = calloc(1, sizeof(uint32_t));
-   int i;
-   /* Shifting gives the same value on every host, whatever its byte
-    * order, and needs no byte-swapping helper. */
-   for(i = 0; i < len; i++){ *ret = (*ret << 8) | src[i]; }
-   return ret;
- `}
-
- # Real length of the char in UTF8
- #
- # As per the specification :
- #
- # ~~~raw
- # Length | UTF-8 octet sequence
- # | (binary)
- # ---------+-------------------------------------------------
- # 1 | 0xxxxxxx
- # 2 | 110xxxxx 10xxxxxx
- # 3 | 1110xxxx 10xxxxxx 10xxxxxx
- # 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- # ~~~
- #
- # The length is deduced from the range the packed value falls in.
- fun len: Int `{
-   uint32_t packed = *self;
-   if(packed <= 0x7F){ return 1; }
-   if(packed >= 0xC080 && packed <= 0xDFBF){ return 2; }
-   if(packed >= 0xE08080 && packed <= 0xEFBFBF){ return 3; }
-   if(packed >= 0xF0808080 && packed <= 0xF7BFBFBF){ return 4; }
-   // Bad character
-   return 1;
- `}
-
- # Returns the Unicode code point representing the character
- #
- # The payload bits of each packed byte are extracted and concatenated.
- #
- # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
- fun code_point: Int import UnicodeChar.len `{
-   uint32_t packed = *self;
-   /* Payload of the three possible continuation bytes, already shifted
-    * to their final position. */
-   uint32_t low = packed & 0x0000003F;
-   uint32_t mid = (packed & 0x00003F00) >> 2;
-   uint32_t high = (packed & 0x003F0000) >> 4;
-   switch(UnicodeChar_len(self)){
-     case 1: return packed;
-     case 2: return ((packed & 0x00001F00) >> 2) | low;
-     case 3: return ((packed & 0x000F0000) >> 4) | mid | low;
-     case 4: return ((packed & 0x07000000) >> 6) | high | mid | low;
-   }
-   return 0;
- `}
-
- # Orders characters by comparing their code points
- #
- # Warning : This does not follow the Unicode specification for now
- #
- # TODO: Support Unicode-compliant comparison
- redef fun <(o) do return self.code_point < o.code_point
-
- # Returns an upper-case version of self
- #
- # Values outside `a`..`z` are returned unchanged (`self` itself).
- #
- # NOTE : Works only on ASCII chars
- # TODO : Support unicode for to_upper
- fun to_upper: UnicodeChar import UnicodeChar.code_point `{
-   uint32_t packed = *self;
-   if(packed >= 97 && packed <= 122){
-     uint32_t* upper = calloc(1, 4);
-     *upper = packed - 32;
-     return upper;
-   }
-   return self;
- `}
-
- # Returns a lower-case version of self
- #
- # Values outside `A`..`Z` are returned unchanged (`self` itself).
- #
- # NOTE : Works only on ASCII chars
- # TODO : Support unicode for to_lower
- fun to_lower: UnicodeChar import UnicodeChar.code_point `{
-   uint32_t packed = *self;
-   if(packed >= 65 && packed <= 90){
-     uint32_t* lower = calloc(1, 4);
-     *lower = packed + 32;
-     return lower;
-   }
-   return self;
- `}
-
- # Two `UnicodeChar` are equal when they have the same code point
- redef fun ==(o)
- do
-   if o isa UnicodeChar then return o.code_point == self.code_point
-   return false
- end
-
- # Writes the UTF-8 bytes of the character on the standard output
- redef fun output import UnicodeChar.len `{
-   uint32_t packed = *self;
-   /* The callback takes the receiver (a pointer), not the packed value. */
-   int shift = 8 * ((int)UnicodeChar_len(self) - 1);
-   /* Emit the bytes most significant first; shifting is independent
-    * from the byte order of the host. */
-   for(; shift >= 0; shift -= 8){ putchar((int)((packed >> shift) & 0xFF)); }
- `}
-
- # Builds a one-character FlatString from the UTF-8 bytes of `self`
- redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
-   int len = UnicodeChar_len(self);
-   uint32_t packed = *self;
-   char* bytes = malloc(len + 1);
-   int i;
-   /* Unpack the `len` low-order bytes, most significant first */
-   for(i = 0; i < len; i++){
-     bytes[i] = (char)((packed >> (8 * (len - 1 - i))) & 0xFF);
-   }
-   bytes[len] = '\0';
-   return new_FlatString_full(bytes, 0, len - 1, len, 1);
- `}
-end
-
-# Used to keep track of the last accessed char in a String
-#
-# Pairs a character position with the matching byte position so that
-# `FlatString::byte_index` can resume its scan from there.
-class CharCache
- # The position (as in char) of a String; `FlatString` initializes it to -1
- var position: Int
- # The position in the NativeString underlying the String
- var bytepos: Int
-end
-
-# Iterator over the characters of a `FlatString`, from a position down to the first character
-class FlatStringReviter
- super IndexedIterator[UnicodeChar]
-
- # The NativeString to iterate upon
- private var ns: NativeString
-
- # The position in the string
- private var pos: Int
-
- # The position in the native string
- private var bytepos: Int
-
- # Iterates from the last character of `s`
- init(s: FlatString) do from(s, s.length - 1)
-
- # Iterates from the character at `position` in `s`
- #
- # A negative `position` (e.g. when `s` is empty) gives an iterator
- # that is immediately exhausted instead of aborting in `byte_index`.
- init from(s: FlatString, position: Int)
- do
-   ns = s.items
-   pos = position
-   if position >= 0 then
-     bytepos = s.byte_index(position)
-   else
-     bytepos = s.index_from
-   end
- end
-
- redef fun next
- do
-   pos -= 1
-   # Past the first character: do not read bytes before it
-   if pos < 0 then return
-   # Step back to the lead byte of the previous character, skipping
-   # continuation bytes (10xxxxxx)
-   bytepos -= 1
-   while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
-     bytepos -= 1
-   end
- end
-
- redef fun index do return pos
-
- redef fun item do return new UnicodeChar.from_ns(ns, bytepos)
-
- redef fun is_ok do return pos >= 0
-end
-
-# Iterator over the characters of a `FlatString`, from a position up to the last character
-class FlatStringIter
- super IndexedIterator[UnicodeChar]
-
- # The NativeString to iterate upon
- private var ns: NativeString
-
- # The position in the string (in characters)
- private var pos: Int
-
- # The position in the native string (in bytes)
- private var bytepos: Int
-
- # Length of the iterated string (in characters)
- private var slen: Int
-
- # Cached current character, valid only when `is_created` is true
- private var it: UnicodeChar
-
- # Does `it` hold the character at `bytepos`?
- private var is_created = false
-
- # Iterates from the first character of `s`
- init(s: FlatString) do from(s, 0)
-
- # Iterates from the character at `position` in `s`
- #
- # A `position` at or past the end (e.g. 0 on an empty string) gives an
- # iterator that is immediately exhausted instead of aborting in `byte_index`.
- init from(s: FlatString, position: Int) do
-   ns = s.items
-   pos = position
-   slen = s.length
-   if position < slen then
-     bytepos = s.byte_index(position)
-   else
-     bytepos = s.index_to + 1
-   end
- end
-
- redef fun index do return pos
-
- redef fun is_ok do return pos < slen
-
- redef fun item do
-   if not is_created then
-     it = new UnicodeChar.from_ns(ns, bytepos)
-     is_created = true
-   end
-   return it
- end
-
- redef fun next
- do
-   # The byte size of the current character gives the step
-   var pace = item.len
-   is_created = false
-   pos += 1
-   bytepos += pace
- end
-end
-
-redef class FlatString
-
- redef type OTHER: FlatString
-
- # Length in bytes of the string (e.g. the length of the C string)
- redef var bytelen
-
- # Cache for the last accessed character in the char
- var cache = new CharCache(-1,-1)
-
- redef var length = length_l is lazy
-
- # Builds a FlatString when everything is known
- #
- # `from` and `to` are byte positions in `items`, `bytelen` is the
- # length in bytes and `len` the length in characters.
- private init full(items: NativeString, from, to, bytelen, len: Int)
- do
-   self.items = items
-   self.bytelen = bytelen
-   length = len
-   index_from = from
-   index_to = to
- end
-
- # Length implementation
- #
- # Counts the characters between the bytes `index_from` and `index_to`
- # by jumping from lead byte to lead byte.
- private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
-   const unsigned char* ns = (const unsigned char*)FlatString_items(self);
-   long i = FlatString_index_from(self);
-   long max = FlatString_index_to(self);
-   long length = 0;
-   while(i <= max){
-     unsigned char c = ns[i];
-     if((c & 0xE0) == 0xC0) { i += 2; }
-     else if((c & 0xF0) == 0xE0) { i += 3; }
-     /* 0xF8, not 0xF7: only 11110xxx is the lead of a 4-byte sequence */
-     else if((c & 0xF8) == 0xF0) { i += 4; }
-     /* ASCII or invalid lead byte */
-     else { i += 1; }
-     length ++;
-   }
-   return length;
- `}
-
- # Lexicographic comparison, character by character (by code point)
- #
- # When one string is a prefix of the other, the shorter one is the smaller.
- redef fun <(o)
- do
-   var mylen = length
-   var olen = o.length
-   var i = 0
-   # Only compare positions that exist in both strings
-   while i < mylen and i < olen do
-     var mine = char_at(i)
-     var its = o.char_at(i)
-     if mine < its then return true
-     if its < mine then return false
-     i += 1
-   end
-   return mylen < olen
- end
-
- # Two FlatStrings are equal when they hold the same characters in the same order
- #
- # Any other kind of object is handled by the inherited implementation.
- redef fun ==(o) do
-   if o == null then return false
-   if not o isa FlatString then return super
-   var n = length
-   if n != o.length then return false
-   for i in [0 .. n[ do
-     if char_at(i) != o.char_at(i) then return false
-   end
-   return true
- end
-
- # Position in `items` of the lead byte of the character at `index`
- #
- # The scan starts from the closest known point: the beginning of the
- # string, its end, or the last position served (kept in `cache`).
- # The result is stored in `cache` before being returned.
- private fun byte_index(index: Int): Int do
-   assert index >= 0
-   assert index < length
-
-   # Find best insertion point
-   var delta_begin = index
-   var delta_end = (length - 1) - index
-   var delta_cache = (cache.position - index).abs
-
-   var ns_i: Int
-   var my_i: Int
-   var myits = items
-
-   if cache.position >= 0 and delta_cache < delta_begin and delta_cache <= delta_end then
-     ns_i = cache.bytepos
-     my_i = cache.position
-   else if delta_begin <= delta_end then
-     ns_i = index_from
-     my_i = 0
-   else
-     # One past the last byte, i.e. where character `length` would start
-     ns_i = index_to + 1
-     my_i = length
-   end
-
-   # Forward scan: `ns_i` is the lead byte of character `my_i`
-   while my_i < index do
-     if myits[ns_i].ascii.bin_and(0x80) == 0 then
-       ns_i += 1
-     else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then
-       ns_i += 2
-     else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then
-       ns_i += 3
-     else if myits[ns_i].ascii.bin_and(0xF8) == 0xF0 then
-       # 0xF8, not 0xF7: only 11110xxx leads a 4-byte sequence
-       ns_i += 4
-     else
-       ns_i += 1
-     end
-     my_i += 1
-   end
-
-   # Backward scan: step to the lead byte of the previous character,
-   # skipping continuation bytes (10xxxxxx). Moving first and counting
-   # after keeps `ns_i` and `my_i` in sync when starting from the cache.
-   while my_i > index do
-     ns_i -= 1
-     while myits[ns_i].ascii.bin_and(0xC0) == 0x80 do ns_i -= 1
-     my_i -= 1
-   end
-
-   cache.position = index
-   cache.bytepos = ns_i
-
-   return ns_i
- end
-
- # Character at position `pos` (in characters); a new `UnicodeChar` is built at each call
- fun char_at(pos: Int): UnicodeChar do
- return new UnicodeChar.from_ns(items, byte_index(pos))
- end
-
- # Builds a FlatString whose length in characters is not known yet
- #
- # `index_from` and `index_to` are byte positions in `items`;
- # `length` is left to its lazy computation.
- private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
-   self.items = items
-   self.bytelen = bytelen
-   self.index_from = index_from
-   self.index_to = index_to
- end
-
- # Returns a new string with the characters of `self` in reverse order
- #
- # Each character is moved as a whole byte sequence.
- redef fun reversed do
-   var reversed_bytes = new NativeString(bytelen)
-   var write_at = bytelen
-   var read_at = index_from
-   var i = 0
-   while i < length do
-     var width = char_at(i).len
-     write_at -= width
-     items.copy_to(reversed_bytes, width, read_at, write_at)
-     read_at += width
-     i += 1
-   end
-   return new FlatString.full(reversed_bytes, 0, bytelen - 1, bytelen, length)
- end
-
- # Returns an upper-case copy of `self` (ASCII letters only, see `UnicodeChar.to_upper`)
- redef fun to_upper do
-   var upper_bytes = new NativeString(bytelen)
-   var write_at = 0
-   var i = 0
-   while i < length do
-     var c = char_at(i)
-     var width = c.len
-     var converted = c.to_upper.to_s
-     converted.items.copy_to(upper_bytes, width, 0, write_at)
-     write_at += width
-     i += 1
-   end
-   return new FlatString.full(upper_bytes, 0, bytelen - 1, bytelen, length)
- end
-
- # Returns a lower-case copy of `self` (ASCII letters only, see `UnicodeChar.to_lower`)
- redef fun to_lower do
-   var lower_bytes = new NativeString(bytelen)
-   var write_at = 0
-   var i = 0
-   while i < length do
-     var c = char_at(i)
-     var width = c.len
-     var converted = c.to_lower.to_s
-     converted.items.copy_to(lower_bytes, width, 0, write_at)
-     write_at += width
-     i += 1
-   end
-   return new FlatString.full(lower_bytes, 0, bytelen - 1, bytelen, length)
- end
-
- # Concatenation of `self` and `o`
- #
- # A Buffer is first converted to a String. Two flat strings are copied
- # into a new NUL-terminated buffer; a Concat is chained with `self`.
- redef fun +(o) do
-   if o isa Buffer then o = o.to_s
-   if o isa FlatString then
-     var total = bytelen + o.bytelen
-     var joined = new NativeString(total + 1)
-     joined[total] = '\0'
-     items.copy_to(joined, bytelen, index_from, 0)
-     o.items.copy_to(joined, o.bytelen, o.index_from, bytelen)
-     return new FlatString.full(joined, 0, total - 1, total, length + o.length)
-   end
-   if o isa Concat then return new Concat(self, o)
-   # If it goes to this point, that means another String implementation was concerned, therefore you need to support the + operation for this variant
-   abort
- end
-
- # Returns `self` repeated `i` times, in a new NUL-terminated buffer
- redef fun *(i) do
-   var unit = bytelen
-   var total = unit * i
-   var repeated = new NativeString(total + 1)
-   repeated[total] = '\0'
-   var write_at = 0
-   for k in [0 .. i[ do
-     items.copy_to(repeated, unit, index_from, write_at)
-     write_at += unit
-   end
-   return new FlatString.full(repeated, 0, total - 1, total, length * i)
- end
-
- # Substring of `count` characters starting at character `from`
- #
- # A negative `from` shortens `count` accordingly; a range running past
- # the end is truncated; a start past the end yields the empty string.
- # The result shares `items` with `self`.
- #
- # O(n)
- redef fun substring(from, count) do
-   assert count >= 0
-
-   if from < 0 then
-     count += from
-     if count < 0 then count = 0
-     from = 0
-   end
-
-   if count == 0 then return empty
-
-   var mylen = length
-   # Nothing left when starting after the last character
-   if from >= mylen then return empty
-
-   # Position (in characters) of the last character to keep
-   var lst = from + count - 1
-   if lst >= mylen then lst = mylen - 1
-
-   var real_from = byte_index(from)
-   var real_to = byte_index(lst)
-   var last_len = char_at(lst).len
-
-   # `index_to` is the last byte of the last character, as in the other constructors
-   return new FlatString.full(items, real_from, real_to + last_len - 1, (real_to + last_len) - real_from, lst - from + 1)
- end
-
- # NUL-terminated copy of the bytes of `self`, computed once and cached in `real_items`
- redef fun to_cstring do
-   var cached = real_items
-   if cached != null then return cached
-   var cstr = new NativeString(bytelen + 1)
-   items.copy_to(cstr, bytelen, index_from, 0)
-   cstr[bytelen] = '\0'
-   real_items = cstr
-   return cstr
- end
-end
-
-redef class Text
-
- # Length of the string, in bytes
- #
- # This differs from `length` (in characters) as soon as the text holds
- # a multi-byte UTF-8 character.
- fun bytelen: Int is abstract
-
-end
-
-redef class FlatBuffer
-
- redef var bytelen
-
- # Builds a buffer holding a copy of the bytes of `s`
- #
- # A Concat is appended piece by piece; a FlatString or a FlatBuffer is
- # copied in one go.
- redef init from(s) do
-   if s isa Concat then
-     with_capacity(50)
-     for i in s.substrings do self.append(i)
-     # Done: the code below only applies to flat texts
-     return
-   end
-   var blen = s.bytelen
-   items = new NativeString(blen)
-   if s isa FlatString then
-     s.items.copy_to(items, blen, s.index_from, 0)
-   else
-     s.as(FlatBuffer).items.copy_to(items, blen, 0, 0)
-   end
-   length = s.length
-   bytelen = blen
-   capacity = blen
- end
-
- # Replaces the char at `index` by `item`
- #
- # `index == length` appends `item`. Otherwise the bytes that follow
- # the replaced character are shifted when `item` has a different byte
- # size, after making sure the buffer can hold the extra bytes.
- fun char_at=(index: Int, item: UnicodeChar) do
-   is_dirty = true
-   if index == length then
-     add_unicode item
-     return
-   end
-   assert index >= 0 and index < length
-   var ip = byte_at(index)
-   var c = char_at_byte(ip)
-   var size_diff = item.len - c.len
-   if size_diff > 0 then
-     # Growing: reserve room before moving bytes to the right
-     if bytelen + size_diff > capacity then enlarge(bytelen + size_diff)
-     rshift_bytes(ip + c.len, size_diff)
-   else if size_diff < 0 then
-     lshift_bytes(ip + c.len, -size_diff)
-   end
-   var s = item.to_s
-   s.items.copy_to(items, s.bytelen, 0, ip)
- end
-
- # Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
- #
- # `bytelen` is increased by `len`; the capacity is not checked here.
- fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-   char* bytes = FlatBuffer_items(self);
-   long used = FlatBuffer_bytelen(self);
-   int dest = from + len;
-   /* memmove: source and destination overlap */
-   memmove(bytes + dest, bytes + from, used - from);
-   FlatBuffer_bytelen__assign(self, used + len);
- `}
-
- # Shifts the content of the buffer by `len` bytes to the left, starting at `from`
- #
- # `bytelen` is decreased by `len`.
- fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-   char* bytes = FlatBuffer_items(self);
-   long used = FlatBuffer_bytelen(self);
-   int dest = from - len;
-   /* memmove: source and destination overlap */
-   memmove(bytes + dest, bytes + from, used - from);
-   FlatBuffer_bytelen__assign(self, used - len);
- `}
-
- # Get the Unicode char stored at `index` (in characters) in `self`
- #
- # The byte position is found with `byte_at`.
- fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index))
-
- # Get the Unicode char stored at `index` (bytewise) in `self`
- #
- # `index` is passed as is to `UnicodeChar.from_ns`.
- fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index)
-
- # Add equivalent that supports Unicode
- #
- # Appends the bytes of `c` at the end of the buffer, growing it when
- # needed, then updates `bytelen` and `length`.
- fun add_unicode(c: UnicodeChar) do
-   is_dirty = true
-   var s = c.to_s
-   var needed = bytelen + s.bytelen
-   # `enlarge` takes the capacity to reach, not the number of bytes to add
-   if needed > capacity then enlarge(needed)
-   s.items.copy_to(items, s.bytelen, 0, bytelen)
-   bytelen = needed
-   length += 1
- end
-
- # Gets the byte index (in NativeString) of the char stored at `i`
- #
- # The buffer is scanned from its first byte, jumping from lead byte
- # to lead byte.
- fun byte_at(i: Int): Int do
-   assert i < length and i >= 0
-   var ns_i = 0
-   var real_i = 0
-   while real_i < i do
-     var lead = items[ns_i].ascii
-     if lead.bin_and(0x80) == 0 then
-       ns_i += 1
-     else if lead.bin_and(0xE0) == 0xC0 then
-       ns_i += 2
-     else if lead.bin_and(0xF0) == 0xE0 then
-       ns_i += 3
-     else if lead.bin_and(0xF8) == 0xF0 then
-       # 0xF8, not 0xF7: only 11110xxx leads a 4-byte sequence
-       ns_i += 4
-     else
-       ns_i += 1
-     end
-     real_i += 1
-   end
-   return ns_i
- end
-
- # Grows the buffer so that its capacity exceeds `cap`
- #
- # Nothing is done when `cap` does not exceed the current capacity.
- # The existing bytes are copied into the new storage.
- redef fun enlarge(cap) do
-   if cap <= capacity then return
-   var grown = capacity
-   while grown <= cap do grown = grown * 2 + 2
-   var bigger = new NativeString(grown + 1)
-   if bytelen > 0 then items.copy_to(bigger, bytelen, 0, 0)
-   items = bigger
-   capacity = grown
- end
-
- # Appends the bytes of `s` at the end of the buffer
- #
- # A Concat is appended piece by piece; any other text is expected to
- # be a FlatString.
- redef fun append(s) do
-   if s isa Concat then
-     for i in s.substrings do append i
-     # Every piece has been appended: a Concat is not a FlatString
-     return
-   end
-   var i = s.as(FlatString)
-   var blen = bytelen
-   var iblen = i.bytelen
-   var newlen = blen + iblen
-   if newlen > capacity then
-     enlarge(newlen)
-   end
-   i.items.copy_to(items, iblen, i.index_from, blen)
-   bytelen += iblen
-   length += i.length
- end
-
- # Reverses the characters of the buffer
- #
- # Each character is moved as a whole byte sequence into a new storage
- # that replaces `items`.
- redef fun reverse
- do
-   var total = bytelen
-   var flipped = new NativeString(total)
-   var read_at = 0
-   var write_at = total
-   while read_at < total do
-     var width = char_at_byte(read_at).len
-     write_at -= width
-     items.copy_to(flipped, width, read_at, write_at)
-     read_at += width
-   end
-   items = flipped
- end
-
- # Empties the buffer by resetting both lengths; `items` and `capacity` are left untouched
- redef fun clear do
- length = 0
- bytelen = 0
- end
-
- # Copies `l` characters of self, starting at character `s`, into `d`, starting at its character `ns`
- #
- # Positions are in characters and are converted to byte positions first.
- # NOTE(review): the lengths of `d` are not updated here, bytes are
- # written in place; confirm this is what callers expect.
- redef fun copy(s, l, d, ns) do
-   if not d isa FlatBuffer then
-     # This implementation here is only concerned by the FlatBuffer
-     # If you implement a new Buffer subclass, make sure to support this operation via refinement.
-     abort
-   end
-   # Nothing to copy
-   if l <= 0 then return
-   var rs = byte_at(s)
-   var last = byte_at(s + l - 1)
-   # The byte count includes every byte of the last character
-   var rl = (last + char_at_byte(last).len) - rs
-   var rns = d.byte_at(ns)
-   # copy_to(dest, length, from, to): read at `rs` in self, write at `rns` in `d`
-   items.copy_to(d.items, rl, rs, rns)
- end
-
- # Repeats the content of the buffer so that it appears `i` times
- #
- # The first occurrence is already in place, so `i - 1` copies are made.
- redef fun times(i) do
-   var unit = bytelen
-   var total = unit * i
-   if total > capacity then enlarge(total)
-   var write_at = unit
-   var rep = 1
-   while rep < i do
-     items.copy_to(items, unit, 0, write_at)
-     write_at += unit
-     rep += 1
-   end
-   bytelen = total
-   length = length * i
- end
-
- # Converts the buffer to upper case in place (ASCII letters only, see `UnicodeChar.to_upper`)
- redef fun upper do
-   for i in [0 .. length[ do
-     var at = byte_at(i)
-     var current = char_at_byte(at)
-     var raised = current.to_upper
-     # Only changed characters are written back (one byte each)
-     if current != raised then raised.to_s.items.copy_to(items, 1, 0, at)
-   end
- end
-
- # Converts the buffer to lower case in place (ASCII letters only, see `UnicodeChar.to_lower`)
- redef fun lower do
-   for i in [0 .. length[ do
-     var at = byte_at(i)
-     var current = char_at_byte(at)
-     var lowered = current.to_lower
-     # Only changed characters are written back (one byte each)
-     if current != lowered then lowered.to_s.items.copy_to(items, 1, 0, at)
-   end
- end
-
- # NUL-terminated copy of the bytes of the buffer
- redef fun to_cstring do
-   # One extra byte for the terminator expected by C string consumers
-   var ns = new NativeString(bytelen + 1)
-   items.copy_to(ns, bytelen, 0, 0)
-   ns[bytelen] = '\0'
-   return ns
- end
-end
-
-redef class NativeString
-
- # String over the bytes of self, up to the first `\0` (no copy)
- redef fun to_s: FlatString
- do
-   return to_s_with_length(cstring_length)
- end
-
- # String over the first `len` bytes of self (no copy)
- #
- # The length in characters is computed lazily by the string.
- redef fun to_s_with_length(len)
- do
-   return new FlatString.with_bytelen(self, 0, len - 1, len)
- end
-
- # String over a private, NUL-terminated copy of the bytes of self
- redef fun to_s_with_copy
- do
-   var length = cstring_length
-   var new_self = new NativeString(length + 1)
-   copy_to(new_self, length, 0, 0)
-   # The extra byte allocated above is meant for the terminator
-   new_self[length] = '\0'
-   return new FlatString.with_bytelen(new_self, 0, length - 1, length)
- end
-end
-
-redef class FileWriter
- # Writes `s` to the file
- #
- # `write_native` expects a size in bytes, hence `bytelen` and not
- # `length` (in characters), for flat texts as well as for the flat
- # substrings of a composite text.
- redef fun write(s)
- do
-   assert is_writable
-   if s isa FlatText then
-     write_native(s.to_cstring, s.bytelen)
-   else
-     for i in s.substrings do write_native(i.to_cstring, i.bytelen)
-   end
- end
-end