lib/: removed UTF-8 related modules in anticipation of the integration in stdlib

author Lucas Bajolet <r4pass@hotmail.com>

Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)

committer Lucas Bajolet <r4pass@hotmail.com>

Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)
author Lucas Bajolet <r4pass@hotmail.com>
Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)
committer Lucas Bajolet <r4pass@hotmail.com>
Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)
diff --git a/lib/string_experimentations/README.md b/lib/string_experimentations/README.md

deleted file mode 100644 (file)

index 7cbce13..0000000
--- a/lib/string_experimentations/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-This project is a collection of modules used to experiment with different variations of Text and its subclasses.
-This is only temporary: these modules will eventually be merged into the standard library, or discarded if they bring no real improvement to the language.
-
-The modules contained here are :
-
- * utf8: A draft of implementation of UTF-8 as internal encoding for Strings with automatic indexing.
- * utf8_no_index: Another draft of implementation of UTF-8, this time without indexing.
-
-TODO :
-
- * utf8:
-  * Support for the whole API of Text
-  * Any kind of normalization form for equality (NFC probably)
-  * Compatibility versions of equality test
-  * Locale support
-  * Comparisons
-  * to_upper/lower fully-compatible with Unicode
-
- * utf8_no_index:
-  * Add cache for the last indexed character - DONE
-  * Two-way iteration - DONE
-  * Intelligent indexed access (calculating the nearest point of insertion, i.e. begin, end, or cache) - DONE
-  * UnicodeChar as universal type
-  * UnicodeChar => Char and Char => Byte
diff --git a/lib/string_experimentations/string_experimentations.nit b/lib/string_experimentations/string_experimentations.nit

deleted file mode 100644 (file)

index 1eeb10d..0000000
--- a/lib/string_experimentations/string_experimentations.nit
+++ /dev/null
@@ -1,18 +0,0 @@
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# General module for all kinds of string experimentations
-module string_experimentations
-
-import utf8
diff --git a/lib/string_experimentations/utf8.nit b/lib/string_experimentations/utf8.nit

deleted file mode 100644 (file)

index b3a2450..0000000
--- a/lib/string_experimentations/utf8.nit
+++ /dev/null
@@ -1,427 +0,0 @@
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Introduces UTF-8 as internal encoding for Strings in Nit.
-module utf8
-
-intrude import standard::string
-intrude import standard::file
-
-in "C Header" `{
-
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-// A character located inside a C string:
-// `ns` is the buffer holding the bytes and `pos` is the offset,
-// in `ns`, of the first byte of the character.
-typedef struct {
-       long pos;
-       char* ns;
-} UTF8Char;
-
-`}
-
-# UTF-8 char as defined in RFC-3629, i.e. 1 to 4 bytes
-#
-# A UTF-8 char has its bytes stored in a NativeString (char*):
-# the char itself only records the buffer and the offset of its first byte.
-extern class UTF8Char `{ UTF8Char* `}
-
-       # Creates the char whose first byte is at offset `pos` of `ns`
-       #
-       # `ns` is referenced, not copied.
-       new(pos: Int, ns: NativeString) `{
-               UTF8Char* u = malloc(sizeof(UTF8Char));
-               u->pos = pos;
-               u->ns = ns;
-               return u;
-       `}
-
-       # Real length of the char in UTF8
-       #
-       # As per the specification :
-       #
-       # ~~~raw
-       #  Length  |        UTF-8 octet sequence
-       #          |              (binary)
-       # ---------+-------------------------------------------------
-       #  1       | 0xxxxxxx
-       #  2       | 110xxxxx 10xxxxxx
-       #  3       | 1110xxxx 10xxxxxx 10xxxxxx
-       #  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-       # ~~~
-       #
-       # A byte that is not a valid lead byte is counted as a 1-byte char.
-       private fun len: Int `{
-               unsigned char lead = (unsigned char)self->ns[self->pos];
-               if((lead & 0x80) == 0x00){ return 1;}
-               if((lead & 0xE0) == 0xC0){ return 2;}
-               if((lead & 0xF0) == 0xE0){ return 3;}
-               // The mask must keep the five high bits (11110xxx), hence 0xF8:
-               // with 0xF7 the lead bytes 0xF1..0xF7 were not recognized.
-               if((lead & 0xF8) == 0xF0){ return 4;}
-               // Invalid character
-               return 1;
-       `}
-
-       # Position in containing NativeString
-       private fun pos: Int `{
-               return self->pos;
-       `}
-
-       # Sets the position in the containing NativeString
-       private fun pos=(p: Int) `{self->pos = p;`}
-
-       # C char* wrapping the char
-       fun ns: NativeString `{
-               return self->ns;
-       `}
-
-       # Returns the Unicode code point representing the character
-       #
-       # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
-       fun code_point: Int import UTF8Char.len `{
-               unsigned char* b = (unsigned char*)self->ns + self->pos;
-               switch(UTF8Char_len(self)){
-                       case 1:
-                               return (long)(0x7F & b[0]);
-                       case 2:
-                               return ((0x1F & b[0]) << 6) | (0x3F & b[1]);
-                       case 3:
-                               return ((0x0F & b[0]) << 12) |
-                               ((0x3F & b[1]) << 6) |
-                               (0x3F & b[2]);
-                       case 4:
-                               return ((0x07 & b[0]) << 18) |
-                               ((0x3F & b[1]) << 12) |
-                               ((0x3F & b[2]) << 6) |
-                               (0x3F & b[3]);
-               }
-               // `len` only returns 1 to 4; never fall off the end of the function
-               return 0;
-       `}
-
-       # Returns an upper-case version of self
-       #
-       # Returns `self` when there is nothing to convert,
-       # else a new char stored in its own 1-char buffer.
-       #
-       # NOTE : Works only on ASCII chars
-       # TODO : Support unicode for to_upper
-       fun to_upper: UTF8Char import UTF8Char.code_point `{
-               int cp = UTF8Char_code_point(self);
-               // Only 'a' (97) to 'z' (122) are converted
-               if(cp < 97 || cp > 122){ return self; }
-               char* ns = malloc(2);
-               ns[1] = '\0';
-               char c = self->ns[self->pos];
-               ns[0] = c - 32;
-               UTF8Char* ret = malloc(sizeof(UTF8Char));
-               ret->ns = ns;
-               ret->pos = 0;
-               return ret;
-       `}
-
-       # Returns a lower-case version of self
-       #
-       # Returns `self` when there is nothing to convert,
-       # else a new char stored in its own 1-char buffer.
-       #
-       # NOTE : Works only on ASCII chars
-       # TODO : Support unicode for to_lower
-       fun to_lower: UTF8Char import UTF8Char.code_point `{
-               int cp = UTF8Char_code_point(self);
-               // Only 'A' (65) to 'Z' (90) are converted
-               if(cp < 65 || cp > 90){ return self; }
-               char* ns = malloc(2);
-               ns[1] = '\0';
-               char c = self->ns[self->pos];
-               ns[0] = c + 32;
-               UTF8Char* ret = malloc(sizeof(UTF8Char));
-               ret->ns = ns;
-               ret->pos = 0;
-               return ret;
-       `}
-
-       # Equality with a `Char` (1-byte chars only) or with another `UTF8Char`
-       #
-       # Two `UTF8Char` are equal if they have the same length and code point.
-       redef fun ==(o)
-       do
-               if o isa Char then
-                       if len != 1 then return false
-                       if code_point == o.ascii then return true
-               else if o isa UTF8Char then
-                       if len != o.len then return false
-                       if code_point == o.code_point then return true
-               end
-               return false
-       end
-
-       # Writes the bytes of the char on the standard output
-       redef fun output import UTF8Char.len `{
-               fwrite(self->ns + self->pos, 1, UTF8Char_len(self), stdout);
-       `}
-
-       # Returns a String made of a copy of the bytes of the char
-       redef fun to_s import NativeString.to_s_with_length, UTF8Char.len `{
-               int len = UTF8Char_len(self);
-               char* r = malloc(len + 1);
-               r[len] = '\0';
-               char* src = (self->ns + self->pos);
-               memcpy(r, src, len);
-               return NativeString_to_s_with_length(r, len);
-       `}
-end
-
-# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
-#
-# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
-# It is a plain C array of `UTF8Char` structures.
-private extern class StringIndex `{ UTF8Char* `}
-
-       # Allocates an index with room for `size` characters
-       new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}
-
-       # Sets the character at `index` as `item`
-       #
-       # The content of `item` is copied in the index.
-       fun []=(index: Int, item: UTF8Char) `{ self[index] = *item; `}
-
-       # Gets the character at position `id`
-       #
-       # The result points inside the index, it is not a copy.
-       fun [](id: Int): UTF8Char `{ return &self[id]; `}
-
-       # Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
-       #
-       # `my_from`, `its_from` and `length` are all counted in characters.
-       fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
-               // Pointer arithmetic on UTF8Char* already scales by sizeof(UTF8Char),
-               // only the byte count given to memcpy has to be scaled.
-               memcpy(other + its_from, self + my_from, length * sizeof(UTF8Char));
-       `}
-end
-
-redef class FlatString
-
-       # Index of the characters of the FlatString
-       private var index: StringIndex
-
-       # Length in bytes of the string (e.g. the length of the C string)
-       var bytelen: Int
-
-       # Builds a string of `len` characters (`bytelen` bytes) stored in `items`
-       #
-       # `index_from` and `index_to` are the positions, in `index`,
-       # of the first and last characters of the string.
-       private init with_infos_index(items: NativeString, len: Int, index_from: Int, index_to: Int, index: StringIndex, bytelen: Int)
-       do
-               self.items = items
-               length = len
-               self.index_from = index_from
-               self.index_to = index_to
-               self.index = index
-               self.bytelen = bytelen
-       end
-
-       # Returns a NUL-terminated copy of the bytes of the string
-       #
-       # The copy is built once then cached in `real_items`.
-       redef fun to_cstring
-       do
-               if real_items != null then return real_items.as(not null)
-               var new_items = new NativeString(bytelen + 1)
-               self.items.copy_to(new_items, bytelen, index[index_from].pos, 0)
-               new_items[bytelen] = '\0'
-               self.real_items = new_items
-               return new_items
-       end
-
-       # Returns the `count` characters starting at character `from`
-       #
-       # The result shares `items` and `index` with `self`.
-       redef fun substring(from, count)
-       do
-               assert count >= 0
-
-               if from < 0 then
-                       count += from
-                       if count < 0 then count = 0
-                       from = 0
-               end
-
-               if count == 0 then return empty
-
-               var real_from = index_from + from
-               var real_to = real_from + count - 1
-
-               if real_to > index_to then real_to = index_to
-
-               # `from` is past the end of the string
-               if real_from > real_to then return empty
-
-               # Bytes are counted from the first byte of the first selected char
-               # to the last byte of the last selected char.
-               var first_char = index[real_from]
-               var last_char = index[real_to]
-               var sub_bytelen = (last_char.pos - first_char.pos) + last_char.len
-
-               # The length is recomputed since `real_to` may have been clamped
-               return new FlatString.with_infos_index(items, real_to - real_from + 1, real_from, real_to, index, sub_bytelen)
-       end
-
-       # Returns a new string with the characters of `self` in reverse order
-       #
-       # The bytes of each character are kept in their original order.
-       redef fun reversed
-       do
-               var native = new NativeString(self.bytelen + 1)
-               var length = self.length
-               var index = self.index
-               var new_index = new StringIndex(length)
-               # Byte position, in `native`, where the next char is written
-               var ipos = bytelen
-               var i = 0
-               while i < length do
-                       var uchar = index[index_from + i]
-                       var uchar_len = uchar.len
-                       ipos -= uchar_len
-                       # The i-th char becomes the i-th char from the end
-                       new_index[length - 1 - i] = new UTF8Char(ipos, native)
-                       items.copy_to(native, uchar_len, uchar.pos, ipos)
-                       i += 1
-               end
-               native[bytelen] = '\0'
-               return new FlatString.with_infos_index(native, length, 0, length-1, new_index, bytelen)
-       end
-
-       # Returns `self` repeated `i` times
-       redef fun *(i)
-       do
-               assert i >= 0
-
-               var mylen = self.bytelen
-               var finlen = mylen * i
-
-               var my_items = self.items
-
-               var my_real_len = length
-               var my_real_fin_len = my_real_len * i
-
-               var target_string = new NativeString((finlen) + 1)
-
-               var my_index = index
-               var new_index = new StringIndex(my_real_fin_len)
-
-               target_string[finlen] = '\0'
-
-               # Nothing to copy (and no first char to look at) for an empty result
-               if my_real_fin_len > 0 then
-                       # Byte offset, in `items`, of the first char of `self`
-                       var first_byte = my_index[index_from].pos
-                       var current_last = 0
-                       var curr_index = 0
-
-                       for iteration in [1 .. i] do
-                               my_items.copy_to(target_string, mylen, first_byte, current_last)
-                               # The index entries are rebuilt instead of copied
-                               # since they must refer to `target_string`.
-                               for j in [0 .. my_real_len[ do
-                                       var src = my_index[index_from + j]
-                                       new_index[curr_index] = new UTF8Char(current_last + src.pos - first_byte, target_string)
-                                       curr_index += 1
-                               end
-                               current_last += mylen
-                       end
-               end
-
-               return new FlatString.with_infos_index(target_string, my_real_fin_len, 0, my_real_fin_len -1, new_index, finlen)
-
-       end
-
-       # Returns an upper-case version of `self`
-       #
-       # Relies on `UTF8Char::to_upper`, which only converts ASCII letters.
-       redef fun to_upper
-       do
-               var outstr = new NativeString(self.bytelen + 1)
-
-               var out_index = 0
-               var index = self.index
-               var ipos = 0
-               var max = length
-
-               while ipos < max do
-                       # Chars of `self` start at `index_from` in the index
-                       var u = index[index_from + ipos].to_upper
-                       u.ns.copy_to(outstr, u.len, u.pos, out_index)
-                       out_index += u.len
-                       ipos += 1
-               end
-
-               outstr[self.bytelen] = '\0'
-
-               return outstr.to_s_with_length(self.bytelen)
-       end
-
-       # Returns a lower-case version of `self`
-       #
-       # Relies on `UTF8Char::to_lower`, which only converts ASCII letters.
-       redef fun to_lower
-       do
-               var outstr = new NativeString(self.bytelen + 1)
-
-               var out_index = 0
-               var index = self.index
-               var ipos = 0
-               var max = length
-
-               while ipos < max do
-                       # Chars of `self` start at `index_from` in the index
-                       var u = index[index_from + ipos].to_lower
-                       u.ns.copy_to(outstr, u.len, u.pos, out_index)
-                       out_index += u.len
-                       ipos += 1
-               end
-
-               outstr[self.bytelen] = '\0'
-
-               return outstr.to_s_with_length(self.bytelen)
-       end
-
-       # Outputs each character of the string, from `index_from` to `index_to`
-       redef fun output
-       do
-               var i = self.index_from
-               var imax = self.index_to
-               while i <= imax do
-                       index[i].output
-                       i += 1
-               end
-       end
-
-end
-
-redef class FlatBuffer
-
-       # Fix for this particular implementation
-       #
-       # Since the to_s of a FlatBuffer now builds using
-       # the old String constructor, this breaks everything.
-       # The string is therefore rebuilt from the C string of the buffer.
-       #
-       # This will disappear when UTF8 is fully-supported
-       redef fun to_s
-       do
-               written = false
-               var native = to_cstring
-               return native.to_s_with_length(length)
-       end
-end
-
-redef class NativeString
-
-       # Creates the index for said NativeString
-       # `length` is the size of the CString (in bytes, up to the first \0)
-       # real_len is just a way to store the length (UTF-8 characters)
-       #
-       # The index is allocated for `length` entries, i.e. the worst case
-       # where every character is stored on a single byte.
-       private fun make_index(length: Int, real_len: Container[Int]): StringIndex import Container[Int].item=, UTF8Char.len `{
-               long pos = 0;
-               long index_pos = 0;
-               UTF8Char* index = malloc(length*sizeof(UTF8Char));
-               while(pos < length){
-                       UTF8Char* curr = &index[index_pos];
-                       curr->pos = pos;
-                       curr->ns = self;
-                       // Jump to the lead byte of the next character
-                       pos += UTF8Char_len(curr);
-                       index_pos ++;
-               }
-               Container_of_Int_item__assign(real_len, index_pos);
-               return index;
-       `}
-
-       # Builds a String on `self`, up to the first `\0`, without copying the bytes
-       redef fun to_s: FlatString
-       do
-               var len = cstring_length
-               return to_s_with_length(len)
-       end
-
-       # Builds a String on the first `len` bytes of `self`, without copying them
-       redef fun to_s_with_length(len)
-       do
-               var real_len = new Container[Int](0)
-               var x = make_index(len, real_len)
-               return new FlatString.with_infos_index(self, real_len.item, 0, real_len.item - 1, x, len)
-       end
-
-       # Builds a String on a copy of `self`, up to the first `\0`
-       redef fun to_s_with_copy
-       do
-               var length = cstring_length
-               var new_self = new NativeString(length + 1)
-               copy_to(new_self, length, 0, 0)
-               new_self[length] = '\0'
-               # The index is built on the copy: an index built on `self`
-               # would still refer to the original buffer.
-               return new_self.to_s_with_length(length)
-       end
-end
-
-redef class FileWriter
-       # Writes `s` in the file
-       #
-       # The number of bytes to write for a `FlatString` is its `bytelen`,
-       # since its `length` is counted in characters.
-       redef fun write(s)
-       do
-               assert is_writable
-               if s isa FlatText then
-                       if s isa FlatString then
-                               write_native(s.to_cstring, s.bytelen)
-                       else
-                               write_native(s.to_cstring, s.length)
-                       end
-               else
-                       for i in s.substrings do
-                               # Same rule for each flat part of a non-flat text
-                               if i isa FlatString then
-                                       write_native(i.to_cstring, i.bytelen)
-                               else
-                                       write_native(i.to_cstring, i.length)
-                               end
-                       end
-               end
-       end
-end
diff --git a/lib/string_experimentations/utf8_noindex.nit b/lib/string_experimentations/utf8_noindex.nit

deleted file mode 100644 (file)

index 8756838..0000000
--- a/lib/string_experimentations/utf8_noindex.nit
+++ /dev/null
@@ -1,742 +0,0 @@
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Introduces UTF-8 as internal encoding for Strings in Nit.
-module utf8_noindex
-
-intrude import standard::string
-intrude import standard::file
-
-in "C Header" `{
-
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-// True when the host is big-endian: the bytes 0x00 0xFF read as a
-// 16-bit integer give 0x00FF (< 0x100) only if the first byte is the
-// most significant one.
-#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
-
-`}
-
-# UTF-8 char as defined in RFC-3629, i.e. 1 to 4 bytes
-#
-# The UTF-8 bytes of the char are packed in a `uint32_t`,
-# the last byte of the sequence being the least significant one.
-extern class UnicodeChar `{ uint32_t* `}
-       super Comparable
-
-       redef type OTHER: UnicodeChar
-
-       # Transforms a byte-variable char* character to its uint32_t equivalent
-       #
-       # The character read is the one starting at byte `index` of `ns`.
-       new from_ns(ns: NativeString, index: Int) `{
-               unsigned char* ret = calloc(1,4);
-               unsigned char lead = (unsigned char)ns[index];
-               // The bytes are right-aligned in the 4 bytes of `ret`
-               if((lead & 0x80) == 0){ memcpy(ret + 3, ns + index, 1);  }
-               else if((lead & 0xE0) == 0xC0) { memcpy(ret + 2, ns + index, 2); }
-               else if((lead & 0xF0) == 0xE0) { memcpy(ret + 1, ns + index, 3); }
-               // The mask must keep the five high bits (11110xxx), hence 0xF8:
-               // with 0xF7 the lead bytes 0xF1..0xF7 were not recognized.
-               else if((lead & 0xF8) == 0xF0) { memcpy(ret, ns + index, 4); }
-               // Invalid lead byte: kept as a 1-byte char
-               else{ memcpy(ret + 3, ns + index, 1);}
-               if (!IS_BIG_ENDIAN) {
-                       // `ret` was filled in big-endian order, convert to host order
-                       uint32_t tmp = ntohl(*((uint32_t*)ret));
-                       memcpy(ret, &tmp, 4);
-               }
-               return (uint32_t*)ret;
-       `}
-
-       # Real length of the char in UTF8
-       #
-       # As per the specification :
-       #
-       # ~~~raw
-       #  Length  |        UTF-8 octet sequence
-       #          |              (binary)
-       # ---------+-------------------------------------------------
-       #  1       | 0xxxxxxx
-       #  2       | 110xxxxx 10xxxxxx
-       #  3       | 1110xxxx 10xxxxxx 10xxxxxx
-       #  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-       # ~~~
-       fun len: Int `{
-               uint32_t s = *self;
-               // 0x00 .. 0x7F
-               if(s <= 127) {return 1;}
-               // 0xC080 .. 0xDFBF
-               if(s >= 49280 && s <= 57279) {return 2;}
-               // 0xE08080 .. 0xEFBFBF
-               if(s >= 14712960 && s <= 15712191) {return 3;}
-               // 0xF0808080 .. 0xF7BFBFBF
-               if(s >= 4034953344 && s <= 4156538815) { return 4; }
-               // Bad character
-               return 1;
-       `}
-
-       # Returns the Unicode code point representing the character
-       #
-       # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
-       fun code_point: Int import UnicodeChar.len `{
-               uint32_t val = *self;
-               uint32_t ret = 0;
-               // Each byte of the sequence carries its payload in its low bits,
-               // the shifts remove the marker bits between two payloads.
-               switch(UnicodeChar_len(self)){
-                       case 1:
-                               ret = val;
-                               break;
-                       case 2:
-                               ret = ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
-                               break;
-                       case 3:
-                               ret = ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
-                               break;
-                       case 4:
-                               ret = ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
-                               break;
-               }
-               return ret;
-       `}
-
-       # Warning : This does not follow the Unicode specification for now
-       #
-       # TODO: Support Unicode-compliant comparison
-       redef fun <(o) do return self.code_point < o.code_point
-
-       # Returns an upper-case version of self
-       #
-       # Returns `self` when there is nothing to convert.
-       #
-       # NOTE : Works only on ASCII chars
-       # TODO : Support unicode for to_upper
-       fun to_upper: UnicodeChar import UnicodeChar.code_point `{
-               // Only 'a' (97) to 'z' (122) are converted
-               if(*self < 97 || *self > 122){ return self; }
-               uint32_t* ret = calloc(1,4);
-               *ret = *self - 32;
-               return ret;
-       `}
-
-       # Returns a lower-case version of self
-       #
-       # Returns `self` when there is nothing to convert.
-       #
-       # NOTE : Works only on ASCII chars
-       # TODO : Support unicode for to_lower
-       fun to_lower: UnicodeChar import UnicodeChar.code_point `{
-               // Only 'A' (65) to 'Z' (90) are converted
-               if(*self < 65 || *self > 90){ return self; }
-               uint32_t* ret = calloc(1,4);
-               *ret = *self + 32;
-               return ret;
-       `}
-
-       # Two `UnicodeChar` are equal if they have the same code point
-       redef fun ==(o)
-       do
-               if not o isa UnicodeChar then return false
-               if o.code_point == self.code_point then return true
-               return false
-       end
-
-       # Writes the UTF-8 bytes of the char on the standard output
-       redef fun output import UnicodeChar.len `{
-               uint32_t self0 = *self;
-               if(!IS_BIG_ENDIAN){
-                       // Back to big-endian so that `s[3]` is the last byte of the sequence
-                       uint32_t tmp = ntohl(self0);
-                       memcpy(&self0, &tmp, 4);
-               }
-               unsigned char* s = (unsigned char*) &self0;
-               // `len` expects the char itself, not its (byte-swapped) value
-               switch(UnicodeChar_len(self)){
-                       case 1:
-                               printf("%c", s[3]);
-                               break;
-                       case 2:
-                               printf("%c%c", s[2], s[3]);
-                               break;
-                       case 3:
-                               printf("%c%c%c", s[1], s[2], s[3]);
-                               break;
-                       case 4:
-                               printf("%c%c%c%c", s[0], s[1], s[2], s[3]);
-                               break;
-               }
-       `}
-
-       # Returns a String of one character made of the UTF-8 bytes of the char
-       redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
-               int len = UnicodeChar_len(self);
-               char* r = malloc(len + 1);
-               r[len] = '\0';
-               uint32_t src = *self;
-               if(!IS_BIG_ENDIAN){
-                       // Back to big-endian so that `s[3]` is the last byte of the sequence
-                       uint32_t tmp = htonl(src);
-                       memcpy(&src, &tmp, 4);
-               }
-               unsigned char* s = (unsigned char*) &src;
-               switch(len){
-                       case 1: memcpy(r, s+3, 1); break;
-                       case 2: memcpy(r, s+2, 2); break;
-                       case 3: memcpy(r, s+1, 3); break;
-                       case 4: memcpy(r, s, 4); break;
-               }
-               return new_FlatString_full(r, 0, len - 1, len, 1);
-       `}
-end
-
-# Used to keep track of the last accessed char in a String
-#
-# It pairs a position counted in characters with the matching
-# position counted in bytes.
-class CharCache
-       # The position (as in char) of a String
-       var position: Int
-       # The position in the NativeString underlying the String
-       var bytepos: Int
-end
-
-# Iterator over the characters of a `FlatString`, from the last one to the first one
-class FlatStringReviter
-       super IndexedIterator[UnicodeChar]
-
-       # The NativeString to iterate upon
-       private var ns: NativeString
-
-       # The position in the string
-       private var pos: Int
-
-       # The position in the native string
-       private var bytepos: Int
-
-       # Iterates on `s` starting from its last character
-       init(s: FlatString) do from(s, s.length - 1)
-
-       # Iterates on `s` starting from the character at `position`
-       init from(s: FlatString, position: Int)
-       do
-               ns = s.items
-               pos = position
-               if position < 0 then
-                       # Nothing to iterate on (e.g. `s` is empty): `is_ok` is
-                       # false and `byte_index` would reject the position.
-                       bytepos = s.index_from
-               else
-                       bytepos = s.byte_index(position)
-               end
-       end
-
-       # Moves to the previous character of the string
-       redef fun next
-       do
-               pos -= 1
-               # Do not look for a lead byte before the first character
-               if pos < 0 then return
-               bytepos -= 1
-               # Skip the continuation bytes (10xxxxxx) up to the lead byte
-               while ns[bytepos].ascii.bin_and(0xC0) == 0x80 do
-                       bytepos -= 1
-               end
-       end
-
-       redef fun index do return pos
-
-       redef fun item do return new UnicodeChar.from_ns(ns, bytepos)
-
-       redef fun is_ok do return pos >= 0
-end
-
-# Iterator over the characters of a `FlatString`, from the first one to the last one
-class FlatStringIter
-       super IndexedIterator[UnicodeChar]
-
-       # The NativeString to iterate upon
-       private var ns: NativeString
-
-       # The position in the string
-       private var pos: Int
-
-       # The position in the native string
-       private var bytepos: Int
-
-       # Length (in characters) of the string
-       private var slen: Int
-
-       # The current character, valid only if `is_created`
-       private var it: UnicodeChar
-
-       # Is `it` the character at the current position?
-       private var is_created = false
-
-       # Iterates on `s` starting from its first character
-       init(s: FlatString) do from(s, 0)
-
-       # Iterates on `s` starting from the character at `position`
-       init from(s: FlatString, position: Int) do
-               ns = s.items
-               pos = position
-               slen = s.length
-               if position < slen then
-                       bytepos = s.byte_index(position)
-               else
-                       # Nothing to iterate on (e.g. `s` is empty): `is_ok` is
-                       # false and `byte_index` would reject the position.
-                       bytepos = s.index_to + 1
-               end
-       end
-
-       redef fun index do return pos
-
-       redef fun is_ok do return pos < slen
-
-       # The current character, built on the first access then reused
-       redef fun item do
-               if not is_created then
-                       it = new UnicodeChar.from_ns(ns, bytepos)
-                       is_created = true
-               end
-               return it
-       end
-
-       # Moves to the next character of the string
-       redef fun next
-       do
-               # The current character is needed to know how many bytes to skip
-               if not is_created then
-                       it = new UnicodeChar.from_ns(ns, bytepos)
-               end
-               is_created = false
-               var pace = it.len
-               pos += 1
-               bytepos += pace
-       end
-end
-
-redef class FlatString
-
-       redef type OTHER: FlatString
-
-       # Length in bytes of the string (e.g. the length of the C string)
-       redef var bytelen
-
-       # Cache for the last accessed character in the string
-       var cache = new CharCache(-1,-1)
-
-       redef var length = length_l is lazy
-
-       # Builds a string on the bytes `from` to `to` of `items`
-       #
-       # `bytelen` is its length in bytes and `len` its length in characters.
-       private init full(items: NativeString, from, to, bytelen, len: Int)
-       do
-               self.items = items
-               self.bytelen = bytelen
-               self.index_from = from
-               self.index_to = to
-               self.length = len
-       end
-
-       # Length implementation
-       #
-       # Counts the characters by jumping from lead byte to lead byte
-       # between `index_from` and `index_to`.
-       private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
-               char* ns = FlatString_items(self);
-               long i = FlatString_index_from(self);
-               long max = FlatString_index_to(self);
-               long length = 0;
-               while(i <= max){
-                       unsigned char c = (unsigned char)ns[i];
-                       if((c & 0x80) == 0) { i+= 1; }
-                       else if((c & 0xE0) == 0xC0) { i += 2; }
-                       else if((c & 0xF0) == 0xE0) { i += 3; }
-                       // The mask must keep the five high bits (11110xxx), hence 0xF8:
-                       // with 0xF7 the lead bytes 0xF1..0xF7 were not recognized.
-                       else if((c & 0xF8) == 0xF0) { i += 4; }
-                       // Invalid lead byte: counted as a 1-byte char
-                       else { i += 1; }
-                       length ++;
-               }
-               return length;
-       `}
-
-       # Is `self` lower than `o`?
-       #
-       # Characters are compared one by one with `UnicodeChar::<`;
-       # when one string is a prefix of the other, the shorter one is the lower.
-       redef fun <(o)
-       do
-               var mylen = length
-               var olen = o.length
-               var i = 0
-               while i < mylen and i < olen do
-                       var mine = char_at(i)
-                       var its = o.char_at(i)
-                       if mine < its then return true
-                       if mine > its then return false
-                       i += 1
-               end
-               # All the shared characters are equal
-               return mylen < olen
-       end
-
-       # Two `FlatString` are equal if they have the same characters in the same order
-       #
-       # Anything that is not a `FlatString` is left to the inherited implementation.
-       redef fun ==(o) do
-               if o == null then return false
-               if not o isa FlatString then return super
-               var char_count = length
-               if char_count != o.length then return false
-               for i in [0 .. char_count[ do
-                       if char_at(i) != o.char_at(i) then return false
-               end
-               return true
-       end
-
-       # Position, in `items`, of the first byte of the character at `index`
-       #
-       # The search starts from the nearest known point: the beginning of the
-       # string, its end, or the last accessed character (`cache`).
-       # The result is stored in `cache`.
-       private fun byte_index(index: Int): Int do
-               assert index >= 0
-               assert index < length
-
-               # Find best insertion point
-               var delta_begin = index
-               var delta_end = (length - 1) - index
-               var delta_cache = (cache.position - index).abs
-               var min = delta_begin
-
-               if delta_cache < min then min = delta_cache
-               if delta_end < min then min = delta_end
-
-               var ns_i: Int
-               var my_i: Int
-               var myits = items
-
-               if min == delta_begin then
-                       ns_i = index_from
-                       my_i = 0
-               else if min == delta_cache then
-                       ns_i = cache.bytepos
-                       my_i = cache.position
-               else
-                       # One past the last char, so that walking backward is the
-                       # same operation from the end and from the cache
-                       ns_i = index_to + 1
-                       my_i = length
-               end
-
-               # Walk forward, jumping from lead byte to lead byte
-               while my_i < index do
-                       if myits[ns_i].ascii.bin_and(0x80) == 0 then
-                               ns_i += 1
-                       else if myits[ns_i].ascii.bin_and(0xE0) == 0xC0 then
-                               ns_i += 2
-                       else if myits[ns_i].ascii.bin_and(0xF0) == 0xE0 then
-                               ns_i += 3
-                       else if myits[ns_i].ascii.bin_and(0xF8) == 0xF0 then
-                               # 0xF8 keeps the five high bits of the lead byte (11110xxx)
-                               ns_i += 4
-                       else
-                               ns_i += 1
-                       end
-                       my_i += 1
-               end
-
-               # Walk backward: step on the last byte of the previous char,
-               # then skip its continuation bytes (10xxxxxx) up to its lead byte
-               while my_i > index do
-                       ns_i -= 1
-                       while myits[ns_i].ascii.bin_and(0xC0) == 0x80 do
-                               ns_i -= 1
-                       end
-                       my_i -= 1
-               end
-
-               cache.position = index
-               cache.bytepos = ns_i
-
-               return ns_i
-       end
-
-       # Character stored at position `pos`, `pos` being resolved to a byte
-       # offset through `byte_index`.
-       fun char_at(pos: Int): UnicodeChar do
-               var byte_pos = byte_index(pos)
-               return new UnicodeChar.from_ns(items, byte_pos)
-       end
-
-       # Builds a string over `items`, delimited by the byte offsets `index_from`
-       # and `index_to`, with `bytelen` as its size in bytes.
-       #
-       # Only these four attributes are assigned here.
-       private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int) do
-               self.bytelen = bytelen
-               self.index_to = index_to
-               self.index_from = index_from
-               self.items = items
-       end
-
-       # New string holding the characters of `self` in reverse order.
-       #
-       # Each character is copied as a whole block of bytes, written from the
-       # end of the new native string towards its beginning.
-       redef fun reversed do
-               var total = bytelen
-               var res = new NativeString(total)
-               var src = items
-               var read_at = index_from
-               var write_at = total
-               var i = 0
-               while i < length do
-                       var width = char_at(i).len
-                       write_at -= width
-                       src.copy_to(res, width, read_at, write_at)
-                       read_at += width
-                       i += 1
-               end
-               return new FlatString.full(res, 0, total - 1, total, length)
-       end
-
-       # Upper-case copy of `self`, built character by character.
-       #
-       # NOTE(review): each converted character is copied using the byte width of
-       # the original one, which presumably assumes that `UnicodeChar::to_upper`
-       # never changes the encoded size -- to be confirmed.
-       redef fun to_upper do
-               var res = new NativeString(bytelen)
-               var write_at = 0
-               var i = 0
-               while i < length do
-                       var c = char_at(i)
-                       var converted = c.to_upper.to_s
-                       converted.items.copy_to(res, c.len, 0, write_at)
-                       write_at += c.len
-                       i += 1
-               end
-               return new FlatString.full(res, 0, bytelen - 1, bytelen, length)
-       end
-
-       # Lower-case copy of `self`, built character by character.
-       #
-       # NOTE(review): each converted character is copied using the byte width of
-       # the original one, which presumably assumes that `UnicodeChar::to_lower`
-       # never changes the encoded size -- to be confirmed.
-       redef fun to_lower do
-               var res = new NativeString(bytelen)
-               var write_at = 0
-               var i = 0
-               while i < length do
-                       var c = char_at(i)
-                       var converted = c.to_lower.to_s
-                       converted.items.copy_to(res, c.len, 0, write_at)
-                       write_at += c.len
-                       i += 1
-               end
-               return new FlatString.full(res, 0, bytelen - 1, bytelen, length)
-       end
-
-       # Concatenation of `self` and `o`.
-       #
-       # A `Buffer` operand is first converted with `to_s`.
-       # Two flat strings are copied side by side into a fresh NUL-terminated
-       # native string; a `Concat` operand produces a new `Concat` node.
-       redef fun +(o) do
-               if o isa Buffer then o = o.to_s
-               if o isa FlatString then
-                       var left_bytes = bytelen
-                       var right_bytes = o.bytelen
-                       var total_bytes = left_bytes + right_bytes
-                       var res = new NativeString(total_bytes + 1)
-                       res[total_bytes] = '\0'
-                       var total_chars = length + o.length
-                       items.copy_to(res, left_bytes, index_from, 0)
-                       o.items.copy_to(res, right_bytes, o.index_from, left_bytes)
-                       return new FlatString.full(res, 0, total_bytes - 1, total_bytes, total_chars)
-               else if o isa Concat then
-                       return new Concat(self, o)
-               else
-                       # Reaching this point means that another String implementation is involved:
-                       # the `+` operation has to be supported for that variant.
-                       abort
-               end
-       end
-
-       # New string made of `i` consecutive copies of `self`.
-       #
-       # The result is stored in a fresh NUL-terminated native string.
-       redef fun *(i) do
-               var unit_bytes = bytelen
-               var total_bytes = unit_bytes * i
-               var total_chars = length * i
-               var res = new NativeString(total_bytes + 1)
-               res[total_bytes] = '\0'
-               var write_at = 0
-               for k in [0 .. i[ do
-                       items.copy_to(res, unit_bytes, index_from, write_at)
-                       write_at += unit_bytes
-               end
-               return new FlatString.full(res, 0, total_bytes - 1, total_bytes, total_chars)
-       end
-
-       # Substring of at most `count` characters, starting at character `from`.
-       #
-       # A negative `from` is clamped to 0 and `count` is reduced accordingly.
-       # A range running past the end of `self` is truncated, and the empty
-       # string is returned when no character is left to extract.
-       # The result shares `items` with `self`, no byte is copied.
-       #
-       # O(n)
-       redef fun substring(from, count) do
-               assert count >= 0
-
-               if from < 0 then
-                       count += from
-                       if count < 0 then count = 0
-                       from = 0
-               end
-
-               # Nothing to extract: empty request, or start past the last character
-               # (`byte_index` would otherwise fail its bound assertion)
-               if count == 0 or from >= length then return empty
-
-               var real_from = byte_index(from)
-
-               var lst = from + count - 1
-
-               if lst >= length then
-                       # Truncated range: keep everything from `from` to the end of `self`.
-                       # The size is derived from `bytelen` so that every byte of the
-                       # last character is counted.
-                       var tail_bytelen = (index_from + bytelen) - real_from
-                       return new FlatString.full(items, real_from, index_to, tail_bytelen, length - from)
-               end
-
-               var real_to = byte_index(lst)
-
-               # `index_to` is the offset of the last byte of the string, hence the
-               # trailing bytes of the last character must be included
-               var last_byte = real_to + char_at(lst).len - 1
-
-               return new FlatString.full(items, real_from, last_byte, (last_byte - real_from) + 1, count)
-       end
-
-       # NUL-terminated native copy of the bytes of `self`.
-       #
-       # The copy is built on the first call and kept in `real_items`,
-       # later calls return that same native string.
-       redef fun to_cstring do
-               var cached = real_items
-               if cached != null then return cached
-               var nbytes = bytelen
-               var dup = new NativeString(nbytes + 1)
-               items.copy_to(dup, nbytes, index_from, 0)
-               dup[nbytes] = '\0'
-               real_items = dup
-               return dup
-       end
-end
-
-redef class Text
-
-       # Length of the string, in bytes
-       #
-       # Declared abstract here: each concrete kind of text has to provide it.
-       fun bytelen: Int is abstract
-
-end
-
-redef class FlatBuffer
-
-       # Number of bytes of `items` currently used by the content of the buffer
-       redef var bytelen
-
-       # Builds a buffer holding a copy of the content of `s`
-       redef init from(s) do
-               if s isa Concat then
-                       with_capacity(50)
-                       bytelen = 0
-                       for i in s.substrings do self.append(i)
-                       # Everything was copied by `append`: the code below only
-                       # handles flat texts and must not run for a `Concat`
-                       return
-               end
-               items = new NativeString(s.bytelen)
-               if s isa FlatString then
-                       s.items.copy_to(items, s.bytelen, s.index_from, 0)
-               else
-                       s.as(FlatBuffer).items.copy_to(items, s.as(FlatBuffer).bytelen, 0, 0)
-               end
-               length = s.length
-               bytelen = s.bytelen
-               capacity = s.bytelen
-       end
-
-       # Replaces the char at `index` by `item`
-       #
-       # When `index == length`, `item` is appended instead.
-       # The bytes following the replaced character are shifted when the
-       # encoded sizes of the old and new characters differ.
-       fun char_at=(index: Int, item: UnicodeChar) do
-               is_dirty = true
-               if index == length then
-                       add_unicode item
-                       return
-               end
-               assert index >= 0 and index < length
-               var ip = byte_at(index)
-               var c = char_at_byte(ip)
-               var size_diff = item.len - c.len
-               if size_diff > 0 then
-                       # Make room first, the shift writes past the current content
-                       if bytelen + size_diff > capacity then enlarge(bytelen + size_diff)
-                       rshift_bytes(ip + c.len, size_diff)
-               else if size_diff < 0 then
-                       lshift_bytes(ip + c.len, -size_diff)
-               end
-               var s = item.to_s
-               s.items.copy_to(items, s.bytelen, 0, ip)
-       end
-
-       # Shifts the content of the buffer by `len` bytes to the right, starting at byte `from`
-       #
-       # `bytelen` is increased by `len`. No allocation is done here:
-       # the caller has to make sure that `items` is large enough.
-       fun rshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-               long bt = FlatBuffer_bytelen(self);
-               char* ns = FlatBuffer_items(self);
-               int off = from + len;
-               memmove(ns + off, ns + from, bt - from);
-               FlatBuffer_bytelen__assign(self, bt + len);
-       `}
-
-       # Shifts the content of the buffer by `len` bytes to the left, starting at `from`
-       #
-       # `bytelen` is decreased by `len`.
-       fun lshift_bytes(from: Int, len: Int) import FlatBuffer.bytelen, FlatBuffer.bytelen=, FlatBuffer.items `{
-               long bt = FlatBuffer_bytelen(self);
-               char* ns = FlatBuffer_items(self);
-               int off = from - len;
-               memmove(ns + off, ns + from, bt - from);
-               FlatBuffer_bytelen__assign(self, bt - len);
-       `}
-
-       # Get the Unicode char stored at `index` in `self`
-       fun char_at(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, byte_at(index))
-
-       # Get the Unicode char stored at `index` (bytewise) in `self`
-       fun char_at_byte(index: Int): UnicodeChar do return new UnicodeChar.from_ns(items, index)
-
-       # Add equivalent that supports Unicode
-       #
-       # Appends `c` at the end of the buffer, growing it when needed.
-       fun add_unicode(c: UnicodeChar) do
-               var s = c.to_s
-               # `enlarge` expects the capacity to reach, not the number of bytes to add
-               var needed = bytelen + s.bytelen
-               if needed > capacity then enlarge(needed)
-               s.items.copy_to(items, s.bytelen, 0, bytelen)
-               # Account for the new character, otherwise it is never part of the content
-               bytelen = needed
-               length += 1
-               is_dirty = true
-       end
-
-       # Gets the byte index (in NativeString) of the char stored at `i`
-       #
-       # The scan always starts from the first byte of `items`.
-       fun byte_at(i: Int): Int do
-               assert i < length and i >= 0
-               var ns_i = 0
-               var real_i = 0
-               while real_i < i do
-                       var lead = items[ns_i].ascii
-                       if lead.bin_and(0x80) == 0 then
-                               ns_i += 1
-                       else if lead.bin_and(0xE0) == 0xC0 then
-                               ns_i += 2
-                       else if lead.bin_and(0xF0) == 0xE0 then
-                               ns_i += 3
-                       else if lead.bin_and(0xF8) == 0xF0 then
-                               # 11110xxx: the mask must keep bit 3, otherwise the lead
-                               # bytes 0xF1..0xF7 are not recognized as 4-byte sequences
-                               ns_i += 4
-                       else
-                               # Invalid lead byte, skip it
-                               ns_i += 1
-                       end
-                       real_i += 1
-               end
-               return ns_i
-       end
-
-       # Grows `items` so that it can hold more than `cap` bytes
-       #
-       # Nothing is done when `cap` does not exceed the current `capacity`.
-       redef fun enlarge(cap) do
-               var c = capacity
-               if cap <= c then return
-               while c <= cap do c = c * 2 + 2
-               var a = new NativeString(c+1)
-               if bytelen > 0 then items.copy_to(a, bytelen, 0, 0)
-               items = a
-               capacity = c
-       end
-
-       # Appends the content of `s` at the end of the buffer
-       #
-       # A `Concat` is appended substring by substring; any other text is
-       # expected to be a `FlatString`.
-       redef fun append(s) do
-               if s isa Concat then
-                       for i in s.substrings do append i
-                       # Done: a `Concat` must not reach the `FlatString` cast below
-                       return
-               end
-               var i = s.as(FlatString)
-               var blen = bytelen
-               var iblen = i.bytelen
-               var newlen = blen + iblen
-               if newlen > capacity then
-                       enlarge(newlen)
-               end
-               i.items.copy_to(items, iblen, i.index_from, blen)
-               bytelen += iblen
-               length += i.length
-               is_dirty = true
-       end
-
-       # Reverses the characters of the buffer in place
-       #
-       # Each character is moved as a whole block of bytes.
-       redef fun reverse
-       do
-               var nns = new NativeString(bytelen)
-               var ns = items
-               var btlen = bytelen
-               var myp = 0
-               var itsp = btlen
-               while myp < btlen do
-                       var c = char_at_byte(myp).len
-                       itsp -= c
-                       ns.copy_to(nns, c, myp, itsp)
-                       myp += c
-               end
-               items = nns
-               # The new native string is only `btlen` bytes long: keep `capacity`
-               # truthful so that later insertions reallocate when needed
-               capacity = btlen
-               is_dirty = true
-       end
-
-       # Empties the buffer, `items` and `capacity` are left untouched
-       redef fun clear do
-               length = 0
-               bytelen = 0
-       end
-
-       # Copies `l` characters of `self`, starting at character `s`, into `d`
-       # starting at its character `ns`
-       #
-       # The bytes of `d` are overwritten in place, nothing is shifted.
-       redef fun copy(s, l, d, ns) do
-               if not d isa FlatBuffer then
-                       # This implementation here is only concerned by the FlatBuffer
-                       # If you implement a new Buffer subclass, make sure to support this operation via refinement.
-                       abort
-               end
-               if l <= 0 then return
-               var rs = byte_at(s)
-               var re = byte_at(s + l - 1)
-               # `re` is the first byte of the last character: its own bytes are part of the range
-               var rl = (re + char_at_byte(re).len) - rs
-               var rns = d.byte_at(ns)
-               # `copy_to` takes the source offset first, then the destination offset
-               items.copy_to(d.items, rl, rs, rns)
-       end
-
-       # Replaces the content of the buffer by `i` repetitions of itself
-       redef fun times(i) do
-               var len = bytelen
-               var off = len
-               var newlen = len * i
-               if newlen > capacity then enlarge(newlen)
-               for j in [1 .. i[ do
-                       items.copy_to(items, len, 0, off)
-                       off += len
-               end
-               bytelen = newlen
-               length = length * i
-       end
-
-       # Converts every character of the buffer to upper case, in place
-       redef fun upper do
-               for i in [0 .. length[ do
-                       var c = char_at(i)
-                       var d = c.to_upper
-                       if c == d then continue
-                       # Go through `char_at=` so that every byte of `d` is written,
-                       # whatever the encoded sizes of `c` and `d`
-                       self.char_at(i) = d
-               end
-       end
-
-       # Converts every character of the buffer to lower case, in place
-       redef fun lower do
-               for i in [0 .. length[ do
-                       var c = char_at(i)
-                       var d = c.to_lower
-                       if c == d then continue
-                       # Go through `char_at=` so that every byte of `d` is written,
-                       # whatever the encoded sizes of `c` and `d`
-                       self.char_at(i) = d
-               end
-       end
-
-       # NUL-terminated native copy of the content of the buffer
-       redef fun to_cstring do
-               var ns = new NativeString(bytelen + 1)
-               items.copy_to(ns, bytelen, 0, 0)
-               # A C string has to end with a NUL byte
-               ns[bytelen] = '\0'
-               return ns
-       end
-end
-
-redef class NativeString
-
-       # String over `self`, its size in bytes being given by `cstring_length`
-       redef fun to_s: FlatString
-       do
-               var len = cstring_length
-               return to_s_with_length(len)
-       end
-
-       # String over the first `len` bytes of `self`, no byte is copied
-       redef fun to_s_with_length(len)
-       do
-               return new FlatString.with_bytelen(self, 0, len - 1, len)
-       end
-
-       # String over a fresh NUL-terminated copy of `self`
-       redef fun to_s_with_copy
-       do
-               var length = cstring_length
-               var new_self = new NativeString(length + 1)
-               copy_to(new_self, length, 0, 0)
-               # The extra byte allocated above is meant for the terminator
-               new_self[length] = '\0'
-               return new FlatString.with_bytelen(new_self, 0, length - 1, length)
-       end
-end
-
-redef class FileWriter
-       # Writes `s` to the file
-       #
-       # Sizes given to `write_native` are counted in bytes (`bytelen`):
-       # a count in characters would truncate any text holding multi-byte characters.
-       redef fun write(s)
-       do
-               assert is_writable
-               if s isa FlatText then
-                       write_native(s.to_cstring, s.bytelen)
-               else
-                       for i in s.substrings do write_native(i.to_cstring, i.bytelen)
-               end
-       end
-end
diff --git a/tests/sav/utf_test.res b/tests/sav/utf_test.res

deleted file mode 100644 (file)

index 4055d93..0000000
--- a/tests/sav/utf_test.res
+++ /dev/null
@@ -1,11 +0,0 @@
-28
-すでa語A本日a 𐍆,A ᓂ . ᓀ 界世a𐍃ーЖロaハ
-ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです
-ー𐍃a世
-30fc
-10343
-61
-4e16
-ハAロЖー𐍃A世界 ᓀ . ᓂ A,𐍆 A日本A語Aです
-ハaロЖー𐍃a世界 ᓀ . ᓂ a,𐍆 a日本a語aです
-ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aですハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです
diff --git a/tests/utf_test.nit b/tests/utf_test.nit

deleted file mode 100644 (file)

index 88474c4..0000000
--- a/tests/utf_test.nit
+++ /dev/null
@@ -1,42 +0,0 @@
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# This file is free software, which comes along with NIT.  This software is
-# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
-# without  even  the implied warranty of  MERCHANTABILITY or  FITNESS FOR A
-# PARTICULAR PURPOSE.  You can modify it is you want,  provided this header
-# is kept unaltered, and a notification of the changes is added.
-# You  are  allowed  to  redistribute it and sell it, alone or is a part of
-# another product.
-
-import standard
-intrude import string_experimentations::utf8
-
-# Code points of characters encoded in UTF-8 on 1, 2, 3 and 4 bytes
-var s = "aàハ𐍆".as(FlatString)
-assert s.index[0].code_point == 97
-assert s.index[1].code_point == 224
-assert s.index[2].code_point == 12495
-assert s.index[3].code_point == 66374
-
-# Sample mixing characters of various encoded sizes
-var str = "ハaロЖー𐍃a世界 ᓀ . ᓂ A,𐍆 a日本A語aです".as(FlatString)
-
-# Length of the sample
-print str.length
-
-print str.reversed
-
-# Direct output, followed by the line break that `output` is not given here
-str.output
-
-print ""
-
-# 4 characters taken from the character at index 4
-var x = str.substring(4,4).as(FlatString)
-
-print x
-
-# Code point of each character of the substring, in hexadecimal
-for i in [0..x.length[ do
-       print x.index[i + x.index_from].code_point.to_hex
-end
-
-print str.to_upper
-
-print str.to_lower
-
-# Repetition
-print str * 2
author	Lucas Bajolet <r4pass@hotmail.com>
	Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)
committer	Lucas Bajolet <r4pass@hotmail.com>
	Tue, 2 Jun 2015 18:49:19 +0000 (14:49 -0400)
lib/string_experimentations/README.md	[deleted file]	patch \| blob \| history
lib/string_experimentations/string_experimentations.nit	[deleted file]	patch \| blob \| history
lib/string_experimentations/utf8.nit	[deleted file]	patch \| blob \| history
lib/string_experimentations/utf8_noindex.nit	[deleted file]	patch \| blob \| history
tests/sav/utf_test.res	[deleted file]	patch \| blob \| history
tests/utf_test.nit	[deleted file]	patch \| blob \| history