lib/string_exp/utf8_no_index: Introducing utf8 variant without indexes

author Lucas Bajolet <r4pass@hotmail.com>

Mon, 4 Aug 2014 16:52:56 +0000 (12:52 -0400)

committer Lucas Bajolet <r4pass@hotmail.com>

Thu, 7 Aug 2014 16:57:09 +0000 (12:57 -0400)
author Lucas Bajolet <r4pass@hotmail.com>
Mon, 4 Aug 2014 16:52:56 +0000 (12:52 -0400)
committer Lucas Bajolet <r4pass@hotmail.com>
Thu, 7 Aug 2014 16:57:09 +0000 (12:57 -0400)
diff --git a/lib/string_experimentations/utf8_noindex.nit b/lib/string_experimentations/utf8_noindex.nit

new file mode 100644 (file)

index 0000000..d34e051
--- /dev/null
+++ b/lib/string_experimentations/utf8_noindex.nit
@@ -0,0 +1,238 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Introduces UTF-8 as internal encoding for Strings in Nit.
+module utf8_noindex
+
+intrude import standard::string
+intrude import standard::file
+
+in "C Header" `{
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
+
+`}
+
+# A UTF-8 character as defined in RFC 3629, i.e. encoded on 1 to 4 bytes
+#
+# The pointed `uint32_t` holds the encoded bytes packed as a number: the last
+# byte of the sequence is the least significant octet (`C3 A9` is 0xC3A9).
+extern class UnicodeChar `{ uint32_t* `}
+
+       # Real length of the char in UTF8
+       #
+       # As per the specification :
+       #
+       #  Length  |        UTF-8 octet sequence
+       #          |              (binary)
+       # ---------+-------------------------------------------------
+       #  1       | 0xxxxxxx
+       #  2       | 110xxxxx 10xxxxxx
+       #  3       | 1110xxxx 10xxxxxx 10xxxxxx
+       #  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+       #
+       # A value outside of these ranges is reported as being 1 byte long.
+       fun len: Int `{
+               uint32_t s = *recv;
+               // Bounds are the smallest and largest packed sequence of each length
+               if(s <= 0x7Fu) { return 1; }
+               if(s >= 0xC080u && s <= 0xDFBFu) { return 2; }
+               if(s >= 0xE08080u && s <= 0xEFBFBFu) { return 3; }
+               if(s >= 0xF0808080u && s <= 0xF7BFBFBFu) { return 4; }
+               // Bad character
+               return 1;
+       `}
+
+       # Returns the Unicode code point representing the character
+       #
+       # Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
+       fun code_point: Int import UnicodeChar.len `{
+               uint32_t val = *recv;
+               uint32_t ret = 0;
+               // Drop the length marker of the lead byte and the `10` prefix of each
+               // continuation byte, then pack the remaining payload bits together
+               switch(UnicodeChar_len(recv)){
+                       case 1:
+                               ret = val;
+                               break;
+                       case 2:
+                               ret = ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
+                               break;
+                       case 3:
+                               ret = ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
+                               break;
+                       case 4:
+                               ret = ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
+                               break;
+               }
+               return ret;
+       `}
+
+       # Returns an upper-case version of self
+       #
+       # NOTE : Works only on ASCII chars
+       # TODO : Support unicode for to_upper
+       #
+       # `self` is returned as is when it is not an ASCII lower-case letter.
+       fun to_upper: UnicodeChar import UnicodeChar.code_point `{
+               if(*recv < 'a' || *recv > 'z'){ return recv; }
+               uint32_t* ret = calloc(1, sizeof(uint32_t));
+               *ret = *recv - ('a' - 'A');
+               return ret;
+       `}
+
+       # Returns a lower-case version of self
+       #
+       # NOTE : Works only on ASCII chars
+       # TODO : Support unicode for to_lower
+       #
+       # `self` is returned as is when it is not an ASCII upper-case letter.
+       fun to_lower: UnicodeChar import UnicodeChar.code_point `{
+               if(*recv < 'A' || *recv > 'Z'){ return recv; }
+               uint32_t* ret = calloc(1, sizeof(uint32_t));
+               *ret = *recv + ('a' - 'A');
+               return ret;
+       `}
+
+       # Two chars are equal when they designate the same code point
+       redef fun ==(o)
+       do
+               if not o isa UnicodeChar then return false
+               return o.code_point == code_point
+       end
+
+       # Writes the UTF-8 bytes of the char on the standard output
+       #
+       # Nothing else (e.g. no line feed) is written.
+       redef fun output import UnicodeChar.len `{
+               uint32_t val = *recv;
+               int len = UnicodeChar_len(recv);
+               int i;
+               // The lead byte is the most significant used octet of `val`: shifting
+               // extracts the bytes in order whatever the endianness of the host
+               for(i = len - 1; i >= 0; i--) {
+                       printf("%c", (int)((val >> (8 * i)) & 0xFF));
+               }
+       `}
+
+       # Returns a new string holding the UTF-8 bytes of the char
+       #
+       # The native buffer is allocated with `malloc` and is NUL-terminated.
+       redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
+               int len = UnicodeChar_len(recv);
+               uint32_t src = *recv;
+               char* r = malloc(len + 1);
+               int i;
+               // Same byte order as `output`, without relying on the host endianness
+               for(i = 0; i < len; i++) {
+                       r[i] = (char)((src >> (8 * (len - 1 - i))) & 0xFF);
+               }
+               r[len] = '\0';
+               // Arguments: items, index_from, index_to, bytelen, length (one char)
+               return new_FlatString_full(r, 0, len - 1, len, 1);
+       `}
+end
+
+redef class FlatString
+       # Length in bytes of the string (i.e. the length of the C string)
+       var bytelen: Int
+
+       redef var length = length_l is lazy
+
+       # Builds a string whose byte length and char length are both known
+       private init full(items: NativeString, from, to, bytelen, len: Int)
+       do
+               self.items = items
+               index_from = from
+               index_to = to
+               self.bytelen = bytelen
+               length = len
+       end
+
+       # Counts the UTF-8 chars from `index_from` to `index_to`, lazy value of `length`
+       private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
+               const unsigned char* ns = (const unsigned char*) FlatString_items(recv);
+               int i = FlatString_index_from(recv);
+               int max = FlatString_index_to(recv);
+               int length = 0;
+               for(; i <= max; length++) {
+                       unsigned char c = ns[i];
+                       if((c & 0xE0) == 0xC0) { i += 2; } // 110xxxxx
+                       else if((c & 0xF0) == 0xE0) { i += 3; } // 1110xxxx
+                       else if((c & 0xF8) == 0xF0) { i += 4; } // 11110xxx
+                       else { i += 1; } // ASCII, or a stray byte counted as one char
+               }
+               return length;
+       `}
+
+       # Builds a string of known byte length; its char length is computed lazily
+       private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int)
+       do
+               self.items = items
+               self.index_from = index_from
+               self.index_to = index_to
+               self.bytelen = bytelen
+       end
+
+       # NUL-terminated copy of the bytes of the string, cached in `real_items`
+       redef fun to_cstring
+       do
+               if real_items != null then return real_items.as(not null)
+               var new_items = calloc_string(bytelen + 1)
+               self.items.copy_to(new_items, bytelen, index_from, 0)
+               new_items[bytelen] = '\0'
+               self.real_items = new_items
+               return new_items
+       end
+end
+
+redef class NativeString
+
+       # String over the `cstring_length` first bytes of `self`, without copying them
+       redef fun to_s: FlatString do return to_s_with_length(cstring_length)
+
+       # String over the `len` first bytes of `self`, without copying them
+       redef fun to_s_with_length(len: Int): FlatString
+       do
+               return new FlatString.with_bytelen(self, 0, len - 1, len)
+       end
+
+       # String over a fresh copy of the `cstring_length` first bytes of `self`
+       redef fun to_s_with_copy
+       do
+               var nbytes = cstring_length
+               # One more byte than what is copied is allocated
+               var buffer = calloc_string(nbytes + 1)
+               copy_to(buffer, nbytes, 0, 0)
+               return new FlatString.with_bytelen(buffer, 0, nbytes - 1, nbytes)
+       end
+end
+
+redef class OFStream
+       # Writes `s` to the file; a FlatString is measured in bytes (`bytelen`), as its
+       # `length` is a number of chars. Substrings of other texts go through `write` too.
+       redef fun write(s)
+       do
+               assert _writable
+               if s isa FlatString then
+                       write_native(s.to_cstring, s.bytelen)
+               else if s isa FlatText then
+                       write_native(s.to_cstring, s.length)
+               else for i in s.substrings do write(i)
+       end
+end
author	Lucas Bajolet <r4pass@hotmail.com>
	Mon, 4 Aug 2014 16:52:56 +0000 (12:52 -0400)
committer	Lucas Bajolet <r4pass@hotmail.com>
	Thu, 7 Aug 2014 16:57:09 +0000 (12:57 -0400)