--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Introduces UTF-8 as internal encoding for Strings in Nit.
+module utf8_noindex
+
+intrude import standard::string
+intrude import standard::file
+
+in "C Header" `{
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <arpa/inet.h>
+
+/* Non-zero when the host stores multi-byte integers most significant byte
+ * first: the two bytes 0x00 0xFF read as a uint16_t give 0x00FF (< 0x100)
+ * on a big-endian host and 0xFF00 on a little-endian one. */
+#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
+
+`}
+
+# UTF-8 char as defined in RFC-3629, i.e. 1 to 4 bytes
+#
+# The bytes of the UTF-8 sequence are packed in a single `uint32_t`,
+# the last byte of the sequence being the least significant one
+# (e.g. the 2-byte sequence `C3 A9` is stored as `0x0000C3A9`).
+extern class UnicodeChar `{ uint32_t* `}
+
+	# Real length of the char in UTF8
+	#
+	# As per the specification :
+	#
+	# Length | UTF-8 octet sequence
+	#        | (binary)
+	# -------+-------------------------------------------------
+	# 1      | 0xxxxxxx
+	# 2      | 110xxxxx 10xxxxxx
+	# 3      | 1110xxxx 10xxxxxx 10xxxxxx
+	# 4      | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	#
+	# A value that fits none of the ranges is reported as length 1.
+	fun len: Int `{
+		uint32_t s = *recv;
+		/* Each range is [smallest, largest] packed sequence of that length */
+		if(s <= 0x7Fu) { return 1; }
+		if(s >= 0xC080u && s <= 0xDFBFu) { return 2; }
+		if(s >= 0xE08080u && s <= 0xEFBFBFu) { return 3; }
+		if(s >= 0xF0808080u && s <= 0xF7BFBFBFu) { return 4; }
+		// Bad character
+		return 1;
+	`}
+
+	# Returns the Unicode code point representing the character
+	#
+	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
+	fun code_point: Int import UnicodeChar.len `{
+		uint32_t val = *recv;
+		uint32_t ret = 0;
+		/* Drop the marker bits of every byte and glue the payload bits together:
+		 * each continuation byte carries 6 bits, hence the 2-bit shift per byte. */
+		switch(UnicodeChar_len(recv)){
+			case 1:
+				ret = val;
+				break;
+			case 2:
+				ret = ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
+				break;
+			case 3:
+				ret = ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
+				break;
+			case 4:
+				ret = ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
+				break;
+		}
+		return ret;
+	`}
+
+	# Returns an upper-case version of self
+	#
+	# `self` is returned as is when it is not an ASCII lower-case letter,
+	# otherwise a newly allocated char is returned.
+	#
+	# NOTE : Works only on ASCII chars
+	# TODO : Support unicode for to_upper
+	fun to_upper: UnicodeChar `{
+		uint32_t c = *recv;
+		if(c < 'a' || c > 'z'){ return recv; }
+		uint32_t* ret = calloc(1, sizeof(uint32_t));
+		*ret = c - 32;
+		return ret;
+	`}
+
+	# Returns a lower-case version of self
+	#
+	# `self` is returned as is when it is not an ASCII upper-case letter,
+	# otherwise a newly allocated char is returned.
+	#
+	# NOTE : Works only on ASCII chars
+	# TODO : Support unicode for to_lower
+	fun to_lower: UnicodeChar `{
+		uint32_t c = *recv;
+		if(c < 'A' || c > 'Z'){ return recv; }
+		uint32_t* ret = calloc(1, sizeof(uint32_t));
+		*ret = c + 32;
+		return ret;
+	`}
+
+	# Two chars are equal when they have the same code point
+	redef fun ==(o) do return o isa UnicodeChar and o.code_point == code_point
+
+	# Kept consistent with `==`: equal chars have the same hash
+	redef fun hash do return code_point
+
+	# Prints the `len` bytes of the UTF-8 sequence on the standard output
+	redef fun output import UnicodeChar.len `{
+		uint32_t packed = *recv;
+		int i;
+		/* Shifting yields the bytes from first to last on any host,
+		 * whatever its endianness. */
+		for(i = UnicodeChar_len(recv) - 1; i >= 0; i--){
+			printf("%c", (unsigned char)((packed >> (8 * i)) & 0xFF));
+		}
+	`}
+
+	# Returns a one-character string holding the `len` bytes of `self`
+	redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
+		int len = UnicodeChar_len(recv);
+		uint32_t packed = *recv;
+		char* r = malloc(len + 1);
+		int i;
+		/* Byte `i` of the sequence sits `len - 1 - i` bytes above the lowest one */
+		for(i = 0; i < len; i++){
+			r[i] = (char)((packed >> (8 * (len - 1 - i))) & 0xFF);
+		}
+		r[len] = '\0';
+		return new_FlatString_full(r, 0, len - 1, len, 1);
+	`}
+end
+
+redef class FlatString
+
+	# Length in bytes of the string (e.g. the length of the C string)
+	var bytelen: Int
+
+	# Length in characters, computed by `length_l` on first access
+	# unless it was set by the `full` constructor
+	redef var length = length_l is lazy
+
+	# Builds a string for which both the byte length (`bytelen`)
+	# and the length in characters (`len`) are already known
+	private init full(items: NativeString, from, to, bytelen, len: Int)
+	do
+		self.items = items
+		index_from = from
+		index_to = to
+		self.bytelen = bytelen
+		length = len
+	end
+
+	# Length implementation
+	#
+	# Walks the bytes between `index_from` and `index_to` (inclusive),
+	# jumping over each UTF-8 sequence according to its lead byte,
+	# and returns the number of sequences found.
+	# A byte that is not a valid lead byte counts as one character.
+	private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
+		char* ns = FlatString_items(recv);
+		int i = FlatString_index_from(recv);
+		int max = FlatString_index_to(recv);
+		int length = 0;
+		while(i <= max){
+			unsigned char c = (unsigned char)ns[i];
+			if((c & 0x80) == 0x00) { i += 1; }      /* 0xxxxxxx */
+			else if((c & 0xE0) == 0xC0) { i += 2; } /* 110xxxxx */
+			else if((c & 0xF0) == 0xE0) { i += 3; } /* 1110xxxx */
+			/* The mask must keep the 5 high bits: 0xF7 would reject
+			 * every lead byte from 0xF1 to 0xF7. */
+			else if((c & 0xF8) == 0xF0) { i += 4; } /* 11110xxx */
+			else { i += 1; }
+			length ++;
+		}
+		return length;
+	`}
+
+	# Builds a string for which only the byte length is known,
+	# `length` is left to be computed lazily
+	private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int)
+	do
+		self.items = items
+		self.index_from = index_from
+		self.index_to = index_to
+		self.bytelen = bytelen
+	end
+
+	# Returns a copy of the `bytelen` bytes of `self` followed by a `'\0'`
+	#
+	# The copy is cached in `real_items` and reused by later calls.
+	redef fun to_cstring
+	do
+		if real_items != null then return real_items.as(not null)
+		var new_items = calloc_string(bytelen + 1)
+		self.items.copy_to(new_items, bytelen, index_from, 0)
+		new_items[bytelen] = '\0'
+		self.real_items = new_items
+		return new_items
+	end
+end
+
+redef class NativeString
+
+	# Wraps `self` in a `FlatString` spanning `cstring_length` bytes
+	redef fun to_s: FlatString do return to_s_with_length(cstring_length)
+
+	# Wraps `self` in a `FlatString` spanning bytes `0` to `len - 1`,
+	# `len` being recorded as the byte length of the string
+	redef fun to_s_with_length(len: Int): FlatString
+	do
+		var last = len - 1
+		return new FlatString.with_bytelen(self, 0, last, len)
+	end
+
+	# Same as `to_s`, but the `cstring_length` bytes of `self` are first
+	# copied to a buffer obtained from `calloc_string`
+	redef fun to_s_with_copy
+	do
+		var size = cstring_length
+		var copy = calloc_string(size + 1)
+		copy_to(copy, size, 0, 0)
+		var last = size - 1
+		return new FlatString.with_bytelen(copy, 0, last, size)
+	end
+end
+
+redef class OFStream
+	# Writes `s` to the stream
+	#
+	# `write_native` is given a size in bytes: for a `FlatString` this is
+	# `bytelen`, since `length` counts characters and is smaller than the
+	# byte size as soon as the string holds a multi-byte character.
+	redef fun write(s)
+	do
+		assert _writable
+		if s isa FlatText then
+			if s isa FlatString then
+				write_native(s.to_cstring, s.bytelen)
+			else
+				write_native(s.to_cstring, s.length)
+			end
+		else
+			for i in s.substrings do
+				# Same rule for each piece: never hand a character
+				# count of a `FlatString` to `write_native`
+				if i isa FlatString then
+					write_native(i.to_cstring, i.bytelen)
+				else
+					write_native(i.to_cstring, i.length)
+				end
+			end
+		end
+	end
+end