# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Introduces UTF-8 as internal encoding for Strings in Nit.
module utf8_noindex

intrude import standard::string
intrude import standard::file

in "C Header" `{
// NOTE(review): the header names were missing from the include directives in
// the reviewed copy of this file. The list below covers what the C code in
// this module uses (uint32_t/uint16_t, printf, calloc/malloc, memcpy,
// ntohl/htonl) -- confirm against the upstream file.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h>

// True when a 16-bit read of the bytes 00 FF yields 0x00FF, i.e. when the
// most significant byte is stored first.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
`}

# UTF-8 char as defined in RFC-3629, i.e. 1-4 Bytes
#
# The native value points to a 4-byte cell holding the raw UTF-8 bytes of the
# character, right-aligned and zero-padded on the left, stored as a host-order
# `uint32_t` (so the 2-byte sequence `C3 A9` is the integer `0xC3A9`).
extern class UnicodeChar `{ uint32_t* `}

	# Transforms a byte-variable char* character to its uint32_t equivalent
	#
	# `index` is the byte offset in `ns` of the first byte of the character.
	# The number of bytes read (1 to 4) is decided by the leading byte only;
	# a byte that is not a valid leading byte is taken alone.
	new from_ns(ns: NativeString, index: Int) `{
		unsigned char* ret = calloc(1,4);
		if((ns[index] & 0x80) == 0){
			// 0xxxxxxx
			memcpy(ret + 3, ns + index, 1);
		} else if((ns[index] & 0xE0) == 0xC0) {
			// 110xxxxx
			memcpy(ret + 2, ns + index, 2);
		} else if((ns[index] & 0xF0) == 0xE0) {
			// 1110xxxx
			memcpy(ret + 1, ns + index, 3);
		} else if((ns[index] & 0xF8) == 0xF0) {
			// 11110xxx : the mask must keep the five high bits (0xF8),
			// otherwise leading bytes 0xF1-0xF7 are not recognized.
			memcpy(ret, ns + index, 4);
		} else{
			// Continuation or invalid byte, keep it as a single byte
			memcpy(ret + 3, ns + index, 1);
		}
		// The bytes were laid out most-significant first, convert the
		// cell to host order so it can be compared as an integer.
		if (!IS_BIG_ENDIAN) {
			uint32_t tmp = ntohl(*((uint32_t*)ret));
			memcpy(ret, &tmp, 4);
		}
		return (uint32_t*)ret;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	#     Length  | UTF-8 octet sequence
	#             | (binary)
	#    ---------+-------------------------------------------------
	#     1       | 0xxxxxxx
	#     2       | 110xxxxx 10xxxxxx
	#     3       | 1110xxxx 10xxxxxx 10xxxxxx
	#     4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	#
	# A value that fits none of the ranges is reported as 1 byte long.
	fun len: Int `{
		uint32_t s = *recv;
		// Each range goes from the smallest to the largest integer that
		// the bit patterns of the table above can produce.
		if(s <= 0x7FU) {return 1;}
		if(s >= 0xC080U && s <= 0xDFBFU) {return 2;}
		if(s >= 0xE08080U && s <= 0xEFBFBFU) {return 3;}
		if(s >= 0xF0808080U && s <= 0xF7BFBFBFU) { return 4; }
		// Bad character
		return 1;
	`}

	# Returns the Unicode code point representing the character
	#
	# Note : A unicode character might not be a visible glyph, but it will be used to determine canonical equivalence
	fun code_point: Int import UnicodeChar.len `{
		uint32_t val = *recv;
		uint32_t ret = 0;
		// Drop the UTF-8 marker bits of every byte and pack the payload
		// bits (5/4/3 for the leading byte, 6 per continuation byte).
		switch(UnicodeChar_len(recv)){
			case 1:
				ret = val;
				break;
			case 2:
				ret = ((val & 0x00001F00) >> 2) | (val & 0x0000003F);
				break;
			case 3:
				ret = ((val & 0x000F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
			case 4:
				ret = ((val & 0x07000000) >> 6) | ((val & 0x003F0000) >> 4) | ((val & 0x00003F00) >> 2) | (val & 0x0000003F);
				break;
		}
		return ret;
	`}

	# Returns an upper-case version of self
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_upper
	fun to_upper: UnicodeChar import UnicodeChar.code_point `{
		// Anything outside 'a'..'z' is returned unchanged (same object)
		if(*recv < 97 || *recv > 122){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv - 32;
		return ret;
	`}

	# Returns a lower-case version of self
	#
	# NOTE : Works only on ASCII chars
	# TODO : Support unicode for to_lower
	fun to_lower: UnicodeChar import UnicodeChar.code_point `{
		// Anything outside 'A'..'Z' is returned unchanged (same object)
		if(*recv < 65 || *recv > 90){ return recv; }
		uint32_t* ret = calloc(1,4);
		*ret = *recv + 32;
		return ret;
	`}

	# Two `UnicodeChar` are equal when they have the same `code_point`
	redef fun ==(o)
	do
		if not o isa UnicodeChar then return false
		if o.code_point == self.code_point then return true
		return false
	end

	# Prints the UTF-8 bytes of the character with `printf`
	redef fun output import UnicodeChar.len `{
		uint32_t self = *recv;
		// Put the bytes back in most-significant-first order so that
		// s[0..3] are the UTF-8 bytes, right-aligned.
		if(!IS_BIG_ENDIAN){
			uint32_t tmp = ntohl(self);
			memcpy(&self, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &self;
		switch(UnicodeChar_len(recv)){
			case 1:
				printf("%c", s[3]);
				break;
			case 2:
				printf("%c%c", s[2], s[3]);
				break;
			case 3:
				printf("%c%c%c", s[1], s[2], s[3]);
				break;
			case 4:
				printf("%c%c%c%c", s[0], s[1], s[2], s[3]);
				break;
		}
	`}

	# Builds a one-character `FlatString` from the UTF-8 bytes of `self`
	redef fun to_s: FlatString import FlatString.full, UnicodeChar.len `{
		int len = UnicodeChar_len(recv);
		char* r = malloc(len + 1);
		r[len] = '\0';
		uint32_t src = *recv;
		// Same byte reordering as in `output`
		if(!IS_BIG_ENDIAN){
			uint32_t tmp = htonl(src);
			memcpy(&src, &tmp, 4);
		}
		unsigned char* s = (unsigned char*) &src;
		switch(len){
			case 1: memcpy(r, s+3, 1); break;
			case 2: memcpy(r, s+2, 2); break;
			case 3: memcpy(r, s+1, 3); break;
			case 4: memcpy(r, s, 4); break;
		}
		// Arguments: items, from, to, bytelen, length (a single char)
		return new_FlatString_full(r, 0, len - 1, len, 1);
	`}
end

redef class FlatString

	# Length in bytes of the string (e.g. the length of the C string)
	var bytelen: Int

	# Length in characters, computed by `length_l` on first access
	redef var length = length_l is lazy

	# Builds a string when both the byte length and the char length are known
	#
	# `from` and `to` are the first and last byte positions in `items`.
	private init full(items: NativeString, from, to, bytelen, len: Int)
	do
		self.items = items
		index_from = from
		index_to = to
		self.bytelen = bytelen
		length = len
	end

	# Length implementation
	#
	# Walks the bytes from `index_from` to `index_to` and counts one char
	# per leading byte, skipping as many bytes as the leading byte announces.
	private fun length_l: Int import FlatString.items, FlatString.index_to, FlatString.index_from `{
		char* ns = FlatString_items(recv);
		int i = FlatString_index_from(recv);
		int max = FlatString_index_to(recv);
		int length = 0;
		while(i <= max){
			char c = ns[i];
			if((c & 0x80) == 0) {
				i += 1;
			} else if((c & 0xE0) == 0xC0) {
				i += 2;
			} else if((c & 0xF0) == 0xE0) {
				i += 3;
			} else if((c & 0xF8) == 0xF0) {
				// 11110xxx : mask on the five high bits, a 0xF7 mask
				// would miss the leading bytes 0xF1-0xF7.
				i += 4;
			} else {
				// Invalid byte, counted as a char of its own
				i += 1;
			}
			length ++;
		}
		return length;
	`}

	# Builds a string when only the byte length is known
	#
	# `length` is left to its lazy computation.
	private init with_bytelen(items: NativeString, index_from: Int, index_to: Int, bytelen: Int)
	do
		self.items = items
		self.index_from = index_from
		self.index_to = index_to
		self.bytelen = bytelen
	end

	# Returns a null-terminated copy of the `bytelen` bytes of the string
	#
	# The copy is stored in `real_items` and reused by later calls.
	redef fun to_cstring
	do
		if real_items != null then return real_items.as(not null)
		var new_items = calloc_string(bytelen + 1)
		self.items.copy_to(new_items, bytelen, index_from, 0)
		new_items[bytelen] = '\0'
		self.real_items = new_items
		return new_items
	end
end

redef class NativeString

	# Wraps `self` in a `FlatString`, `cstring_length` gives the byte length
	redef fun to_s: FlatString
	do
		var len = cstring_length
		return to_s_with_length(len)
	end

	# Wraps the first `len` bytes of `self` in a `FlatString`, without copy
	redef fun to_s_with_length(len: Int): FlatString
	do
		return new FlatString.with_bytelen(self, 0, len - 1, len)
	end

	# Copies `self` to a new buffer and wraps the copy in a `FlatString`
	redef fun to_s_with_copy
	do
		var length = cstring_length
		var new_self = calloc_string(length + 1)
		copy_to(new_self, length, 0, 0)
		return new FlatString.with_bytelen(new_self, 0, length - 1, length)
	end
end

redef class OFStream

	# Writes `s` to the stream
	#
	# `FlatString` data is written with its byte length (`bytelen`), since
	# its `length` counts characters and is smaller than the byte length as
	# soon as the string holds a multi-byte character.
	redef fun write(s)
	do
		assert _writable
		if s isa FlatText then
			if s isa FlatString then
				write_native(s.to_cstring, s.bytelen)
			else
				write_native(s.to_cstring, s.length)
			end
		else
			for i in s.substrings do
				# Same rule for each part of a non-flat text
				if i isa FlatString then
					write_native(i.to_cstring, i.bytelen)
				else
					write_native(i.to_cstring, i.length)
				end
			end
		end
	end