lib/string_exp/utf8: Introducing UTF-8 basics for String.
author Lucas Bajolet <r4pass@hotmail.com>
Mon, 21 Jul 2014 14:18:48 +0000 (10:18 -0400)
committer Lucas Bajolet <r4pass@hotmail.com>
Tue, 29 Jul 2014 16:00:11 +0000 (12:00 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/string_experimentations/utf8.nit [new file with mode: 0644]

diff --git a/lib/string_experimentations/utf8.nit b/lib/string_experimentations/utf8.nit
new file mode 100644 (file)
index 0000000..3a57641
--- /dev/null
@@ -0,0 +1,109 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Introduces UTF-8 as internal encoding for Strings in Nit.
+module utf8
+
+intrude import standard::string
+
+in "C Header" `{
+
+// NOTE(review): `malloc` is called by the extern methods of this module but
+// <stdlib.h> is not included here -- confirm the generated glue code provides it.
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+// C-side representation backing both `UnicodeChar` and `StringIndex`:
+// a byte offset paired with the buffer that offset refers to.
+typedef struct {
+       // Offset in `ns` of the first byte of the character
+       long pos;
+       // Buffer containing the bytes of the character
+       char* ns;
+} UTF8Char;
+
+`}
+
+# UTF-8 char as defined in RFC-3629, i.e. 1-4 Bytes
+#
+# A UTF-8 char has its bytes stored in a NativeString (char*)
+extern class UnicodeChar `{ UTF8Char* `}
+
+       # Builds the char whose first byte is at offset `pos` in `ns`
+       #
+       # `ns` is stored as-is, its content is not copied.
+       new(pos: Int, ns: NativeString) `{
+               UTF8Char* u = malloc(sizeof(UTF8Char));
+               u->pos = pos;
+               u->ns = ns;
+               return u;
+       `}
+
+       # Real length of the char in UTF8
+       #
+       # The length is deduced from the leading byte only, the continuation
+       # bytes are not checked.
+       # An invalid leading byte is reported as a char of length 1.
+       #
+       # As per the specification :
+       #
+       #  Length  |        UTF-8 octet sequence
+       #          |              (binary)
+       # ---------+-------------------------------------------------
+       #  1       | 0xxxxxxx
+       #  2       | 110xxxxx 10xxxxxx
+       #  3       | 1110xxxx 10xxxxxx 10xxxxxx
+       #  4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+       private fun len: Int `{
+               // Read the leading byte as unsigned so the masks below never
+               // operate on a sign-extended value, and index with the `long`
+               // offset directly instead of narrowing it to `int`.
+               unsigned char lead = (unsigned char)recv->ns[recv->pos];
+               if((lead & 0x80) == 0x00){ return 1;}
+               if((lead & 0xE0) == 0xC0){ return 2;}
+               if((lead & 0xF0) == 0xE0){ return 3;}
+               // 0xF8 keeps the five high bits, which must be exactly 11110.
+               // (A 0xF7 mask would reject the valid leading bytes 0xF1-0xF7
+               // and accept the invalid byte 0xF8.)
+               if((lead & 0xF8) == 0xF0){ return 4;}
+               // Invalid character
+               return 1;
+       `}
+
+       # Position in containing NativeString
+       private fun pos: Int `{
+               return recv->pos;
+       `}
+
+       # Sets the position in containing NativeString to `p`
+       private fun pos=(p: Int) `{recv->pos = p;`}
+
+       # C char* wrapping the char
+       fun ns: NativeString `{
+               return recv->ns;
+       `}
+
+       # Copies the bytes of the char into a fresh NUL-terminated buffer
+       # and builds a String of `len` bytes from it
+       redef fun to_s import NativeString.to_s_with_length `{
+               int len = utf8___UnicodeChar_len___impl(recv);
+               // One extra byte for the trailing '\0'
+               char* r = malloc(len + 1);
+               r[len] = '\0';
+               char* src = (recv->ns + recv->pos);
+               memcpy(r, src, len);
+               return NativeString_to_s_with_length(r, len);
+       `}
+end
+
+# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
+#
+# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
+private extern class StringIndex `{ UTF8Char* `}
+
+       # Allocates an index with room for `size` characters
+       #
+       # The entries are not initialized.
+       new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}
+
+       # Sets the character at `index` as `item`
+       #
+       # The content of `item` is copied into the index.
+       fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}
+
+       # Gets the character at position `id`
+       #
+       # The result points inside the index, it is not a copy.
+       fun [](id: Int): UnicodeChar `{ return &recv[id]; `}
+
+       # Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
+       #
+       # `my_from`, `its_from` and `length` are all counted in characters (entries of the index).
+       # The source and destination ranges must not overlap.
+       fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
+               // Pointer arithmetic on `UTF8Char*` is already scaled by the
+               // size of an entry, so the offsets are plain entry counts.
+               UTF8Char* myfrom = recv + my_from;
+               UTF8Char* itsfrom = other + its_from;
+               // memcpy expects a size in bytes whereas `length` counts entries
+               memcpy(itsfrom, myfrom, length * sizeof(UTF8Char));
+       `}
+end