3a57641905464562dd73bf15d9a5ac0c86005c3c
[nit.git] / lib / string_experimentations / utf8.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Introduces UTF-8 as internal encoding for Strings in Nit.
16 module utf8
17
18 intrude import standard::string
19
in "C Header" `{

	#include <stdio.h>
	#include <stdlib.h> /* malloc, used by the extern constructors of this module */
	#include <string.h>
	#include <stdint.h>

	/* C-side representation shared by `UnicodeChar` and `StringIndex`. */
	typedef struct {
		long pos;  /* byte offset of the char inside `ns` */
		char* ns;  /* native string holding the bytes of the char */
	} UTF8Char;

`}
32
# UTF-8 char as defined in RFC 3629, i.e. encoded on 1 to 4 bytes
#
# The bytes of the char are not copied: an instance only records the
# `NativeString` (char*) that holds them and the byte offset where they start.
extern class UnicodeChar `{ UTF8Char* `}

	# Wraps the char starting at byte offset `pos` in `ns`
	#
	# `ns` is stored as is (the pointer is kept, the bytes are not duplicated).
	new(pos: Int, ns: NativeString) `{
		UTF8Char* u = malloc(sizeof(UTF8Char));
		u->pos = pos;
		u->ns = ns;
		return u;
	`}

	# Real length of the char in UTF8
	#
	# As per the specification :
	#
	#     Length  | UTF-8 octet sequence
	#             | (binary)
	#    ---------+-------------------------------------------------
	#     1       | 0xxxxxxx
	#     2       | 110xxxxx 10xxxxxx
	#     3       | 1110xxxx 10xxxxxx 10xxxxxx
	#     4       | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	#
	# Only the leading byte is inspected. A leading byte that matches none
	# of the patterns above is reported as having a length of 1.
	private fun len: Int `{
		// Index with the `long` position directly (no narrowing to int) and
		// read the leading byte as unsigned so the masks act on its 8 bits.
		unsigned char lead = (unsigned char)recv->ns[recv->pos];
		if((lead & 0x80) == 0x00){ return 1;}
		if((lead & 0xE0) == 0xC0){ return 2;}
		if((lead & 0xF0) == 0xE0){ return 3;}
		// 11110xxx: the mask must keep the five high bits (0xF8).
		// A mask of 0xF7 would reject the valid leads 0xF1..0xF7 and
		// accept the invalid lead 0xF8.
		if((lead & 0xF8) == 0xF0){ return 4;}
		// Invalid character
		return 1;
	`}

	# Position in containing NativeString
	private fun pos: Int `{
		return recv->pos;
	`}

	# Sets the position in the containing NativeString to `p`
	private fun pos=(p: Int) `{recv->pos = p;`}

	# C char* wrapping the char
	fun ns: NativeString `{
		return recv->ns;
	`}

	# Builds a String from a fresh, NUL-terminated copy of the bytes of the char
	redef fun to_s import NativeString.to_s_with_length `{
		// Byte length of the char, from the C implementation of `len`
		int len = utf8___UnicodeChar_len___impl(recv);
		// One extra byte for the terminating NUL
		char* r = malloc(len + 1);
		r[len] = '\0';
		char* src = (recv->ns + recv->pos);
		memcpy(r, src, len);
		return NativeString_to_s_with_length(r, len);
	`}
end
89
# A `StringIndex` is used to keep track of the position of characters in a `FlatString` object
#
# It becomes mandatory for UTF-8 strings since characters do not have a fixed size.
#
# On the C side it is a plain array of `UTF8Char` entries.
private extern class StringIndex `{ UTF8Char* `}

	# Allocates an index with room for `size` entries
	#
	# The entries are not initialized.
	new(size: Int) `{ return malloc(size*sizeof(UTF8Char)); `}

	# Sets the character at `index` as `item`
	#
	# The content of `item` (position and native string pointer) is copied into the entry.
	fun []=(index: Int, item: UnicodeChar) `{ recv[index] = *item; `}

	# Gets the character at position `id`
	#
	# The result points to the entry stored in the index, it is not a copy.
	fun [](id: Int): UnicodeChar `{ return &recv[id]; `}

	# Copies a part of self starting at index `my_from` of length `length` into `other`, starting at `its_from`
	#
	# `my_from`, `its_from` and `length` are all counted in entries (characters), not in bytes.
	fun copy_to(other: StringIndex, my_from: Int, its_from: Int, length: Int)`{
		// Arithmetic on a `UTF8Char*` already advances by whole entries:
		// multiplying the offsets by sizeof(UTF8Char) would scale them twice.
		UTF8Char* myfrom = recv + my_from;
		UTF8Char* itsfrom = other + its_from;
		// memcpy counts in bytes, so convert the number of entries.
		memcpy(itsfrom, myfrom, length * sizeof(UTF8Char));
	`}
end