debb3f53d9ba70fe1162f927bdf9cba97006fd6a
[nit.git] / lib / core / codecs / utf8.nit
1 # This file is part of NIT (http://www.nitlanguage.org).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Codec for UTF-8 I/O
16 module utf8
17
18 import codec_base
19 intrude import text::flat
20 intrude import bytes
21
# `Codec` implementation for UTF-8 encoded text
private class UTF8Codec
	super Codec

	# Always 4
	redef fun char_max_size do return 4

	# Always 1
	redef fun codet_size do return 1

	# Always 4
	redef fun max_lookahead do return 4

	# Encodes `c` in a newly allocated `CString` of `c.u8char_len` bytes
	redef fun encode_char(c) do
		var size = c.u8char_len
		var out = new CString(size)
		add_char_to(c, out)
		return out
	end

	# Writes `c` to `stream` through `u8char_tos`
	#
	# Returns `c.u8char_len`.
	redef fun add_char_to(c, stream) do
		var size = c.u8char_len
		c.u8char_tos(stream, size)
		return size
	end

	# Encodes `s` in a new `Bytes` created with a capacity of `s.byte_length`
	redef fun encode_string(s) do
		var out = new Bytes.with_capacity(s.byte_length)
		add_string_to(s, out)
		return out
	end

	# Appends `s` to `b`
	#
	# Returns `s.byte_length`.
	redef fun add_string_to(s, b) do
		s.append_to_bytes(b)
		var written = s.byte_length
		return written
	end

	# Checks the `len` first bytes of `ns`
	#
	# Returns:
	#
	# * 2 if `len` is 0, if `ns[0]` is not a valid UTF-8 start, or if any of
	#   the bytes at positions 1 to `len - 1` does not match `10xxxxxx`
	# * 1 if `len` differs from `ns[0].u8len`
	# * 0 otherwise
	redef fun is_valid_char(ns, len) do
		if len == 0 then return 2
		var first = ns[0]
		if not first.is_valid_utf8_start then return 2
		var pos = 1
		while pos < len do
			# The two high bits of every following byte must be `10`
			if ns[pos] & 0b1100_0000u8 != 0b1000_0000u8 then return 2
			pos += 1
		end
		if len == first.u8len then return 0
		return 1
	end

	# Reads the character at position 0 of `b`
	#
	# Code points in `[0xD800, 0xDFFF]`, `0xFFFE` and `0xFFFF` are replaced
	# by the character of code point `0xFFFD`.
	redef fun decode_char(b) do
		var c = b.char_at(0)
		var cp = c.code_point
		var in_surrogates = cp >= 0xD800 and cp <= 0xDFFF
		var is_rejected = cp == 0xFFFE or cp == 0xFFFF
		if in_surrogates or is_rejected then return 0xFFFD.code_point
		return c
	end

	# Builds a `String` from the `len` first bytes of `ns`
	#
	# If the `items` of the built string is `ns` itself, the bytes are copied
	# to a new `CString` and the string is rebuilt on that copy, so the result
	# is not backed by `ns`.
	redef fun decode_string(ns, len) do
		var str = ns.to_s_with_length(len)
		var items = str.as(FlatString).items
		if items != ns then return str
		var copy = new CString(len)
		items.copy_to(copy, len, 0, 0)
		return copy.to_s_full(str.byte_length, str.length)
	end
end
81
# The UTF-8 `Codec`
#
# The `UTF8Codec` is created inside a `once` expression.
fun utf8_codec: Codec
do
	return once new UTF8Codec
end