import text
import bytes
-# Codes UTF-8 entities to an external format
-abstract class Coder
+# Encodes UTF-8 entities to an external format and decodes them back
+abstract class Codec
+ # Maximum size, in bytes, of a single character in the supported encoding
+ fun char_max_size: Int is abstract
# Transforms `c` to its representation in the format of `self`
- fun code_char(c: Char): Bytes is abstract
+ fun encode_char(c: Char): NativeString is abstract
# Adds a char `c` to bytes `s`
- fun add_char_to(c: Char, s: Bytes) is abstract
+ #
+ # Returns the number of bytes written to `s`
+ fun add_char_to(c: Char, s: NativeString): Int is abstract
# Transforms `s` to the format of `self`
- fun code_string(s: Text): Bytes is abstract
+ fun encode_string(s: Text): Bytes is abstract
- # Adds a string `s` to bytes `b`
- fun add_string_to(s: Text, b: Bytes) is abstract
-end
+ # Adds a string `s` coded as the supported encoding to `b`
+ #
+ # Returns the number of bytes written to `b`
+ fun add_string_to(s: Text, b: Bytes): Int is abstract
+
+ # Size of a codet for the target encoding
+ #
+ # NOTE(review): "codet" presumably means a code unit; confirm the unit
+ # (bytes ?) of the returned size.
+ fun codet_size: Int is abstract
+
+ # Maximum number of lookaheads that might be required to decode a single char
+ fun max_lookahead: Int is abstract
-# Decodes entities in an external format to UTF-8
-abstract class Decoder
+ # Is the sequence of bytes in `ns` at `position` a valid Char ?
+ #
+ # Returns either
+ # * 0 if valid
+ # * 1 if incomplete
+ # * 2 if invalid
+ #
+ # NOTE(review): the redefinitions in `iso8859_1` and `utf8` name the second
+ # parameter `len`, and the UTF-8 one uses it as a byte count; confirm
+ # whether a position or a length is intended here.
+ fun is_valid_char(ns: NativeString, position: Int): Int is abstract
# Decodes a char from `b` to a Unicode code-point
- fun decode_char(b: Bytes): Char is abstract
+ fun decode_char(b: NativeString): Char is abstract
# Decodes a string `b` to UTF-8
- fun decode_string(b: Bytes): String is abstract
+ #
+ # `len` is the number of bytes of `b` to decode
+ fun decode_string(b: NativeString, len: Int): String is abstract
end
import codec_base
import utf8
+import iso8859_1
--- /dev/null
+# This file is part of NIT (http://www.nitlanguage.org).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Codec for ISO8859-1 I/O
+module iso8859_1
+
+import codec_base
+intrude import bytes
+
+# Codec for ISO8859-1, where every character is stored on a single byte
+#
+# Characters whose code point is above 255 cannot be represented and are
+# replaced by `?` (0x3F) when encoding.
+private class ISO88591Codec
+	super Codec
+
+	# Every character is encoded on exactly one byte
+	redef fun char_max_size do return 1
+
+	redef fun codet_size do return 1
+
+	redef fun max_lookahead do return 1
+
+	# Encodes `c` in a freshly allocated `NativeString`
+	#
+	# Only the first byte of the returned buffer is written.
+	redef fun encode_char(c) do
+		var ns = new NativeString(c.u8char_len)
+		add_char_to(c, ns)
+		return ns
+	end
+
+	# Writes `c`, or `?` if its code point is above 255, at index 0 of `stream`
+	#
+	# Always returns 1, the number of bytes written.
+	redef fun add_char_to(c, stream) do
+		var cp = if c.code_point <= 255 then c else '?'
+		stream[0] = cp.ascii
+		return 1
+	end
+
+	# Encodes `s` in a new `Bytes`, one byte per character
+	redef fun encode_string(s) do
+		var ns = new Bytes.with_capacity(s.bytelen)
+		add_string_to(s, ns)
+		return ns
+	end
+
+	# Appends the ISO8859-1 encoding of `s` at the end of `b`
+	#
+	# The existing content of `b` is left untouched: bytes are added after
+	# it instead of being written from index 0.
+	#
+	# Returns the number of bytes appended, i.e. one per character of `s`.
+	redef fun add_string_to(s, b) do
+		var written = 0
+		for c in s.chars do
+			var cp = c.code_point
+			if cp <= 255 then
+				b.add(cp.to_b)
+			else
+				# Not representable in ISO8859-1: substitute `?`
+				b.add(0x3Fu8)
+			end
+			written += 1
+		end
+		return written
+	end
+
+	# Always returns 0 (valid); neither `ns` nor `len` is inspected
+	redef fun is_valid_char(ns, len) do
+		return 0
+	end
+
+	# Decodes the first byte of `b` as the code point of the same value
+	redef fun decode_char(b) do
+		return b[0].to_i.code_point
+	end
+
+	# Decodes the `len` first bytes of `b` to a String
+	#
+	# Each byte is mapped to the character whose code point has the same value.
+	redef fun decode_string(b, len) do
+		var buf = new Bytes.with_capacity(len)
+		for i in [0 .. len[ do buf.add_char(b[i].to_i.code_point)
+		return buf.to_s
+	end
+end
+
+# Returns the instance of an ISO8859-1 Codec
+#
+# The codec is built by a `once` expression, so every call yields the same instance.
+fun iso88591_codec: Codec
+do
+	return once new ISO88591Codec
+end
module utf8
import codec_base
+intrude import text::flat
+intrude import bytes
-# Returns UTF-8 entities as-is
-private class UTF8Coder
- super Coder
+# Codec supporting UTF-8
+private class UTF8Codec
+ super Codec
- redef fun code_char(c) do return c.to_s.to_bytes
+ # A character is encoded on at most 4 bytes
+ redef fun char_max_size do return 4
- redef fun add_char_to(c, stream) do c.to_s.append_to_bytes(stream)
+ redef fun codet_size do return 1
- redef fun code_string(s) do return s.to_bytes
+ redef fun max_lookahead do return 4
- redef fun add_string_to(s, b) do s.append_to_bytes(b)
-end
+ # Encodes `c` in a new `NativeString` of `c.u8char_len` bytes
+ redef fun encode_char(c) do
+ var ns = new NativeString(c.u8char_len)
+ add_char_to(c, ns)
+ return ns
+ end
-# Decodes entities in an external format to UTF-8
-private class UTF8Decoder
- super Decoder
+ # Writes `c` to `stream` with `u8char_tos` and returns `c.u8char_len`
+ redef fun add_char_to(c, stream) do
+ c.u8char_tos(stream, c.u8char_len)
+ return c.u8char_len
+ end
+
+ # Returns the bytes of `s` in a new `Bytes`
+ redef fun encode_string(s) do
+ var buf = new Bytes.with_capacity(s.bytelen)
+ add_string_to(s, buf)
+ return buf
+ end
+
+ # Appends `s` to `b` and returns `s.bytelen`
+ redef fun add_string_to(s, b) do
+ s.append_to_bytes(b)
+ return s.bytelen
+ end
+
+ # Checks the `len` first bytes of `ns`
+ #
+ # Returns
+ # * 2 if `len` is 0, if the first byte is not a valid UTF-8 start, or if
+ #   one of the following bytes is not of the form `0b10xx_xxxx`
+ # * 1 if `len` differs from the length announced by the first byte
+ # * 0 otherwise
+ #
+ # NOTE(review): 1 ("incomplete") is also returned when `len` is larger
+ # than the announced length; confirm this is intended.
+ redef fun is_valid_char(ns, len) do
+ if len == 0 then return 2
+ if not ns[0].is_valid_utf8_start then return 2
+ for i in [1 .. len[ do if ns[i] & 0b1100_0000u8 != 0b1000_0000u8 then return 2
+ if len != ns[0].u8len then return 1
+ return 0
+ end
+ # Code points in U+D800..U+DFFF, U+FFFE and U+FFFF are replaced by U+FFFD
redef fun decode_char(b) do
- var s = b.to_s
- return s[0]
+ var c = b.char_at(0)
+ var cp = c.code_point
+ if cp >= 0xD800 and cp <= 0xDFFF then return 0xFFFD.code_point
+ if cp == 0xFFFE or cp == 0xFFFF then return 0xFFFD.code_point
+ return c
end
- redef fun decode_string(b) do
- return b.to_s
+ # Builds a String from the `len` first bytes of `ns`
+ #
+ # When the resulting String uses `ns` itself as its `items`, the bytes are
+ # copied to a new `NativeString` first, presumably so that the result does
+ # not share the caller's buffer (TODO confirm).
+ redef fun decode_string(ns, len) do
+ var ret = ns.to_s_with_length(len)
+ var rit = ret.as(FlatString).items
+ if rit == ns then
+ var nns = new NativeString(len)
+ rit.copy_to(nns, len, 0, 0)
+ return nns.to_s_full(ret.bytelen, ret.length)
+ end
+ return ret
end
end
-# Returns the instance of a UTF-8 Coder
-fun utf8_coder: Coder do return once new UTF8Coder
-# Returns the instance of a UTF-8 Decoder
-fun utf8_decoder: Decoder do return once new UTF8Decoder
+# Returns the instance of a UTF-8 Codec
+#
+# The codec is built by a `once` expression, so every call yields the same instance.
+fun utf8_codec: Codec do return once new UTF8Codec
super Stream
# Decoder used to transform input bytes to UTF-8
- var decoder: Decoder = utf8_decoder is writable
+ var decoder: Codec = utf8_codec is writable
# Reads a character. Returns `null` on EOF or timeout
fun read_char: nullable Char is abstract
super Stream
# The coder from a nit UTF-8 String to the output file
- var coder: Coder = utf8_coder is writable
+ var coder: Codec = utf8_codec is writable
# Writes bytes from `s`
fun write_bytes(s: Bytes) is abstract