From 58ec3d46d8904fa050b9de1c42e68cd171ce26c5 Mon Sep 17 00:00:00 2001 From: Lucas Bajolet Date: Tue, 5 Apr 2016 16:39:02 -0400 Subject: [PATCH] Codecs: Refactor API Signed-off-by: Lucas Bajolet --- lib/core/codecs/codec_base.nit | 40 ++++++++++++++------- lib/core/codecs/codecs.nit | 1 + lib/core/codecs/iso8859_1.nit | 78 ++++++++++++++++++++++++++++++++++++++++ lib/core/codecs/utf8.nit | 71 ++++++++++++++++++++++++++---------- lib/core/stream.nit | 4 +-- 5 files changed, 161 insertions(+), 33 deletions(-) create mode 100644 lib/core/codecs/iso8859_1.nit diff --git a/lib/core/codecs/codec_base.nit b/lib/core/codecs/codec_base.nit index b4a9523..0d7af31 100644 --- a/lib/core/codecs/codec_base.nit +++ b/lib/core/codecs/codec_base.nit @@ -24,28 +24,44 @@ module codec_base import text import bytes -# Codes UTF-8 entities to an external format -abstract class Coder +# Encodes UTF-8 entities to an external format and decodes them back to UTF-8 +abstract class Codec + # Maximum size of a character in the supported encoding + fun char_max_size: Int is abstract # Transforms `c` to its representation in the format of `self` - fun code_char(c: Char): Bytes is abstract + fun encode_char(c: Char): NativeString is abstract # Adds a char `c` to bytes `s` - fun add_char_to(c: Char, s: Bytes) is abstract + # + # Returns the number of bytes written to `s` + fun add_char_to(c: Char, s: NativeString): Int is abstract # Transforms `s` to the format of `self` - fun code_string(s: Text): Bytes is abstract + fun encode_string(s: Text): Bytes is abstract - # Adds a string `s` to bytes `b` - fun add_string_to(s: Text, b: Bytes) is abstract -end + # Adds a string `s`, encoded in the supported encoding, to `b` + # + # Returns the number of bytes written to `b` + fun add_string_to(s: Text, b: Bytes): Int is abstract + + # Size of a codet for the target encoding + fun codet_size: Int is abstract + + # How many lookaheads might be required to decode a single char ? 
+ fun max_lookahead: Int is abstract -# Decodes entities in an external format to UTF-8 -abstract class Decoder + # Is the sequence of bytes in `ns` at `position` a valid Char ? NOTE(review): the redefinitions in this patch name this argument `len`, and UTF8Codec uses it as a byte count; confirm the intended meaning + # + # Returns either + # * 0 if valid + # * 1 if incomplete + # * 2 if invalid + fun is_valid_char(ns: NativeString, position: Int): Int is abstract # Decodes a char from `b` to a Unicode code-point - fun decode_char(b: Bytes): Char is abstract + fun decode_char(b: NativeString): Char is abstract # Decodes a string `b` to UTF-8 - fun decode_string(b: Bytes): String is abstract + fun decode_string(b: NativeString, len: Int): String is abstract end diff --git a/lib/core/codecs/codecs.nit b/lib/core/codecs/codecs.nit index 25e9931..b3980bf 100644 --- a/lib/core/codecs/codecs.nit +++ b/lib/core/codecs/codecs.nit @@ -17,3 +17,4 @@ module codecs import codec_base import utf8 +import iso8859_1 diff --git a/lib/core/codecs/iso8859_1.nit b/lib/core/codecs/iso8859_1.nit new file mode 100644 index 0000000..02c39ef --- /dev/null +++ b/lib/core/codecs/iso8859_1.nit @@ -0,0 +1,78 @@ +# This file is part of NIT (http://www.nitlanguage.org). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Codec for ISO8859-1 I/O +module iso8859_1 + +import codec_base +intrude import bytes + +private class ISO88591Codec + super Codec + + redef fun char_max_size do return 1 + + redef fun codet_size do return 1 + + redef fun max_lookahead do return 1 + + redef fun encode_char(c) do + var ns = new NativeString(c.u8char_len) + add_char_to(c, ns) + return ns + end + + redef fun add_char_to(c, stream) do + var cp = if c.code_point <= 255 then c else '?' + stream[0] = cp.ascii + return 1 + end + + redef fun encode_string(s) do + var ns = new Bytes.with_capacity(s.bytelen) + add_string_to(s, ns) + return ns + end + + redef fun add_string_to(s, b) do + var pos = 0 + for i in s.chars do + var cp = i.code_point + if cp <= 255 then + b[pos] = cp.to_b + else + b[pos] = 0x3Fu8 + end + pos += 1 + end + return pos + end + + redef fun is_valid_char(ns, len) do + return 0 + end + + redef fun decode_char(b) do + return b[0].to_i.code_point + end + + redef fun decode_string(b, len) do + var buf = new Bytes.with_capacity(len) + for i in [0 .. 
len[ do buf.add_char(b[i].to_i.code_point) + return buf.to_s + end +end + +# Returns the instance of an ISO8859-1 Codec +fun iso88591_codec: Codec do return once new ISO88591Codec diff --git a/lib/core/codecs/utf8.nit b/lib/core/codecs/utf8.nit index 65f2fc9..bb257dd 100644 --- a/lib/core/codecs/utf8.nit +++ b/lib/core/codecs/utf8.nit @@ -16,35 +16,68 @@ module utf8 import codec_base +intrude import text::flat +intrude import bytes -# Returns UTF-8 entities as-is -private class UTF8Coder - super Coder +# Codec supporting UTF-8 +private class UTF8Codec + super Codec - redef fun code_char(c) do return c.to_s.to_bytes + redef fun char_max_size do return 4 - redef fun add_char_to(c, stream) do c.to_s.append_to_bytes(stream) + redef fun codet_size do return 1 - redef fun code_string(s) do return s.to_bytes + redef fun max_lookahead do return 4 - redef fun add_string_to(s, b) do s.append_to_bytes(b) -end + redef fun encode_char(c) do + var ns = new NativeString(c.u8char_len) + add_char_to(c, ns) + return ns + end -# Decodes entities in an external format to UTF-8 -private class UTF8Decoder - super Decoder + redef fun add_char_to(c, stream) do + c.u8char_tos(stream, c.u8char_len) + return c.u8char_len + end + + redef fun encode_string(s) do + var buf = new Bytes.with_capacity(s.bytelen) + add_string_to(s, buf) + return buf + end + + redef fun add_string_to(s, b) do + s.append_to_bytes(b) + return s.bytelen + end + + redef fun is_valid_char(ns, len) do + if len == 0 then return 2 + if not ns[0].is_valid_utf8_start then return 2 + for i in [1 .. 
len[ do if ns[i] & 0b1100_0000u8 != 0b1000_0000u8 then return 2 + if len != ns[0].u8len then return 1 + return 0 + end redef fun decode_char(b) do - var s = b.to_s - return s[0] + var c = b.char_at(0) + var cp = c.code_point + if cp >= 0xD800 and cp <= 0xDFFF then return 0xFFFD.code_point + if cp == 0xFFFE or cp == 0xFFFF then return 0xFFFD.code_point + return c end - redef fun decode_string(b) do - return b.to_s + redef fun decode_string(ns, len) do + var ret = ns.to_s_with_length(len) + var rit = ret.as(FlatString).items + if rit == ns then + var nns = new NativeString(len) + rit.copy_to(nns, len, 0, 0) + return nns.to_s_full(ret.bytelen, ret.length) + end + return ret end end -# Returns the instance of a UTF-8 Coder -fun utf8_coder: Coder do return once new UTF8Coder -# Returns the instance of a UTF-8 Decoder -fun utf8_decoder: Decoder do return once new UTF8Decoder +# Returns the instance of a UTF-8 Codec +fun utf8_codec: Codec do return once new UTF8Codec diff --git a/lib/core/stream.nit b/lib/core/stream.nit index 19ed842..8a75543 100644 --- a/lib/core/stream.nit +++ b/lib/core/stream.nit @@ -66,7 +66,7 @@ abstract class Reader super Stream # Decoder used to transform input bytes to UTF-8 - var decoder: Decoder = utf8_decoder is writable + var decoder: Codec = utf8_codec is writable # Reads a character. Returns `null` on EOF or timeout fun read_char: nullable Char is abstract @@ -406,7 +406,7 @@ abstract class Writer super Stream # The coder from a nit UTF-8 String to the output file - var coder: Coder = utf8_coder is writable + var coder: Codec = utf8_codec is writable # Writes bytes from `s` fun write_bytes(s: Bytes) is abstract -- 1.7.9.5