src: Compiler, interpreter and parser updates for UTF-8
author Lucas Bajolet <r4pass@hotmail.com>
Fri, 10 Jul 2015 20:15:52 +0000 (16:15 -0400)
committer Lucas Bajolet <r4pass@hotmail.com>
Thu, 23 Jul 2015 15:17:32 +0000 (11:17 -0400)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

src/compiler/abstract_compiler.nit
src/interpreter/naive_interpreter.nit
src/parser/lexer_work.nit

index 43686ae..3dcfab1 100644 (file)
@@ -1494,8 +1494,12 @@ abstract class AbstractCompilerVisitor
        fun char_instance(value: Char): RuntimeVariable
        do
                var t = mmodule.char_type
-               var res = new RuntimeVariable("'{value.to_s.escape_to_c}'", t, t)
-               return res
+
+               if value.ascii < 128 then
+                       return new RuntimeVariable("'{value.to_s.escape_to_c}'", t, t)
+               else
+                       return new RuntimeVariable("{value.ascii}", t, t)
+               end
        end
 
        # Generate a float value
@@ -1537,7 +1541,7 @@ abstract class AbstractCompilerVisitor
                var native_mtype = mmodule.native_string_type
                var nat = self.new_var(native_mtype)
                self.add("{nat} = \"{string.escape_to_c}\";")
-               var length = self.int_instance(string.length)
+               var length = self.int_instance(string.bytelen)
                self.add("{res} = {self.send(self.get_property("to_s_with_length", native_mtype), [nat, length]).as(not null)};")
                self.add("{name} = {res};")
                self.add("\}")
@@ -2157,10 +2161,7 @@ redef class AMethPropdef
                                return true
                        end
                else if cname == "Char" then
-                       if pname == "output" then
-                               v.add("printf(\"%c\", ((unsigned char){arguments.first}));")
-                               return true
-                       else if pname == "object_id" then
+                       if pname == "object_id" then
                                v.ret(v.new_expr("(long){arguments.first}", ret.as(not null)))
                                return true
                        else if pname == "successor" then
index 78bd4e2..61889c7 100644 (file)
@@ -275,10 +275,10 @@ class NaiveInterpreter
        # Return a new native string initialized with `txt`
        fun native_string_instance(txt: String): Instance
        do
-               var instance = native_string_instance_len(txt.length+1)
+               var instance = native_string_instance_len(txt.bytelen+1)
                var val = instance.val
-               val[txt.length] = 0u8
-               txt.to_cstring.copy_to(val, txt.length, 0, 0)
+               val[txt.bytelen] = 0u8
+               txt.to_cstring.copy_to(val, txt.bytelen, 0, 0)
 
                return instance
        end
@@ -298,7 +298,7 @@ class NaiveInterpreter
        fun string_instance(txt: String): Instance
        do
                var nat = native_string_instance(txt)
-               var res = self.send(self.force_get_primitive_method("to_s_with_length", nat.mtype), [nat, self.int_instance(txt.length)])
+               var res = self.send(self.force_get_primitive_method("to_s_with_length", nat.mtype), [nat, self.int_instance(txt.bytelen)])
                assert res != null
                return res
        end
index 0f06bce..94810af 100644 (file)
@@ -164,7 +164,17 @@ class Lexer
                        if sp >= string_len then
                                dfa_state = -1
                        else
+                               # Ugly hack, required because of the way SableCC generates its tables.
+                               # Due to the 0xFFFF limit of a Java char, when a big Nit char is read (i.e.
+                               # code point > 65535), it crashes.
+                               #
+                               # Hence, if a char has a code point <= 255 (ISO 8859 range), it is left as is.
+                               # Otherwise, it is replaced by 255.
+                               # This does not corrupt the lexer and works perfectly on any character.
+                               #
+                               # TL;DR: code points above 255 are clamped to 255 to cope with Java's 16-bit char limit.
                                var c = string[sp].ascii
+                               if c >= 256 then c = 255
                                sp += 1
 
                                var cr = _cr