lib/json: Added ad-hoc string parser for JSON
authorLucas Bajolet <r4pass@hotmail.com>
Fri, 11 Dec 2015 21:35:53 +0000 (16:35 -0500)
committerLucas Bajolet <r4pass@hotmail.com>
Wed, 16 Dec 2015 15:58:29 +0000 (10:58 -0500)
Signed-off-by: Lucas Bajolet <r4pass@hotmail.com>

lib/core/text/native.nit
lib/json/static.nit
lib/json/string_parser.nit [new file with mode: 0644]
lib/parser_base.nit

index 4028ea0..9ea8382 100644 (file)
@@ -31,6 +31,21 @@ redef class Byte
        end
 end
 
+redef class Int
+       # Decodes the UTF-16 surrogate pair packed in `self` into a code point
+       #
+       # The high surrogate is read from bits 16..31, the low one from bits 0..15.
+       #
+       #     assert 0xD83DDE02.from_utf16_surr == 0x1F602
+       fun from_utf16_surr: Int do
+               var high = (self & 0xFFFF0000) >> 16
+               var low = self & 0xFFFF
+               # Each surrogate contributes 10 bits, offset by 0x10000.
+               var offset = ((high - 0xD800) << 10) + (low - 0xDC00)
+               return offset + 0x10000
+       end
+end
+
 # Native strings are simple C char *
 extern class NativeString `{ char* `}
        # Creates a new NativeString with a capacity of `length`
index 6f2a9e4..15a4355 100644 (file)
@@ -47,7 +47,7 @@ interface Jsonable
        # avoid cyclic references between `append_json` and `to_json` when none are
        # implemented.
        protected fun to_json_by_append: String do
-               var buffer = new RopeBuffer
+               var buffer = new FlatBuffer
                append_json(buffer)
                return buffer.to_s
        end
@@ -89,6 +89,22 @@ end
 redef class Text
        super Jsonable
 
+       # Removes JSON-escaping if necessary in a JSON string
+       #
+       # `self` is returned as-is when `json_need_escape` reports nothing to unescape.
+       #
+       #     assert "\\\"string\\uD83D\\uDE02\\\"".unescape_json == "\"string😂\""
+       fun unescape_json: Text do
+               if json_need_escape then return json_to_nit_string
+               return self
+       end
+
+       # Does `self` need treatment from JSON to Nit ?
+       #
+       # True when `self` holds at least one `\` character.
+       #
+       #     assert not "string".json_need_escape
+       #     assert "\\\"string\\\"".json_need_escape
+       protected fun json_need_escape: Bool do
+               var backslash = '\\'
+               return has(backslash)
+       end
+
        redef fun append_json(buffer) do
                buffer.add '\"'
                for i in [0 .. self.length[ do
@@ -97,8 +113,6 @@ redef class Text
                                buffer.append "\\\\"
                        else if char == '\"' then
                                buffer.append "\\\""
-                       else if char == '\/' then
-                               buffer.append "\\/"
                        else if char < ' ' then
                                if char == '\n' then
                                        buffer.append "\\n"
@@ -106,10 +120,6 @@ redef class Text
                                        buffer.append "\\r"
                                else if char == '\t' then
                                        buffer.append "\\t"
-                               else if char == 0x0C.code_point then
-                                       buffer.append "\\f"
-                               else if char == 0x08.code_point then
-                                       buffer.append "\\b"
                                else
                                        buffer.append char.escape_to_utf16
                                end
@@ -120,13 +130,66 @@ redef class Text
                buffer.add '\"'
        end
 
+       # Unescapes `self` from a JSON string to a Nit string
+       #
+       # Handles the `\b`, `\f`, `\n`, `\r`, `\t` and `\uXXXX` escapes; any other
+       # escaped character (`"`, `/`, `\`) is copied as-is. A `\uXXXX` value in the
+       # surrogate range is combined with the following `\uXXXX` escape, or
+       # replaced with U+FFFD when fewer than 4 characters remain for it.
+       # A lone `\` at the very end of `self` is kept verbatim.
+       #
+       #     assert "\\\"string\\\"".json_to_nit_string == "\"string\""
+       #     assert "\\nEscape\\t\\n".json_to_nit_string == "\nEscape\t\n"
+       #     assert "\\u0041zu\\uD800\\uDFD3".json_to_nit_string == "Azu𐏓"
+       protected fun json_to_nit_string: String do
+               var res = new FlatBuffer.with_capacity(bytelen)
+               var ln = self.length
+               var i = 0
+               while i < ln do
+                       var char = self[i]
+                       if char == '\\' then
+                               i += 1
+                               # Nothing follows the `\`: keep it instead of reading past the end.
+                               if i >= ln then
+                                       res.add char
+                                       break
+                               end
+                               char = self[i]
+                               if char == 'b' then
+                                       char = 0x08.code_point
+                               else if char == 'f' then
+                                       char = 0x0C.code_point
+                               else if char == 'n' then
+                                       char = '\n'
+                               else if char == 'r' then
+                                       char = '\r'
+                               else if char == 't' then
+                                       char = '\t'
+                               else if char == 'u' then
+                                       var code = substring(i + 1, 4)
+                                       var hx = code.to_hex
+                                       if hx >= 0xD800 and hx <= 0xDFFF then
+                                               # Skip the 4 hex digits plus the `\u` of the second escape.
+                                               var lostr = substring(i + 7, 4)
+                                               if lostr.length < 4 then
+                                                       hx = 0xFFFD
+                                               else
+                                                       hx <<= 16
+                                                       hx += lostr.to_hex
+                                                       hx = hx.from_utf16_surr
+                                               end
+                                               i += 6
+                                       end
+                                       i += 4
+                                       char = hx.code_point
+                               end
+                               # `"`, `/` or `\` => Keep `char` as-is.
+                       end
+                       res.add char
+                       i += 1
+               end
+               return res.to_s
+       end
+
+
        # Encode `self` in JSON.
        #
        # ~~~
        # assert "\t\"http://example.com\"\r\n\0\\".to_json ==
-       #     "\"\\t\\\"http:\\/\\/example.com\\\"\\r\\n\\u0000\\\\\""
+       #     "\"\\t\\\"http://example.com\\\"\\r\\n\\u0000\\\\\""
        # ~~~
-       redef fun to_json do return to_json_by_append
+       redef fun to_json do
+               var b = new FlatBuffer.with_capacity(bytelen)
+               append_json(b)
+               return b.to_s
+       end
 
        # Parse `self` as JSON.
        #
@@ -173,6 +236,16 @@ redef class Text
        end
 end
 
+redef class FlatText
+       # Scans the bytes of `self` between `first_byte` and `last_byte` for a `\`
+       redef fun json_need_escape do
+               var bytes = items
+               var last = last_byte
+               var i = first_byte
+               while i <= last do
+                       # 0x5C is the ASCII code of `\`
+                       if bytes[i] == 0x5Cu8 then return true
+                       i += 1
+               end
+               return false
+       end
+end
+
 redef class Buffer
 
        # Append the JSON representation of `jsonable` to `self`.
@@ -424,51 +497,7 @@ end
 
 redef class Nstring
        # The represented string.
-       private fun to_nit_string: String do
-               var res = new Buffer
-               var i = 1
-               while i < text.length - 1 do
-                       var char = text[i]
-                       if char == '\\' then
-                               i += 1
-                               char = text[i]
-                               if char == 'b' then
-                                       char = 0x08.code_point
-                               else if char == 'f' then
-                                       char = 0x0C.code_point
-                               else if char == 'n' then
-                                       char = '\n'
-                               else if char == 'r' then
-                                       char = '\r'
-                               else if char == 't' then
-                                       char = '\t'
-                               else if char == 'u' then
-                                       var escape = new Buffer
-                                       escape.append "\\u"
-                                       var code = text.substring(i + 1, 4)
-                                       escape.append code
-                                       var hx = code.to_hex
-                                       if hx >= 0xD800 and hx <= 0xDFFF then
-                                               var lostr = text.substring(i + 7, 4)
-                                               if lostr.length < 4 then
-                                                       escape.clear
-                                                       escape.append "\\uFFFD"
-                                               else
-                                                       escape.append "\\u"
-                                                       escape.append lostr
-                                               end
-                                               i += 6
-                                       end
-                                       i += 4
-                                       char = escape.from_utf16_escape
-                               end
-                               # `"`, `/` or `\` => Keep `char` as-is.
-                       end
-                       res.add char
-                       i += 1
-               end
-               return res.to_s
-       end
+       private fun to_nit_string: String do
+               # Drop the first and last characters of the token (presumably the
+               # enclosing quotes) before unescaping what is left.
+               var inner = text.substring(1, text.length - 2)
+               return inner.unescape_json.to_s
+       end
 end
 
 redef class Nvalue_object
diff --git a/lib/json/string_parser.nit b/lib/json/string_parser.nit
new file mode 100644 (file)
index 0000000..d429697
--- /dev/null
@@ -0,0 +1,293 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simple ad-hoc implementation of a JSON parser for String inputs
+module string_parser
+
+import parser_base
+import static
+
+redef class Char
+       # Can `self` begin a JSON number, i.e. is it `-` or a numeric character ?
+       private fun is_json_num_start: Bool do return self == '-' or is_numeric
+
+       # Is `self` a valid JSON separator ?
+       #
+       # Separators are `:`, `,`, `{`, `}`, `[`, `]`, `"` and any whitespace.
+       private fun is_json_separator: Bool do
+               if self == ':' or self == ',' then return true
+               if self == '{' or self == '}' then return true
+               if self == '[' or self == ']' then return true
+               return self == '"' or is_whitespace
+       end
+end
+
+# A simple ad-hoc JSON parser
+#
+# To parse a simple JSON document, read it as a String, give it to a new
+# `JSONStringParser` and call `parse_entity` on the parser.
+# NOTE: if your document contains several non-nested entities, call `parse_entity`
+# once for each JSON entity to parse.
+class JSONStringParser
+       super StringProcessor
+
+       # Parses a JSON Entity
+       #
+       # Leading whitespace is skipped, then the first character selects what is
+       # parsed: an array, a string, an object, one of the `false`, `true` and
+       # `null` literals, or a number.
+       #
+       # Returns the parsed value (`null` for the JSON `null` literal), or a
+       # `JsonParseError` when the input is empty or malformed.
+       #
+       # ~~~nit
+       # var p = new JSONStringParser("""{"numbers": [1,23,3], "string": "string"}""")
+       # assert p.parse_entity isa JsonObject
+       # ~~~
+       fun parse_entity: nullable Jsonable do
+               var srclen = len
+               ignore_whitespaces
+               if pos >= srclen then return make_parse_error("Empty JSON")
+               var c = src[pos]
+               if c == '[' then
+                       pos += 1
+                       return parse_json_array
+               else if c == '"' then
+                       var s = parse_json_string
+                       return s
+               else if c == '{' then
+                       pos += 1
+                       return parse_json_object
+               else if c == 'f' then
+                       # `false` needs 5 characters: stop here rather than index past the end.
+                       if pos + 4 >= srclen then return make_parse_error("Error: bad JSON entity")
+                       if src[pos + 1] == 'a' and src[pos + 2] == 'l' and src[pos + 3] == 's' and src[pos + 4] == 'e' then
+                               pos += 5
+                               return false
+                       end
+                       return make_parse_error("Error: bad JSON entity")
+               else if c == 't' then
+                       # `true` needs 4 characters.
+                       if pos + 3 >= srclen then return make_parse_error("Error: bad JSON entity")
+                       if src[pos + 1] == 'r' and src[pos + 2] == 'u' and src[pos + 3] == 'e' then
+                               pos += 4
+                               return true
+                       end
+                       return make_parse_error("Error: bad JSON entity")
+               else if c == 'n' then
+                       # `null` needs 4 characters.
+                       if pos + 3 >= srclen then return make_parse_error("Error: bad JSON entity")
+                       if src[pos + 1] == 'u' and src[pos + 2] == 'l' and src[pos + 3] == 'l' then
+                               pos += 4
+                               return null
+                       end
+                       return make_parse_error("Error: bad JSON entity")
+               end
+               if not c.is_json_num_start then return make_parse_error("Bad JSON character")
+               return parse_json_number
+       end
+
+       # Parses a JSON Array
+       #
+       # Parsing starts at `pos`, which `parse_entity` leaves just after the
+       # opening `[`. On success `pos` is moved past the closing `]`.
+       # Returns a `JsonArray`, or a `JsonParseError` on malformed input.
+       fun parse_json_array: Jsonable do
+               var srclen = len
+               if pos >= srclen then return make_parse_error("Incomplete JSON array")
+               var res = new JsonArray
+               var cur = src[pos]
+               while cur != ']' do
+                       ignore_whitespaces
+                       if pos >= srclen then return make_parse_error("Incomplete JSON array")
+                       if src[pos] == ']' then break
+                       var item = parse_entity
+                       if item isa JsonParseError then return item
+                       res.add item
+                       ignore_whitespaces
+                       if pos >= srclen then return make_parse_error("Incomplete JSON array")
+                       cur = src[pos]
+                       if cur == ']' then break
+                       if cur != ',' then return make_parse_error("Bad array separator {cur}")
+                       # Step over the `,` to reach the next item.
+                       pos += 1
+               end
+               # Step over the closing `]`.
+               pos += 1
+               return res
+       end
+
+       # Parses a JSON Object
+       #
+       # Parsing starts at `pos`, which `parse_entity` leaves just after the
+       # opening `{`. On success `pos` is moved past the closing `}`.
+       # Returns a `JsonObject`, or a `JsonParseError` on malformed input.
+       fun parse_json_object: Jsonable do
+               var srclen = len
+               if pos >= srclen then return make_parse_error("Incomplete JSON object")
+               var res = new JsonObject
+               var cur = src[pos]
+               while cur != '}' do
+                       ignore_whitespaces
+                       if pos >= srclen then return make_parse_error("Malformed JSON object")
+                       if src[pos] == '}' then break
+                       # Only strings are accepted as keys.
+                       var name = parse_entity
+                       if not name isa String then return make_parse_error("Bad key format {name or else "null"}")
+                       ignore_whitespaces
+                       if pos >= srclen then return make_parse_error("Incomplete JSON object")
+                       if src[pos] != ':' then return make_parse_error("Bad key/value separator {src[pos]}")
+                       pos += 1
+                       ignore_whitespaces
+                       var item = parse_entity
+                       if item isa JsonParseError then return item
+                       res[name] = item
+                       ignore_whitespaces
+                       if pos >= srclen then return make_parse_error("Incomplete JSON object")
+                       cur = src[pos]
+                       if cur == '}' then break
+                       if cur != ',' then return make_parse_error("Bad object separator {src[pos]}")
+                       # Step over the `,` to reach the next key.
+                       pos += 1
+               end
+               # Step over the closing `}`.
+               pos += 1
+               return res
+       end
+
+       # Builds a `JsonParseError` carrying `message` and located at `hot_location`
+       protected fun make_parse_error(message: String): JsonParseError do
+               var error = new JsonParseError(message)
+               var where = hot_location
+               error.location = where
+               return error
+       end
+
+       # Parses an Int or Float
+       #
+       # Parsing starts at `pos`, on a character accepted by `is_json_num_start`.
+       # Returns an `Int` when there is no fractional part, a `Float` when a `.`
+       # follows the integer part, or a `JsonParseError` when the number is
+       # truncated or is not followed by a JSON separator or the end of input.
+       # `pos` is only updated on success.
+       #
+       # An exponent (`e` or `E`) is read as a plain run of numeric characters.
+       fun parse_json_number: Jsonable do
+               var max = len
+               var p = pos
+               var c = src[p]
+               var is_neg = false
+               if c == '-' then
+                       is_neg = true
+                       p += 1
+                       if p >= max then return make_parse_error("Bad JSON number")
+                       c = src[p]
+               end
+               # Integer part
+               var val = 0
+               while c.is_numeric do
+                       val *= 10
+                       val += c.to_i
+                       p += 1
+                       if p >= max then break
+                       c = src[p]
+               end
+               if c == '.' then
+                       p += 1
+                       if p >= max then return make_parse_error("Bad JSON number")
+                       c = src[p]
+                       # Fractional part: each digit weighs ten times less than the previous one
+                       var fl = val.to_f
+                       var frac = 0.1
+                       while c.is_numeric do
+                               fl += c.to_i.to_f * frac
+                               frac /= 10.0
+                               p += 1
+                               if p >= max then break
+                               c = src[p]
+                       end
+                       if c == 'e' or c == 'E' then
+                               p += 1
+                               var exp = 0
+                               if p >= max then return make_parse_error("Malformed JSON number")
+                               c = src[p]
+                               while c.is_numeric do
+                                       exp *= 10
+                                       exp += c.to_i
+                                       p += 1
+                                       if p >= max then break
+                                       c = src[p]
+                               end
+                               fl *= (10 ** exp).to_f
+                       end
+                       if p < max and not c.is_json_separator then return make_parse_error("Malformed JSON number")
+                       pos = p
+                       if is_neg then return -fl
+                       return fl
+               end
+               if c == 'e' or c == 'E' then
+                       p += 1
+                       if p >= max then return make_parse_error("Bad JSON number")
+                       # Start from 0: the loop below reads every exponent digit,
+                       # including the first one.
+                       var exp = 0
+                       c = src[p]
+                       while c.is_numeric do
+                               exp *= 10
+                               exp += c.to_i
+                               p += 1
+                               if p >= max then break
+                               c = src[p]
+                       end
+                       val *= (10 ** exp)
+               end
+               if p < max and not src[p].is_json_separator then return make_parse_error("Malformed JSON number")
+               pos = p
+               if is_neg then return -val
+               return val
+       end
+
+       # Parses and returns a Nit string from a JSON String
+       #
+       # Parsing starts at `pos`, on the opening `"`. The string is scanned up to
+       # its closing `"`, checking that every `\u` escape is followed by 4
+       # hexadecimal digits, then its content is unescaped with `unescape_json`.
+       # On success `pos` is moved past the closing `"`.
+       #
+       # Returns the unescaped text, or a `JsonParseError` when the string is
+       # unterminated or holds a truncated or invalid escape sequence.
+       fun parse_json_string: Jsonable do
+               var ln = src.length
+               var p = pos
+               p += 1
+               # The opening `"` was the last character: there is nothing left to read.
+               if p >= ln then return make_parse_error("Malformed JSON String")
+               var c = src[p]
+               var st = p
+               while c != '"' do
+                       if c == '\\' then
+                               if p + 1 >= ln then return make_parse_error("Malformed Escape sequence in JSON string")
+                               p += 1
+                               c = src[p]
+                               if c == 'u' then
+                                       p += 1
+                                       if p + 3 >= ln then return make_parse_error("Bad Unicode escape sequence in string")
+                                       for i in [0 .. 4[ do if not src[p + i].is_hexdigit then return make_parse_error("Bad Unicode escape sequence in string")
+                                       # Land on the last hex digit, the `p += 1` below steps past it.
+                                       p += 3
+                               end
+                       end
+                       p += 1
+                       if p >= ln then return make_parse_error("Malformed JSON String")
+                       c = src[p]
+               end
+               pos = p + 1
+               return src.substring(st, p - st).unescape_json
+       end
+
+       # Ignores any character until a JSON separator is encountered
+       #
+       # Advances `pos` up to the next character for which `is_json_separator`
+       # holds, or up to the end of the source when there is none.
+       fun ignore_until_separator do
+               var max = len
+               while pos < max do
+                       if src[pos].is_json_separator then return
+                       pos += 1
+               end
+       end
+end
+
+redef class Text
+       # Parses `self` as JSON using a `JSONStringParser`
+       redef fun parse_json do
+               var parser = new JSONStringParser(self.to_s)
+               return parser.parse_entity
+       end
+end
+
+redef class JsonParseError
+
+       # Location of the error in source
+       var location: nullable Location = null
+
+       # Get the JSON representation of `self`.
+       #
+       # The location is only mentioned when one has been set.
+       #
+       # ~~~
+       # var err = new JsonParseError("foo", new Position(1, 2, 3, 4, 5, 6))
+       # assert err.to_json == "Parsing error: foo"
+       # ~~~
+       redef fun to_json do
+               var loc = location
+               if loc == null then return "Parsing error: {message}"
+               return "Parsing error at {loc}: {message}"
+       end
+
+       # Same text as `to_json`
+       redef fun to_s do return to_json
+end
index ac002fe..234b733 100644 (file)
@@ -14,57 +14,71 @@ module parser_base
 # Basic facilities for common parser operations on String sources
 class StringProcessor
        # Source document to parse
-       private var src: String
+       protected var src: String
+
+       # Length of the source document
+       protected var len: Int is noinit
 
        # Current position in `src`
-       private var pos = 0
+       protected var pos = 0
 
        # Position at which current line started
-       private var line_start = 0
+       protected var line_start = 0
 
        # Current line in `src`
-       private var line = 1
+       protected var line = 1
 
        # Offset in the current line
-       private fun line_offset: Int do return pos - line_start + 1
+       protected fun line_offset: Int do return pos - line_start + 1
+
+       init do
+               _len = src.length
+       end
 
        # Gives the current location in the `src`
        fun current_location: Location do return new Location(line, line_offset)
 
        # Advances in `src` until a non-whitespace character is encountered
-       private fun ignore_whitespaces do
-               var srclen = src.length
-               if pos >= srclen then return
-               var c = src[pos]
+       protected fun ignore_whitespaces do
+               var srclen = _len
+               var p = _pos
+               if p >= srclen then return
+               var c = src[p]
                while c.is_whitespace do
-                       pos += 1
-                       if pos >= srclen then break
+                       p += 1
+                       if p >= srclen then break
                        if c == '\n' then
-                               line += 1
-                               line_start = pos
+                               _line += 1
+                               _line_start = p
                        end
-                       c = src[pos]
+                       c = src[p]
                end
+               _pos = p
+               return
        end
 
        # Reads characters until pattern `s` is found
-       private fun ignore_until(s: String): Int do
-               if s.length == 0 then return pos
-               var srclen = src.length
-               if pos >= srclen then return -1
+       protected fun ignore_until(s: String): Int do
+               if s.length == 0 then return _pos
+               var srclen = _len
+               var p = _pos
+               if p >= srclen then return -1
                loop
                        var c = s[0]
-                       var src_c = src[pos]
+                       var src_c = src[p]
                        while src_c != c do
-                               pos += 1
-                               if pos >= srclen then return -1
+                               p += 1
+                               if p >= srclen then
+                                       _pos = p
+                                       return -1
+                               end
                                if src_c == '\n' then
                                        line += 1
                                        line_start= pos
                                end
-                               src_c = src[pos]
+                               src_c = src[p]
                        end
-                       var relpos = pos
+                       var relpos = p
                        var fnd = true
                        for i in s do
                                if relpos >= srclen then
@@ -72,15 +86,27 @@ class StringProcessor
                                        break
                                end
                                if src[relpos] != i then
-                                       pos += 1
+                                       p += 1
                                        fnd = false
                                        break
                                end
                                relpos += 1
                        end
-                       if fnd then return pos
+                       if fnd then
+                               _pos = p
+                               return p
+                       end
                end
        end
+
+       # Ignores any printable character until a whitespace is encountered
+       #
+       # Advances `pos` up to the next whitespace, or up to the end of `src` when
+       # there is none, and returns the new `pos`.
+       protected fun ignore_until_whitespace: Int do
+               # Check the bound first so `src` is never indexed past its end.
+               while pos < len and not src[pos].is_whitespace do pos += 1
+               return pos
+       end
+
+       # Builds a `Location` from the current `line` and `line_offset`
+       protected fun hot_location: Location do
+               var here = new Location(line, line_offset)
+               return here
+       end
 end
 
 # Information about the location of an entity in a source document