Merge: Nitcc custom lexer
author Jean Privat <jean@pryen.org>
Mon, 15 Aug 2016 19:19:16 +0000 (15:19 -0400)
committer Jean Privat <jean@pryen.org>
Mon, 15 Aug 2016 19:19:16 +0000 (15:19 -0400)
Proof of concept of how a nitcc lexer can be monkey-patched.

maybe not enough for @ppepos, but it's something.

Pull-Request: #2258
Reviewed-by: Lucas Bajolet <r4pass@hotmail.com>

contrib/nitcc/.gitignore
contrib/nitcc/examples/blob.nit [new file with mode: 0644]
contrib/nitcc/examples/blob.sablecc [new file with mode: 0644]
contrib/nitcc/src/Makefile
lib/nitcc_runtime.nit

index 8d6951d..f7a4467 100644 (file)
@@ -13,3 +13,4 @@ nitcc1
 nitcc
 calc
 minilang
+blob
diff --git a/contrib/nitcc/examples/blob.nit b/contrib/nitcc/examples/blob.nit
new file mode 100644 (file)
index 0000000..696197f
--- /dev/null
@@ -0,0 +1,85 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example of the hijack of a lexer to inject custom behavior.
+# see `blob.sablecc` for the grammar
+import blob_test_parser
+
+redef class Lexer_blob
+
+       # Two contexts: *in blob* (custom) and *not in blob* (normal).
+       # The initial state is *in blob*.
+       var in_blob = true
+
+       # Refine the `next_token` to hijack the lexer.
+       redef fun next_token
+       do
+               if not in_blob then
+                       # Normal lexer
+                       var res = super
+                       # Watch for tokens that trigger a context change
+                       if res isa Nendmark then in_blob = true
+                       return res
+               end
+
+               # Custom lexer
+               # Manage pos, line and col manually
+               # TODO: improve the lexer API
+
+               var pos = pos_start
+               var line = line_start
+               var col = col_start
+               var text = stream
+               var len = text.length
+
+               # Need to count three '{' or the end of text
+               var cpt = 0
+               while pos < len do
+                       var c = text[pos]
+                       if c == '{' then
+                               cpt += 1
+                               if cpt == 3 then
+                                       # Got them, backtrack them.
+                                       pos -= 3
+                                       col -= 3
+                                       break
+                               end
+                       else
+                               cpt = 0
+                       end
+
+                       # Next char, count lines.
+                       pos += 1
+                       col += 1
+                       if c == '\n' then
+                               line += 1
+                               col = 1
+                       end
+               end
+
+               # Manually create the `blob` token
+               var token = new Nblob
+               var position = new Position(pos_start, pos, line_start, line, col_start, col)
+               token.position = position
+               token.text = text.substring(pos_start, pos-pos_start+1)
+
+               # Prepare for the next token
+               pos_start = pos + 1
+               line_start = line
+               col_start = col + 1
+               in_blob = false
+
+               return token
+       end
+end
diff --git a/contrib/nitcc/examples/blob.sablecc b/contrib/nitcc/examples/blob.sablecc
new file mode 100644 (file)
index 0000000..a2dee60
--- /dev/null
@@ -0,0 +1,22 @@
+/* Special lexer that will be hijacked. See blob.nit */
+Grammar blob;
+
+Lexer
+// These tokens are recognized by the genuine lexer
+d = '0'..'9';
+int = d+;
+white = #9..#13 | ' ';
+// Need to name this token, we will use it to change context
+endmark = '}}}';
+
+// Special token that the genuine lexer is expected not to recognize,
+// but that must be known by the parser or the application.
+// TODO: Maybe add a special keyword?
+//       blob = Phony;
+blob = #0;
+
+Parser
+Ignored white;
+ps = p*;
+// The parser does not know that `blob` is phony.
+p = blob | '{{{' int endmark;
index 52356e6..ed5ba5d 100644 (file)
@@ -1,6 +1,6 @@
 NITC=../../../bin/nitc
 
-all: nitcc calc minilang
+all: nitcc calc minilang blob
 
 nitcc_parser_gen: nitcc_parser_gen.nit
        @echo "*** Compile the nitcc bootstrap parser generator -- level 0"
@@ -33,6 +33,12 @@ minilang: nitcc ../examples/minilang.sablecc ../examples/minilang.nit
        ${NITC} ../examples/minilang.nit -v
        printf "10\n42\n" | ./minilang ../examples/minilang.minilang
 
+blob: nitcc ../examples/blob.sablecc ../examples/blob.nit
+       @echo "*** Example program, blob"
+       cd ../examples && ../src/nitcc blob.sablecc
+       ${NITC} ../examples/blob.nit -v
+       ./blob -e "abc {{{ 1 }}} de {{{ 2 }}} { 3 }"
+
 check: tests
 tests:
        cd ../tests && ./run
@@ -42,7 +48,7 @@ clean:
                *.dot *.out \
                nitcc_lexer.nit nitcc_parser.nit nitcc_test_parser.nit nitcc_parser_gen \
                nitcc0 nitcc1 \
-               calc minilang \
+               calc minilang blob \
                ../examples/*.dot ../examples/*.out ../examples/*_lexer.nit ../examples/*_parser.nit ../examples/*_test_parser.nit \
                2>/dev/null || true
 
index ac75d8b..a09bc3c 100644 (file)
@@ -165,16 +165,36 @@ abstract class Lexer
        fun lex: CircularArray[NToken]
        do
                var res = new CircularArray[NToken]
+               loop
+                       var t = next_token
+                       if t != null then res.add t
+                       if t isa NEof or t isa NError then break
+               end
+               return res
+       end
+
+       # Cursor current position (in chars, starting from 0)
+       var pos_start = 0
+
+       # Cursor current line (starting from 1)
+       var line_start = 1
+
+       # Cursor current column (in chars, starting from 1)
+       var col_start = 1
+
+       # Move the cursor and return the next token.
+       #
+       # Returns a `NEof` at the end.
+       # Returns `null` if the token is ignored.
+       fun next_token: nullable NToken
+       do
                var state = start_state
-               var pos = 0
-               var pos_start = 0
-               var pos_end = 0
-               var line = 1
-               var line_start = 1
-               var line_end = 0
-               var col = 1
-               var col_start = 1
-               var col_end = 0
+               var pos = pos_start
+               var pos_end = pos_start - 1
+               var line = line_start
+               var line_end = line_start - 1
+               var col = col_start
+               var col_end = col_start - 1
                var last_state: nullable DFAState = null
                var text = stream
                var length = text.length
@@ -195,39 +215,30 @@ abstract class Lexer
                                next = state.trans(c)
                        end
                        if next == null then
+                               var token
                                if pos_start < length then
                                        if last_state == null then
-                                               var token = new NLexerError
+                                               token = new NLexerError
                                                var position = new Position(pos_start, pos, line_start, line, col_start, col)
                                                token.position = position
                                                token.text = text.substring(pos_start, pos-pos_start+1)
-                                               res.push token
-                                               break
-                                       end
-                                       if not last_state.is_ignored then
+                                       else if not last_state.is_ignored then
                                                var position = new Position(pos_start, pos_end, line_start, line_end, col_start, col_end)
-                                               var token = last_state.make_token(position, text)
-                                               if token != null then res.push(token)
+                                               token = last_state.make_token(position, text)
+                                       else
+                                               token = null
                                        end
-                               end
-                               if pos >= length then
-                                       var token = new NEof
+                               else
+                                       token = new NEof
                                        var position = new Position(pos, pos, line, line, col, col)
                                        token.position = position
                                        token.text = ""
-                                       res.push token
-                                       break
                                end
-                               state = start_state
                                pos_start = pos_end + 1
-                               pos = pos_start
                                line_start = line_end
-                               line = line_start
                                col_start = col_end
-                               col = col_start
 
-                               last_state = null
-                               continue
+                               return token
                        end
                        state = next
                        pos += 1
@@ -237,7 +248,6 @@ abstract class Lexer
                                col = 1
                        end
                end
-               return res
        end
 end