src/parser/xss/lexer.xss

   1 $ // This file is part of NIT ( http://www.nitlanguage.org ).
   2 $ //
   3 $ // Copyright 2008 Jean Privat <jean@pryen.org>
   4 $ // Based on algorithms developped for ( http://www.sablecc.org/ ).
   5 $ //
   6 $ // Licensed under the Apache License, Version 2.0 (the "License");
   7 $ // you may not use this file except in compliance with the License.
   8 $ // You may obtain a copy of the License at
   9 $ //
  10 $ //     http://www.apache.org/licenses/LICENSE-2.0
  11 $ //
  12 $ // Unless required by applicable law or agreed to in writing, software
  13 $ // distributed under the License is distributed on an "AS IS" BASIS,
  14 $ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 $ // See the License for the specific language governing permissions and
  16 $ // limitations under the License.
  17
  18 $ template make_lexer()
  19
  20 # The lexer extract NIT tokens from an input stream.
  21 # It is better user with the Parser
  22 class Lexer
  23         super TablesCapable
  24         # Last peeked token
  25         var _token: nullable Token
  26
  27         # Lexer current state
  28         var _state: Int = 0
  29
  30         # The source file
  31         readable var _file: SourceFile
  32
  33         # Current character in the stream
  34         var _stream_pos: Int = 0
  35
  36         # Current line number in the input stream
  37         var _line: Int = 0
  38
  39         # Current column in the input stream
  40         var _pos: Int = 0
  41
  42         # Was the last character a cariage-return?
  43         var _cr: Bool = false
  44
  45 $ foreach {lexer_data/state}
  46         # Constante state values
  47         private fun state_${translate(@name,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}: Int do return @id end
  48 $ end foreach
  49
  50         # Create a new lexer for a stream (and a name)
  51         init(file: SourceFile)
  52         do
  53                 _file = file
  54         end
  55
  56         # The last peeked token to chain them
  57         private var last_token: nullable Token = null
  58
  59         # Give the next token (but do not consume it)
  60         fun peek: Token
  61         do
  62                 var t = _token
  63                 if t != null then return t
  64
  65                 t = get_token
  66                 while t == null do t = get_token
  67
  68                 var l = last_token
  69                 if l != null then
  70                         l.next_token = t
  71                         t.prev_token = l
  72                 end
  73
  74                 last_token = t
  75                 _token = t
  76                 return t
  77         end
  78
  79         # Give and consume the next token
  80         fun next: Token
  81         do
  82                 var result = peek
  83                 _token = null
  84                 return result
  85         end
  86
  87         # Primitive method to return a token, or return null if it is discarded
  88         # Is used to implement `peek` and `next`
  89         protected fun get_token: nullable Token
  90         do
  91                 var dfa_state = 0
  92
  93                 var sp = _stream_pos
  94                 var start_stream_pos = sp
  95                 var start_pos = _pos
  96                 var start_line = _line
  97                 var string = _file.string
  98                 var string_len = string.length
  99
 100                 var accept_state = -1
 101                 var accept_token = -1
 102                 var accept_length = -1
 103                 var accept_pos = -1
 104                 var accept_line = -1
 105
 106                 loop
 107                         if sp >= string_len then
 108                                 dfa_state = -1
 109                         else
 110                                 var c = string[sp].ascii
 111                                 sp += 1
 112
 113                                 var cr = _cr
 114                                 var line = _line
 115                                 var pos = _pos
 116                                 if c == 10 then
 117                                         if cr then
 118                                                 cr = false
 119                                                 _file.line_starts[line] = sp
 120                                         else
 121                                                 line = line + 1
 122                                                 pos = 0
 123                                                 _file.line_starts[line] = sp
 124                                         end
 125                                 else if c == 13 then
 126                                         line = line + 1
 127                                         pos = 0
 128                                         cr = true
 129                                         _file.line_starts[line] = sp
 130                                 else
 131                                         pos = pos + 1
 132                                         cr = false
 133                                 end
 134
 135                                 loop
 136                                         var old_state = dfa_state
 137                                         if dfa_state < -1 then
 138                                                 old_state = -2 - dfa_state
 139                                         end
 140
 141                                         dfa_state = -1
 142
 143                                         var low = 0
 144                                         var high = lexer_goto(old_state, 0) - 1
 145
 146                                         if high >= 0 then
 147                                                 while low <= high do
 148                                                         var middle = (low + high) / 2
 149                                                         var offset = middle * 3 + 1 # +1 because length is at 0
 150
 151                                                         if c < lexer_goto(old_state, offset) then
 152                                                                 high = middle - 1
 153                                                         else if c > lexer_goto(old_state, offset+1) then
 154                                                                 low = middle + 1
 155                                                         else
 156                                                                 dfa_state = lexer_goto(old_state, offset+2)
 157                                                                 break
 158                                                         end
 159                                                 end
 160                                         end
 161                                         if dfa_state > -2 then break
 162                                 end
 163
 164                                 _cr = cr
 165                                 _line = line
 166                                 _pos = pos
 167                         end
 168
 169                         if dfa_state >= 0 then
 170                                 var tok = lexer_accept(dfa_state)
 171                                 if tok != -1 then
 172                                         accept_state = dfa_state
 173                                         accept_token = tok
 174                                         accept_length = sp - start_stream_pos
 175                                         accept_pos = _pos
 176                                         accept_line = _line
 177                                 end
 178                         else
 179                                 if accept_state != -1 then
 180                                         var location = new Location(_file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
 181                                         _pos = accept_pos
 182                                         _line = accept_line
 183                                         _stream_pos = start_stream_pos + accept_length
 184 $ foreach {//token}
 185                                         if accept_token == ${position()-1} then
 186 $    if {count(transition[@from!=@to])!=0}
 187                                                 var state_id = _state
 188 $        foreach transition in {transition[@from!=@to]}
 189                                                 if state_id == ${/parser/lexer_data/state[@name=$transition/@from]/@id} then
 190                                                         _state = state_${translate(@to,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}
 191                                                 end
 192 $        end
 193 $    end if
 194 $    if {@parser_index}
 195                                                 return new @ename.init_tk(location)
 196 $    else
 197                                                 return null
 198 $    end
 199                                         end
 200 $ end foreach
 201                                 else
 202                                         _stream_pos = sp
 203                                         var location = new Location(_file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
 204                                         if sp > start_stream_pos then
 205                                                 var text = string.substring(start_stream_pos, sp-start_stream_pos)
 206                                                 var token = new PLexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
 207                                                 return token
 208                                         else
 209                                                 var token = new EOF.init_tk(location)
 210                                                 return token
 211                                         end
 212                                 end
 213                         end
 214                 end
 215         end
 216 end
 217
 218 $ end template
 219
 220
 221
 222 $ template make_lexer_table()
 223 $ foreach {lexer_data/goto_table/state}
 224 $     foreach {row}
 225 $         if {count(goto)!=0}
 226 static const int lexer_goto_row${position()}[] = {
 227         ${count(goto)},
 228 $             foreach {goto}
 229         @low, @high, @state[-sep ','-]
 230 $             end foreach
 231 };
 232 $         end
 233 $     end foreach
 234 static const int lexer_goto_row_null[] = {0};
 235 const int* const lexer_goto_table[] = {
 236 $     foreach {row}
 237 $         if {count(goto)!=0}
 238         lexer_goto_row${position()}[-sep ','-]
 239 $         else
 240         lexer_goto_row_null[-sep ','-]
 241 $         end
 242 $     end foreach
 243 };
 244 $ end foreach
 245
 246 $ foreach {lexer_data/accept_table/state}
 247 const int lexer_accept_table[] = {
 248         [-foreach {i}-]${.}[-sep ','-][-end foreach-]
 249 };
 250 $ end foreach
 251
 252 $ end template