Merge: doc: fixed some typos and other misc. corrections
[nit.git] / src / parser / lexer_work.nit
# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 # Internal algorithm and data structures for the Nit lexer
16 module lexer_work
17
18 intrude import parser_nodes
19 private import tables
20
21 redef class Token
22 private var cached_text: nullable String
23
24 redef fun text
25 do
26 var res = _cached_text
27 if res != null then return res
28 res = location.text
29 _cached_text = res
30 return res
31 end
32
33 redef fun text=(text)
34 do
35 _cached_text = text
36 end
37
38 fun parser_index: Int is abstract
39 end
40
41 redef class EOF
42 init init_tk(loc: Location)
43 do
44 _cached_text = ""
45 _location = loc
46 end
47 end
48
49 redef class AError
50 var message: String
51
52 init init_error(message: String, loc: Location)
53 do
54 init_tk(loc)
55 self.message = message
56 end
57 end
58
59 redef class ALexerError
60 var string: String
61
62 init init_lexer_error(message: String, loc: Location, string: String)
63 do
64 init_error(message, loc)
65 self.string = string
66 end
67 end
68
69 redef class AParserError
70 var token: Token
71
72 init init_parser_error(message: String, loc: Location, token: Token)
73 do
74 init_error(message, loc)
75 self.token = token
76 end
77 end
78
79 # The lexer extract NIT tokens from an input stream.
80 # It is better user with the Parser
81 class Lexer
82 super TablesCapable
83
84 # Last peeked token
85 var token: nullable Token = null
86
87 # Lexer current state
88 private var state: Int = 0
89
90 # The source file
91 var file: SourceFile
92
93 # Current character in the stream
94 var stream_pos: Int = 0
95
96 # Current line number in the input stream
97 var line: Int = 0
98
99 # Current column in the input stream
100 var pos: Int = 0
101
102 # Was the last character a carriage-return?
103 var cr: Bool = false
104
105 # Constante state values
106 private fun state_initial: Int do return 0 end
107
108 # The last peeked token to chain them
109 private var last_token: nullable Token = null
110
111 # Give the next token (but do not consume it)
112 fun peek: Token
113 do
114 var t = _token
115 if t != null then return t
116
117 t = get_token
118 while t == null do t = get_token
119
120 if isset t._location then
121 var l = last_token
122 if l != null then
123 l.next_token = t
124 t.prev_token = l
125 else
126 file.first_token = t
127 end
128 last_token = t
129 end
130
131 _token = t
132 return t
133 end
134
135 # Give and consume the next token
136 fun next: Token
137 do
138 var result = peek
139 _token = null
140 return result
141 end
142
143 # Primitive method to return a token, or return null if it is discarded
144 # Is used to implement `peek` and `next`
145 protected fun get_token: nullable Token
146 do
147 var dfa_state = 0
148
149 var sp = _stream_pos
150 var start_stream_pos = sp
151 var start_pos = _pos
152 var start_line = _line
153 var file = self.file
154 var string = file.string
155 var string_len = string.length
156
157 var accept_state = -1
158 var accept_token = -1
159 var accept_length = -1
160 var accept_pos = -1
161 var accept_line = -1
162
163 loop
164 if sp >= string_len then
165 dfa_state = -1
166 else
167 # Very ugly hack, this is because of the way SableCC generates its tables.
168 # Due to the 0xFFFF limit of a Java char, when a big Nit char is read (i.e.
169 # code point > 65535), it crashes.
170 #
171 # Hence, if a char has a code point <= 255 (ISO8859 range), it is left as is.
172 # Else, it is replaced by 255.
173 # This does not corrupt the lexer and works perfectly on any character.
174 #
175 # TL;DR: Java fucked up, need retarded solution to cope for retarded decision
176 var c = string[sp].code_point
177 if c >= 256 then c = 255
178 sp += 1
179
180 var cr = _cr
181 var line = _line
182 var pos = _pos
183 if c == 10 then
184 if cr then
185 cr = false
186 file.line_starts[line] = sp
187 else
188 line = line + 1
189 pos = 0
190 file.line_starts[line] = sp
191 end
192 else if c == 13 then
193 line = line + 1
194 pos = 0
195 cr = true
196 file.line_starts[line] = sp
197 else
198 pos = pos + 1
199 cr = false
200 end
201
202 loop
203 var old_state = dfa_state
204 if dfa_state < -1 then
205 old_state = -2 - dfa_state
206 end
207
208 dfa_state = -1
209
210 var low = 0
211 var high = lexer_goto(old_state, 0) - 1
212
213 if high >= 0 then
214 while low <= high do
215 var middle = (low + high) / 2
216 var offset = middle * 3 + 1 # +1 because length is at 0
217
218 if c < lexer_goto(old_state, offset) then
219 high = middle - 1
220 else if c > lexer_goto(old_state, offset+1) then
221 low = middle + 1
222 else
223 dfa_state = lexer_goto(old_state, offset+2)
224 break
225 end
226 end
227 end
228 if dfa_state > -2 then break
229 end
230
231 _cr = cr
232 _line = line
233 _pos = pos
234 end
235
236 if dfa_state >= 0 then
237 var tok = lexer_accept(dfa_state)
238 if tok != -1 then
239 accept_state = dfa_state
240 accept_token = tok
241 accept_length = sp - start_stream_pos
242 accept_pos = _pos
243 accept_line = _line
244 end
245 else
246 if accept_state != -1 then
247 _pos = accept_pos
248 _line = accept_line
249 _stream_pos = start_stream_pos + accept_length
250 if accept_token == 0 then
251 # Ignored token (whitespaces)
252 return null
253 end
254 var location = new Location(file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
255 return make_token(accept_token, location)
256 else
257 _stream_pos = sp
258 var location = new Location(file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
259 if sp > start_stream_pos then
260 var text = string.substring(start_stream_pos, sp-start_stream_pos)
261 var token = new ALexerError.init_lexer_error("Syntax Error: unknown token `{text}`.", location, text)
262 file.last_token = token
263 return token
264 else
265 var token = new EOF.init_tk(location)
266 file.last_token = token
267 return token
268 end
269 end
270 end
271 end
272 end
273
274 # Allocate the right Token object for a given identifier
275 protected fun make_token(accept_token: Int, location: Location): Token is abstract
276 end