3e8a246e331829935be17f6eafd6d482ba0eb73b
[nit.git] / src / parser / xss / lexer.xss
1 $ // This file is part of NIT ( http://www.nitlanguage.org ).
2 $ //
3 $ // Copyright 2008 Jean Privat <jean@pryen.org>
4 $ // Based on algorithms developed for ( http://www.sablecc.org/ ).
5 $ //
6 $ // Licensed under the Apache License, Version 2.0 (the "License");
7 $ // you may not use this file except in compliance with the License.
8 $ // You may obtain a copy of the License at
9 $ //
10 $ //     http://www.apache.org/licenses/LICENSE-2.0
11 $ //
12 $ // Unless required by applicable law or agreed to in writing, software
13 $ // distributed under the License is distributed on an "AS IS" BASIS,
14 $ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 $ // See the License for the specific language governing permissions and
16 $ // limitations under the License.
17
$ template make_lexer()

# The lexer extracts NIT tokens from an input stream.
# It is better used with the Parser.
class Lexer
        # Last peeked token (null when no token is waiting to be consumed)
        var _token: nullable Token

        # Lexer current state
        var _state: Int = 0

        # Name of the stream (as given to tokens)
        readable var _filename: String

        # Input stream where characters are read
        var _stream: IStream

        # Pushback buffer to store unread characters.
        # It is used as a stack: the next character to read is at index `_stream_pos`.
        var _stream_buf: Buffer

        # Index of the next character to read in the pushback buffer
        # (-1 when the pushback buffer is empty)
        var _stream_pos: Int

        # Current line number in the input stream
        var _line: Int = 0

        # Current column in the input stream
        var _pos: Int = 0

        # Was the last character a carriage-return?
        var _cr: Bool = false

        # Has the end of the stream been reached?
        var _eof: Bool = false

        # Current working text read from the input stream
        var _text: Buffer

$ foreach {lexer_data/state}
        # Constant state values
        private fun state_${translate(@name,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}: Int do return @id end
$ end foreach

        # Create a new lexer for a stream (and a name)
        init(stream: IStream, fname: String)
        do
                _filename = fname
                _text = new Buffer
                _stream = stream
                # -1 means that the pushback buffer is empty
                _stream_pos = -1
                _stream_buf = new Buffer
                build_goto_table
                build_accept_table
        end

        # Give the next token (but do not consume it)
        fun peek: Token
        do
                # `get_token` returns null for discarded tokens: loop until a real one is found
                while _token == null do
                        _token = get_token
                end
                return _token.as(not null)
        end

        # Give and consume the next token
        fun next: Token
        do
                # Reuse the peeked token if there is one
                var result = _token
                while result == null do
                        result = get_token
                end
                _token = null
                return result.as(not null)
        end

        # Get a token, or null if it is discarded
        #
        # Characters are read and fed to the DFA of the current lexer state
        # until no transition is possible (or the end of the stream is reached).
        # The longest accepted prefix becomes the token; the characters read
        # after it are pushed back and the position is restored.
        # If nothing was accepted, a `PError` is returned when some text was read,
        # and an `EOF` otherwise.
        private fun get_token: nullable Token
        do
                var dfa_state = 0

                var start_pos = _pos
                var start_line = _line

                # Snapshot taken at the last accepting DFA state
                var accept_state = -1
                var accept_token = -1
                var accept_length = -1
                var accept_pos = -1
                var accept_line = -1
                var accept_cr = false

                var goto_table = _goto_table[_state]
                var accept = _accept_table[_state]
                var text = _text
                text.clear

                while true do
                        var c = get_char

                        if c != -1 then
                                # Update the position: CR, LF and CR LF each count as a single new line
                                var cr = _cr
                                var line = _line
                                var pos = _pos
                                if c == 10 then
                                        if cr then
                                                # LF just after a CR: the line was already counted
                                                cr = false
                                        else
                                                line = line + 1
                                                pos = 0
                                        end
                                else if c == 13 then
                                        line = line + 1
                                        pos = 0
                                        cr = true
                                else
                                        pos = pos + 1
                                        cr = false
                                end

                                text.add(c.ascii)

                                # A transition target below -1 encodes another state (-2 - target)
                                # whose row is searched again with the same character.
                                var first_loop = true # aka until
                                while dfa_state < -1 or first_loop do
                                        var old_state = dfa_state
                                        if dfa_state < -1 then
                                                old_state = -2 - dfa_state
                                        end

                                        # -1 means that no transition was found
                                        dfa_state = -1

                                        var tmp0 = goto_table[old_state]
                                        var low = 0
                                        var high = tmp0.length - 1

                                        if high >= 0 then
                                                # Binary search among the [low, high, target] triples of the row
                                                var tmp1 = tmp0.intern_items
                                                while low <= high do
                                                        var middle = (low + high) / 2
                                                        var tmp2 = tmp1[middle].intern_items

                                                        if c < tmp2[0] then
                                                                high = middle - 1
                                                        else if c > tmp2[1] then
                                                                low = middle + 1
                                                        else
                                                                dfa_state = tmp2[2]
                                                                low = high + 1 # aka break
                                                        end
                                                end
                                        end
                                        first_loop = false # aka until
                                end

                                _cr = cr
                                _line = line
                                _pos = pos
                        else
                                # End of stream: no transition is possible
                                dfa_state = -1
                        end

                        if dfa_state >= 0 then
                                if accept[dfa_state] != -1 then
                                        # Accepting state: remember where the candidate token ends
                                        accept_state = dfa_state
                                        accept_token = accept[dfa_state]
                                        accept_length = text.length
                                        accept_pos = _pos
                                        accept_line = _line
                                        accept_cr = _cr
                                end
                        else
                                if accept_state != -1 then
$ // One branch is generated for each token of the grammar.
$ // The look-ahead characters are pushed back, then the position (including
$ // the carriage-return flag) is restored to its value at the end of the token.
$ foreach {//token}
                                        if accept_token == ${position()-1} then
                                                var location = new Location(_filename, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
$    if {not(@text)}
$        if {@parser_index}
                                                var token_text = text.substring(0, accept_length)
                                                var token = new @ename.init_tk(token_text, location)
$        end
$    else
                                                var token = new @ename.init_tk(location)
$    end
                                                push_back(accept_length)
                                                _pos = accept_pos
                                                _line = accept_line
                                                _cr = accept_cr
$    if {count(transition[@from!=@to])!=0}
                                                var state_id = _state
$        foreach transition in {transition[@from!=@to]}
                                                if state_id == ${/parser/lexer_data/state[@name=$transition/@from]/@id} then
                                                        _state = state_${translate(@to,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}
                                                end
$        end
$    end if
$    if {@parser_index}
                                                return token
$    else
                                                return null
$    end
                                        end
$ end foreach
                                else
                                        # Nothing was accepted: either a lexical error or the end of the stream
                                        var location = new Location(_filename, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
                                        if text.length > 0 then
                                                var token = new PError.init_error("Syntax error: unknown token {text}.", location)
                                                return token
                                        else
                                                var token = new EOF(location)
                                                return token
                                        end
                                end
                        end
                end
                return null
        end

        # Read the next character.
        # The character is read from the stream or from the pushback buffer.
        # Return -1 once the end of the stream is reached.
        private fun get_char: Int
        do
                if _eof then
                        return -1
                end

                var result: Int

                var sp = _stream_pos
                if sp >= 0 then
                        # Pop the most recently pushed back character
                        var res = _stream_buf[_stream_pos]
                        _stream_pos = sp - 1
                        result = res.ascii
                else
                        result = _stream.read_char
                end

                if result == -1 then
                        _eof = true
                end

                return result
        end

        # Unread some characters.
        # Unread characters are stored in the pushback buffer.
        #
        # Every character of the working text from index `accept_length` is unread.
        # They are pushed from the last one to the first one, so that `get_char`
        # gives them back in their original order.
        private fun push_back(accept_length: Int)
        do
                var length = _text.length
                var i = length - 1
                while i >= accept_length do
                        # There is something to read again: the end of the stream is no longer pending
                        _eof = false
                        _stream_pos = _stream_pos + 1
                        _stream_buf[_stream_pos] = _text[i]
                        i = i - 1
                end
        end

        # Transition tables, indexed by lexer state then by DFA state.
        # Each row is a list of [low, high, target] triples searched by `get_token`.
        var _goto_table: Array[Array[Array[Array[Int]]]]
        private fun build_goto_table
        do
                _goto_table = once [
$ foreach {lexer_data/goto_table/state}
                        [
$     foreach {row}
$         if {count(goto)!=0}
                                [
$             foreach {goto}
                                        [@low, @high, @state][-sep ','-]
$             end foreach
                                ][-sep ','-]
$         else
                                nil_array[-sep ','-]
$         end
$     foreach {row}
                        ][-sep ','-]
$ end foreach
                ]
        end

        # Shared empty row used for the DFA states without any transition
        private fun nil_array: Array[Array[Int]]
        do
                return once new Array[Array[Int]]
        end

        # Accept tables, indexed by lexer state then by DFA state.
        # An entry is the index of the accepted token, or -1 if the DFA state is not accepting.
        var _accept_table: Array[Array[Int]]
        private fun build_accept_table do
                _accept_table = once [
$ foreach {lexer_data/accept_table/state}
                        [
                                [-foreach {i}-]${.}[-sep ','-][-end foreach-]

                        ][-sep ','-]
$ end foreach
                ]
        end
end

$ end template