f8e98eae12b5176a9ae4190d1e4fd49aaaf11500
[nit.git] / src / parser / xss / lexer.xss
1 $ // This file is part of NIT ( http://www.nitlanguage.org ).
2 $ //
3 $ // Copyright 2008 Jean Privat <jean@pryen.org>
4 $ // Based on algorithms developed for ( http://www.sablecc.org/ ).
5 $ //
6 $ // Licensed under the Apache License, Version 2.0 (the "License");
7 $ // you may not use this file except in compliance with the License.
8 $ // You may obtain a copy of the License at
9 $ //
10 $ //     http://www.apache.org/licenses/LICENSE-2.0
11 $ //
12 $ // Unless required by applicable law or agreed to in writing, software
13 $ // distributed under the License is distributed on an "AS IS" BASIS,
14 $ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 $ // See the License for the specific language governing permissions and
16 $ // limitations under the License.
17
18 $ template make_lexer()
19
20 # The lexer extracts NIT tokens from an input stream.
21 # It is best used with the Parser
22 class Lexer
23         # Last peeked token
24         var _token: nullable Token
25
26         # Lexer current state
27         var _state: Int = 0
28
29         # Name of the stream (as given to tokens)
30         readable var _filename: String
31
32         # Input stream from which characters are read
33         var _stream: IStream
34
35         # Pushback buffer to store unread characters
36         var _stream_buf: Buffer
37
38         # Index of the last character stored in the pushback buffer (-1 when the buffer is empty)
39         var _stream_pos: Int
40
41         # Current line number in the input stream
42         var _line: Int = 0
43
44         # Current column in the input stream
45         var _pos: Int = 0
46
47         # Was the last character a carriage-return?
48         var _cr: Bool = false
49
50         # Has the end of the stream been reached?
51         var _eof: Bool = false
52
53         # Current working text read from the input stream
54         var _text: Buffer
55
56 $ foreach {lexer_data/state}
57         # Constant state value
58         private fun state_${translate(@name,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}: Int do return @id end
59 $ end foreach
60
61         # Create a new lexer reading characters from `stream`; `fname` is the name recorded in token locations
62         init(stream: IStream, fname: String)
63         do
64                 _filename = fname
65                 _text = new Buffer
66                 _stream = stream
67                 _stream_pos = -1 # -1 means that the pushback buffer is empty (get_char only reads it when >= 0)
68                 _stream_buf = new Buffer
69                 build_goto_table
70                 build_accept_table
71         end
72
73         # Give the next token without consuming it: the token stays cached in `_token` until `next` is called
74         fun peek: Token
75         do
76                 while _token == null do # get_token returns null for discarded tokens, so try again
77                         _token = get_token
78                 end
79                 return _token.as(not null)
80         end
81
82         # Give and consume the next token (a token previously cached by `peek` is returned first)
83         fun next: Token
84         do
85                 var result = _token
86                 while result == null do # skip discarded tokens, for which get_token returns null
87                         result = get_token
88                 end
89                 _token = null # drop the cached token so that the following call reads a fresh one
90                 return result.as(not null)
91         end
92
93         # Read the longest token accepted by the DFA of the current lexer state, or return null if that token is discarded
94         private fun get_token: nullable Token
95         do
96                 var dfa_state = 0
97
98                 var start_pos = _pos
99                 var start_line = _line
100
101                 var accept_state = -1 # stays -1 until an accepting DFA state has been reached
102                 var accept_token = -1
103                 var accept_length = -1
104                 var accept_pos = -1
105                 var accept_line = -1
106
107                 var goto_table = _goto_table[_state]
108                 var accept = _accept_table[_state]
109                 var text = _text
110                 text.clear
111
112                 loop
113                         var c = get_char
114
115                         if c != -1 then # -1 is what get_char returns at the end of the stream
116                                 var cr = _cr
117                                 var line = _line
118                                 var pos = _pos
119                                 if c == 10 then # line feed
120                                         if cr then # line feed right after a carriage return: the line was already counted
121                                                 cr = false
122                                         else
123                                                 line = line + 1
124                                                 pos = 0
125                                         end
126                                 else if c == 13 then # carriage return
127                                         line = line + 1
128                                         pos = 0
129                                         cr = true
130                                 else
131                                         pos = pos + 1
132                                         cr = false
133                                 end
134
135                                 text.add(c.ascii)
136
137                                 loop
138                                         var old_state = dfa_state
139                                         if dfa_state < -1 then # a table entry below -1 means: look up the same character again from state -2 - entry
140                                                 old_state = -2 - dfa_state
141                                         end
142
143                                         dfa_state = -1
144
145                                         var tmp0 = goto_table[old_state]
146                                         var low = 0
147                                         var high = tmp0.length - 1
148
149                                         if high >= 0 then
150                                                 var tmp1 = tmp0.intern_items
151                                                 while low <= high do # binary search for the [low, high, next state] triple whose range contains c
152                                                         var middle = (low + high) / 2
153                                                         var tmp2 = tmp1[middle].intern_items
154
155                                                         if c < tmp2[0] then
156                                                                 high = middle - 1
157                                                         else if c > tmp2[1] then
158                                                                 low = middle + 1
159                                                         else
160                                                                 dfa_state = tmp2[2]
161                                                                 break
162                                                         end
163                                                 end
164                                         end
165                                         if dfa_state > -2 then break
166                                 end
167
168                                 _cr = cr
169                                 _line = line
170                                 _pos = pos
171                         else
172                                 dfa_state = -1
173                         end
174
175                         if dfa_state >= 0 then
176                                 if accept[dfa_state] != -1 then # remember the most recent accepting state and where it ended
177                                         accept_state = dfa_state
178                                         accept_token = accept[dfa_state]
179                                         accept_length = text.length
180                                         accept_pos = _pos
181                                         accept_line = _line
182                                 end
183                         else
184                                 if accept_state != -1 then
185                                         var location = new Location(_filename, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
186                                         _pos = accept_pos
187                                         _line = accept_line # NOTE(review): _cr is not restored together with _pos and _line; confirm that this is intended
188                                         push_back(accept_length) # unread the characters that were read past the accepted token
189 $ foreach {//token}
190                                         if accept_token == ${position()-1} then
191 $    if {count(transition[@from!=@to])!=0}
192                                                 var state_id = _state
193 $        foreach transition in {transition[@from!=@to]}
194                                                 if state_id == ${/parser/lexer_data/state[@name=$transition/@from]/@id} then
195                                                         _state = state_${translate(@to,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")}
196                                                 end
197 $        end
198 $    end if
199 $    if {@parser_index}
200 $        if {not(@text)}
201                                                 var token_text = text.substring(0, accept_length)
202                                                 return new @ename.init_tk(token_text, location)
203 $        else
204                                                 return new @ename.init_tk(location)
205 $        end
206 $    else
207                                                 return null
208 $    end
209                                         end
210 $ end foreach
211                                 else
212                                         var location = new Location(_filename, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
213                                         if text.length > 0 then # some characters were read but no token matched them
214                                                 var token = new PError.init_error("Syntax error: unknown token {text}.", location)
215                                                 return token
216                                         else
217                                                 var token = new EOF(location)
218                                                 return token
219                                         end
220                                 end
221                         end
222                         if false then break # FIXME remove once unreach loop exits are in c_src
223                 end
224                 return null # FIXME remove once unreach loop exits are in c_src
225         end
226
227         # Read the next character, or return -1 once the end of the stream has been seen.
228         # The character is read from the pushback buffer when it is not empty, else from the stream.
229         private fun get_char: Int
230         do
231                 if _eof then
232                         return -1
233                 end
234
235                 var result: Int
236
237                 var sp = _stream_pos
238                 if sp >= 0 then
239                         var res = _stream_buf[_stream_pos] # take the most recently pushed back character
240                         _stream_pos = sp - 1
241                         result = res.ascii
242                 else
243                         result = _stream.read_char
244                 end
245
246                 if result == -1 then
247                         _eof = true
248                 end
249
250                 return result
251         end
252
253         # Unread the characters of the working text found at index `accept_length` and after.
254         # Unread characters are stored in the pushback buffer, from which get_char reads them back.
255         private fun push_back(accept_length: Int)
256         do
257                 var length = _text.length
258                 var i = length - 1
259                 while i >= accept_length do # walk backwards so that get_char gives the characters back in their original order
260                         _eof = false # at least one character is available again
261                         _stream_pos = _stream_pos + 1
262                         _stream_buf[_stream_pos] = _text[i]
263                         i = i - 1
264                 end
265         end
266
267         var _goto_table: Array[Array[Array[Array[Int]]]] # indexed by lexer state, then by DFA state; each row is a list of [low, high, next state] triples
268         private fun build_goto_table # fill _goto_table with the rows generated from the lexer data
269         do
270                 _goto_table = once [
271 $ foreach {lexer_data/goto_table/state}
272                         [
273 $     foreach {row}
274 $         if {count(goto)!=0}
275                                 [
276 $             foreach {goto}
277                                         [@low, @high, @state][-sep ','-]
278 $             end foreach
279                                 ][-sep ','-]
280 $         else
281                                 nil_array[-sep ','-]
282 $         end
283 $     end foreach
284                         ][-sep ','-]
285 $ end foreach
286                 ]
287         end
288
289         private fun nil_array: Array[Array[Int]] # empty row used by build_goto_table for DFA states without any transition
290         do
291                 return once new Array[Array[Int]]
292         end
293
294         var _accept_table: Array[Array[Int]] # indexed by lexer state, then by DFA state; holds the index of the accepted token, or -1 if the state is not accepting
295         private fun build_accept_table do # fill _accept_table with the values generated from the lexer data
296                 _accept_table = once [
297 $ foreach {lexer_data/accept_table/state}
298                         [
299                                 [-foreach {i}-]${.}[-sep ','-][-end foreach-]
300
301                         ][-sep ','-]
302 $ end foreach
303                 ]
304         end
305 end
306
307 $ end template