Merge: Added contributing guidelines and link from readme
[nit.git] / lib / core / re.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16
17 # Regular expression support for all services based on `Pattern`
18 #
19 # Implemented using libc regular expressions.
20 #
21 # The main entities are `Text::to_re` and `Regex`.
22 module re
23
24 import text
25 import text::flat
26 import gc
27 import error
28
29 in "C Header" `{
30 #include <sys/types.h>
31 #include <regex.h>
32 `}
33
34 # Main extern class to wrap libc regular expression support
35 #
36 # It is recommanded to use the higher level API offered by the class `Regex`,
37 # but it can still be used for advanced purpose or in optimized code.
38 #
39 # To use this class and other `private` entities of this module, use `intrude import core::re`
40 private extern class NativeRegex `{ regex_t* `}
41 # Allocate a new `NativeRegex`, it must then be compiled using `regcomp` before calling `regexec`
42 new malloc `{ return malloc(sizeof(regex_t)); `}
43
44 # Compile the regular expression `regex` into a form that is suitable for subsequent `regexec` searches
45 fun regcomp(regex: NativeString, cflags: Int): Int `{
46 return regcomp(self, regex, cflags);
47 `}
48
49 # Match `string` against the precompiled pattern buffer of `self`, locating matches
50 #
51 # `nmatch` and `pmatch` are used to provide information regarding the location of any matches.
52 # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
53 fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
54 return regexec(self, string, nmatch, pmatch, eflags);
55 `}
56
57 # Match `string` against the precompiled pattern buffer of `self`, do not locate matches
58 #
59 # `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
60 fun regexec_match_only(string: NativeString, eflags: Int): Int `{
61 return regexec(self, string, 0, NULL, eflags);
62 `}
63
64 # Free the memory allocated to the pattern buffer by the compiling process
65 #
66 # Does not free the memory holding `self`, use `free` for this purpose.
67 fun regfree `{ regfree(self); `}
68
69 # Turn the error codes that can be returned by both `regcomp` and `regexec` into error message strings
70 fun regerror(errcode: Int): NativeString `{
71 size_t len = regerror(errcode, self, NULL, 0);
72 char *message = malloc(len);
73 regerror(errcode, self, message, len);
74
75 return message;
76 `}
77
78 # Number of parenthetical subexpressions in this compiled regular expression
79 fun re_nsub: Int `{ return self->re_nsub; `}
80 end
81
82 # Flags for `NativeRegex::regcomp`
83
84 private fun flag_extended: Int `{ return REG_EXTENDED; `}
85 private fun flag_icase: Int `{ return REG_ICASE; `}
86 private fun flag_nosub: Int `{ return REG_NOSUB; `}
87 private fun flag_newline: Int `{ return REG_NEWLINE; `}
88
89 # Flags for `NativeRegex::regexec`
90
91 private fun flag_notbol: Int `{ return REG_NOTBOL; `}
92 private fun flag_noteol: Int `{ return REG_NOTEOL; `}
93
94 # Errors of `NativeRegex::regexec`
95
96 private fun error_nomatch: Int `{ return REG_NOMATCH; `}
97 private fun error_espace: Int `{ return REG_ESPACE; `}
98
99 redef universal Int
100 private fun is_nomatch: Bool `{ return self == REG_NOMATCH; `}
101 end
102
103 # An array of `regmatch_t` or a pointer to one
104 private extern class NativeMatchArray `{ regmatch_t* `}
105 # Allocate a new array of `length` `regmatch_t`
106 new malloc(length: Int) `{ return malloc(length * sizeof(regmatch_t)); `}
107
108 # The offset in string of the beginning of a substring
109 fun rm_so: Int `{ return self->rm_so; `}
110
111 # The offset in string of the end of the substring
112 fun rm_eo: Int `{ return self->rm_eo; `}
113
114 # Get a pointer to the element at `index`, can also be used as a subarray
115 fun [](index: Int): NativeMatchArray `{ return self + index; `}
116 end
117
118 redef extern class NativeString
119 private fun substring_from(index: Int): NativeString `{ return self + index; `}
120 end
121
122 redef class Text
123 # Get a `Regex` instance from `self`
124 fun to_re: Regex do return new Regex(self.to_s)
125 end
126
127 # A regular expression pattern
128 #
129 # Used as a `Pattern` on intances of `Text` to call `has`, `search_all`, `replace`, etc.
130 #
131 # Example:
132 #
133 # var re = "ab+a".to_re
134 # assert "aabbbbaaaaba".search_all(re).join(", ") == "abbbba, aba"
135 # assert "aabbbbaaaaba".has(re)
136 # assert "aabbbbaaaaba".replace(re, "+") == "a+aa+"
137 # assert "aabbbbaaaaba".split(re) == ["a", "aa", ""]
138 class Regex
139 super Finalizable
140 super Pattern
141
142 # The `String` source of this regular expression
143 var string: String is writable
144
145 # Treat the pattern as a POSIX extended regular expression (the default)
146 #
147 # If `false`, it is treated as a POSIX basic regular expression (BRE).
148 #
149 # The extended syntax supports `?`, `+` and `|`. Also, `\` causes the following
150 # character to be used as literal.
151 var extended = true is writable
152
153 # Ignore case when matching letters
154 var ignore_case = false is writable
155
156 # Optimize `self` for `String::has` and `is_in`, but do not support searches
157 #
158 # If `true`, `self` cannont be used with `String::search_all`, `String::replace`
159 # or `String::split`.
160 var optimize_has = false is writable
161
162 # Treat a newline in string as dividing string into multiple lines
163 #
164 # So that `$` can match before the newline and `^` can match after.
165 # Also, don’t permit `.` to match a newline, and don’t permit `[^…]` to match a newline.
166 #
167 # Otherwise, newline acts like any other ordinary character.
168 var newline = false is writable
169
170 # Do not regard the beginning of the specified string as the beginning of a line
171 #
172 # More generally, don’t make any assumptions about what text might precede it.
173 var not_bol = false is writable
174
175 # Do not regard the end of the specified string as the end of a line
176 #
177 # More generally, don’t make any assumptions about what text might follow it.
178 var not_eol = false is writable
179
180 # Cache of the last used compiled regular expression
181 private var native: nullable NativeRegex = null
182
183 # Cache of a single `regmatch_t` to prevent many calls to `malloc`
184 private var native_match: NativeMatchArray is lazy do
185 native_match_is_init = true
186 return new NativeMatchArray.malloc(native.as(not null).re_nsub+1)
187 end
188
189 private var native_match_is_init = false
190
191 # `cflags` of the last successful `compile`
192 private var cflags_cache = 0
193
194 # `string` of the last successful `compile`
195 private var string_cache: nullable String = null
196
197 # Compile the regular expression, if needed
198 #
199 # Return `null` on success and an `Error` otherwise.
200 #
201 # This method is always called by `get_match` and `has_match`, but the user
202 # should call it to check for errors.
203 #
204 # assert "ab".to_re.compile == null
205 # assert "[ab".to_re.compile.message == "Unmatched [ or [^"
206 fun compile: nullable Error
207 do
208 var cflags = 0
209 if extended then cflags |= flag_extended
210 if ignore_case then cflags |= flag_icase
211 if optimize_has then cflags |= flag_nosub
212 if newline then cflags |= flag_newline
213
214 var native = self.native
215 var need_compilation = native == null or cflags != cflags_cache or string != string_cache
216
217 if need_compilation then
218
219 # Initial allocation
220 if native == null then
221 native = new NativeRegex.malloc
222 self.native = native
223 end
224
225 var res = native.regcomp(string.to_cstring, cflags)
226
227 # All is good
228 if res == 0 then
229 # Update the cache
230 self.native = native
231
232 # We store these to know if we need to recompile or not
233 self.cflags_cache = cflags
234 self.string_cache = string
235
236 return null
237 end
238
239 var error_cstr = native.regerror(res)
240
241 # We leave it to the lib to decide how to allocate the string that we keep
242 var error_str = error_cstr.to_s_with_copy
243 error_cstr.free
244
245 return new Error(error_str)
246 end
247
248 return null
249 end
250
251 redef fun finalize
252 do
253 var native = self.native
254 if native != null then
255 native.regfree
256 native.free
257 self.native = null
258
259 if native_match_is_init then
260 self.native_match.free
261 end
262 end
263 end
264
265 private fun gather_eflags: Int
266 do
267 var eflags = 0
268 if not_bol then eflags |= flag_notbol
269 if not_eol then eflags |= flag_noteol
270 return eflags
271 end
272
273 private fun get_error(errcode: Int): String
274 do
275 var native = native
276 assert native != null
277
278 # Error, should be out of memory but we cover any possible error anyway
279 var error_cstr = native.regerror(errcode)
280
281 # We leave it to the lib to decide how to allocate the string that we keep
282 var error_str = error_cstr.to_s_with_copy
283 error_cstr.free
284
285 return error_str
286 end
287
288 # assert "ab".to_re.is_in("abcd")
289 # assert "ab".to_re.is_in("cdab")
290 # assert not "ab".to_re.is_in("acdb")
291 # assert "ab".to_re.is_in("ab")
292 redef fun is_in(text)
293 do
294 var comp_res = compile
295 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
296
297 var native = native
298 assert native != null
299
300 # Actually execute
301 var eflags = gather_eflags
302 var res = native.regexec_match_only(text.to_cstring, eflags)
303
304 # Got a match?
305 if res == 0 then return true
306
307 # Got no match, not an error?
308 if res.is_nomatch then return false
309
310 # Error, should be out of memory but we cover any possible error anyway
311 var error_str = get_error(res)
312 "Regex search failed with: {error_str}\n".output
313 abort
314 end
315
316 # require: not optimize_has
317 #
318 # assert "l".to_re.search_index_in("hello world", 0) == 2
319 # assert "el+o".to_re.search_index_in("hello world", 0) == 1
320 # assert "l+".to_re.search_index_in("hello world", 3) == 3
321 # assert "z".to_re.search_index_in("hello world", 0) == -1
322 redef fun search_index_in(text, from)
323 do
324 assert not optimize_has
325
326 var comp_res = compile
327 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
328
329 var native = native
330 assert native != null
331
332 # Actually execute
333 text = text.to_s
334 var cstr = text.substring_from(from).to_cstring
335 var eflags = gather_eflags
336 var match = self.native_match
337
338 var res = native.regexec(cstr, 1, match, eflags)
339
340 # Found one?
341 if res == 0 then return match.rm_so + from
342
343 # No more match?
344 if res.is_nomatch then return -1
345
346 # Error, should be out of memory but we cover any possible error anyway
347 var error_str = get_error(res)
348 "Regex search failed with: {error_str}\n".output
349 abort
350 end
351
352 # require: not optimize_has
353 #
354 # assert "l".to_re.search_in("hello world", 0).from == 2
355 # assert "el+o".to_re.search_in("hello world", 0).from == 1
356 # assert "l+".to_re.search_in("hello world", 3).from == 3
357 # assert "z".to_re.search_in("hello world", 0) == null
358 # assert "cd(e)".to_re.search_in("abcdef", 2)[1].to_s == "e"
359 redef fun search_in(text, charfrom)
360 do
361 assert not optimize_has
362
363 var comp_res = compile
364 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
365
366 var native = native
367 assert native != null
368
369 # Actually execute
370 var cstr = text.to_cstring
371 var rets = cstr.to_s_with_length(text.bytelen)
372 var bytefrom = cstr.char_to_byte_index_cached(charfrom, 0, 0)
373 var subcstr = cstr.fast_cstring(bytefrom)
374 var eflags = gather_eflags
375 var native_match = self.native_match
376
377 var nsub = native.re_nsub
378 var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
379
380 # Found one?
381 if res == 0 then
382 var bfrom = native_match.rm_so + bytefrom
383 var bto = native_match.rm_eo - 1 + bytefrom
384 var cpos = cstr.byte_to_char_index_cached(bfrom, charfrom, bytefrom)
385 var len = cstr.utf8_length(bfrom, bto - bfrom + 1)
386 var match = new Match(rets, cpos, len)
387 var subs = match.subs
388
389 # Add sub expressions
390 for i in [1 .. nsub] do
391 if native_match[i].rm_so < 0 then
392 subs.add null
393 continue
394 end
395 var sub_bfrom = native_match[i].rm_so + bytefrom
396 var sub_bto = native_match[i].rm_eo - 1 + bytefrom
397 var sub_cpos = cstr.byte_to_char_index_cached(sub_bfrom, cpos, bfrom)
398 var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1)
399 subs.add(new Match(rets, sub_cpos, sub_len))
400 end
401
402 return match
403 end
404
405 # No more match?
406 if res.is_nomatch then return null
407
408 # Error, should be out of memory but we cover any possible error anyway
409 var error_str = get_error(res)
410 "Regex search failed with: {error_str}\n".output
411 abort
412 end
413
414 # require: not optimize_has
415 #
416 # assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab"
417 # assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b"
418 redef fun search_all_in(text)
419 do
420 assert not optimize_has
421
422 var comp_res = compile
423 assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
424
425 var native = native
426 assert native != null
427
428 # Actually execute
429 var cstr = text.to_cstring
430 var subcstr = cstr
431 var rets = cstr.to_s_with_length(text.bytelen)
432 var eflags = gather_eflags
433 var eflags_or_notbol = eflags | flag_notbol
434 var native_match = self.native_match
435 var matches = new Array[Match]
436
437 var nsub = native.re_nsub
438 var res = native.regexec(subcstr, nsub + 1, native_match, eflags)
439 var bytesub = 0
440 var charsub = 0
441 while res == 0 do
442 var bfrom = native_match.rm_so + bytesub
443 var bto = native_match.rm_eo - 1 + bytesub
444 var cstart = cstr.byte_to_char_index_cached(bfrom, charsub, bytesub)
445 var len = cstr.utf8_length(bfrom, bto - bfrom + 1)
446 var match = new Match(rets, cstart, len)
447 matches.add match
448 var subs = match.subs
449
450 # Add sub expressions
451 for i in [1 .. nsub] do
452 if native_match[i].rm_so < 0 then
453 subs.add null
454 continue
455 end
456 var sub_bfrom = native_match[i].rm_so + bytesub
457 var sub_bto = native_match[i].rm_eo - 1 + bytesub
458 var sub_cstart = cstr.byte_to_char_index_cached(sub_bfrom, cstart, bfrom)
459 var sub_len = cstr.utf8_length(sub_bfrom, sub_bto - sub_bfrom + 1)
460 subs.add(new Match(rets, sub_cstart, sub_len))
461 end
462
463 bytesub = bto + 1
464 charsub = cstart + len
465 subcstr = cstr.fast_cstring(bytesub)
466 res = native.regexec(subcstr, nsub + 1, native_match, eflags_or_notbol)
467 end
468
469 # No more match?
470 if res.is_nomatch then return matches
471
472 # Error, should be out of memory but we cover any possible error anyway
473 var error_str = get_error(res)
474 "Regex search failed with: {error_str}\n".output
475 abort
476 end
477
478 redef fun to_s do return "/{string}/"
479 end
480
481 redef class Match
482 # Parenthesized subexpressions in this match
483 #
484 # ~~~
485 # var re = "c (d e+) f".to_re
486 # var match = "a b c d eee f g".search(re)
487 # assert match.subs.length == 1
488 # assert match.subs.first.to_s == "d eee"
489 # ~~~
490 var subs = new Array[nullable Match] is lazy
491
492 # Get the `n`th expression in this match
493 #
494 # `n == 0` returns this match, and a greater `n` returns the corresponding
495 # subexpression.
496 #
497 # Require: `n >= 0 and n <= subs.length`
498 #
499 # ~~~
500 # var re = "c (d e+) f".to_re
501 # var match = "a b c d eee f g".search(re)
502 # assert match[0].to_s == "c d eee f"
503 # assert match[1].to_s == "d eee"
504 # ~~~
505 fun [](n: Int): nullable Match do
506 if n == 0 then return self
507 assert n > 0 and n <= subs.length
508 return subs[n-1]
509 end
510 end