--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Copyright 2014 Alexis Laferrière <alexis.laf@xymus.net>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Regular expression support for all services based on `Pattern`
+#
+# Implemented using libc regular expressions.
+#
+# The main entities are `Text::to_re` and `Regex`.
+module re
+
+import text
+import gc
+import error
+
+in "C Header" `{
+ #include <sys/types.h>
+ #include <regex.h>
+`}
+
+# Thin extern wrapper around the libc `regex_t` structure
+#
+# It is recommended to use the higher level API offered by the class `Regex`,
+# but this class can still be used for advanced purposes or in optimized code.
+#
+# To use this class and other `private` entities of this module, use `intrude import standard::re`
+private extern class NativeRegex `{ regex_t* `}
+	# Allocate the memory holding a new `NativeRegex`
+	#
+	# The result is not compiled yet: call `regcomp` on it before any `regexec`.
+	new malloc `{
+		return malloc(sizeof(regex_t));
+	`}
+
+	# Compile the pattern `regex` into `self` so it can be used by `regexec`
+	#
+	# `cflags` is a bitwise-or of the `regcomp` flags of this module.
+	# Returns the code given by the C function `regcomp`;
+	# callers in this module treat `0` as success.
+	fun regcomp(regex: NativeString, cflags: Int): Int `{
+		int status = regcomp(self, regex, cflags);
+		return status;
+	`}
+
+	# Search `string` with the compiled pattern of `self` and locate the matches
+	#
+	# At most `nmatch` locations are written to `pmatch`.
+	# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
+	fun regexec(string: NativeString, nmatch: Int, pmatch: NativeMatchArray, eflags: Int): Int `{
+		int status = regexec(self, string, nmatch, pmatch, eflags);
+		return status;
+	`}
+
+	# Search `string` with the compiled pattern of `self` without locating the matches
+	#
+	# `eflags` may be the bitwise-or of one or both of `flag_notbol` and `flag_noteol`.
+	fun regexec_match_only(string: NativeString, eflags: Int): Int `{
+		/* No match buffer is passed: only the return code is of interest */
+		int status = regexec(self, string, 0, NULL, eflags);
+		return status;
+	`}
+
+	# Release the pattern buffer allocated by the compilation
+	#
+	# The memory holding `self` is not released, use `free` for this purpose.
+	fun regfree `{
+		regfree(self);
+	`}
+
+	# Get the message describing `errcode`, as returned by `regcomp` or `regexec`
+	#
+	# The returned string is allocated with the C `malloc`.
+	fun regerror(errcode: Int): NativeString `{
+		/* A first call without buffer only reports the size needed */
+		size_t needed = regerror(errcode, self, NULL, 0);
+		char *buffer = malloc(needed);
+		regerror(errcode, self, buffer, needed);
+
+		return buffer;
+	`}
+
+	# Number of parenthesized subexpressions in this compiled regular expression
+	fun re_nsub: Int `{
+		return self->re_nsub;
+	`}
+end
+
+# Flags for `NativeRegex::regcomp`
+
+# Flag `REG_EXTENDED`: use the POSIX extended regular expression syntax
+private fun flag_extended: Int `{
+	return REG_EXTENDED;
+`}
+
+# Flag `REG_ICASE`: ignore case when matching letters
+private fun flag_icase: Int `{
+	return REG_ICASE;
+`}
+
+# Flag `REG_NOSUB`: do not report the location of matches
+private fun flag_nosub: Int `{
+	return REG_NOSUB;
+`}
+
+# Flag `REG_NEWLINE`: treat a newline as dividing the string into multiple lines
+private fun flag_newline: Int `{
+	return REG_NEWLINE;
+`}
+
+# Flags for `NativeRegex::regexec`
+
+# Flag `REG_NOTBOL`: the beginning of the string is not the beginning of a line
+private fun flag_notbol: Int `{
+	return REG_NOTBOL;
+`}
+
+# Flag `REG_NOTEOL`: the end of the string is not the end of a line
+private fun flag_noteol: Int `{
+	return REG_NOTEOL;
+`}
+
+# Errors of `NativeRegex::regexec`
+
+# Code `REG_NOMATCH`: the search did not find any match
+private fun error_nomatch: Int `{
+	return REG_NOMATCH;
+`}
+
+# Code `REG_ESPACE`: the regex routines ran out of memory
+private fun error_espace: Int `{
+	return REG_ESPACE;
+`}
+
+redef universal Int
+	# Is `self` equal to the C constant `REG_NOMATCH`?
+	private fun is_nomatch: Bool `{
+		return REG_NOMATCH == self;
+	`}
+end
+
+# A C array of `regmatch_t`, or a pointer to one of its elements
+private extern class NativeMatchArray `{ regmatch_t* `}
+	# Allocate a new array of `length` uninitialized `regmatch_t`
+	new malloc(length: Int) `{
+		return malloc(sizeof(regmatch_t) * length);
+	`}
+
+	# Offset in the searched string of the beginning of the substring
+	#
+	# Reads the element pointed to by `self`.
+	fun rm_so: Int `{
+		return self->rm_so;
+	`}
+
+	# Offset in the searched string of the end of the substring
+	#
+	# Reads the element pointed to by `self`.
+	fun rm_eo: Int `{
+		return self->rm_eo;
+	`}
+
+	# Pointer to the element at `index`, which can also be used as a subarray
+	fun [](index: Int): NativeMatchArray `{
+		return &self[index];
+	`}
+end
+
+redef extern class NativeString
+	# Pointer into `self` starting at the offset `index`
+	#
+	# No copy is made: the result shares the memory of `self`.
+	private fun substring_from(index: Int): NativeString `{
+		return &self[index];
+	`}
+end
+
+redef class Text
+	# Build a `Regex` using the content of `self` as the pattern source
+	fun to_re: Regex
+	do
+		var source = to_s
+		return new Regex(source)
+	end
+end
+
+# A regular expression pattern
+#
+# Used as a `Pattern` on instances of `Text` to call `has`, `search_all`, `replace`, etc.
+#
+# Example:
+#
+# ~~~
+# var re = "ab+a".to_re
+# assert "aabbbbaaaaba".search_all(re).join(", ") == "abbbba, aba"
+# assert "aabbbbaaaaba".has(re)
+# assert "aabbbbaaaaba".replace(re, "+") == "a+aa+"
+# assert "aabbbbaaaaba".split(re) == ["a", "aa", ""]
+# ~~~
+class Regex
+	super Finalizable
+	super Pattern
+
+	# The `String` source of this regular expression
+	var string: String is writable
+
+	# Treat the pattern as a POSIX extended regular expression (the default)
+	#
+	# If `false`, it is treated as a POSIX basic regular expression (BRE).
+	#
+	# The extended syntax supports `?`, `+` and `|`. Also, `\` causes the following
+	# character to be used as literal.
+	var extended = true is writable
+
+	# Ignore case when matching letters
+	var ignore_case = false is writable
+
+	# Optimize `self` for `is_in` and `String::has`, but do not support searches
+	#
+	# If `true`, `self` cannot be used with `String::search_all`, `String::replace`
+	# or `String::split`.
+	var optimize_is_in = false is writable
+
+	# Treat a newline in string as dividing string into multiple lines
+	#
+	# So that `$` can match before the newline and `^` can match after.
+	# Also, do not permit `.` to match a newline, and do not permit `[^...]` to match a newline.
+	#
+	# Otherwise, newline acts like any other ordinary character.
+	var newline = false is writable
+
+	# Do not regard the beginning of the specified string as the beginning of a line
+	#
+	# More generally, do not make any assumptions about what text might precede it.
+	var not_bol = false is writable
+
+	# Do not regard the end of the specified string as the end of a line
+	#
+	# More generally, do not make any assumptions about what text might follow it.
+	var not_eol = false is writable
+
+	# Cache of the last used compiled regular expression
+	#
+	# It holds a valid compiled pattern only while `string_cache` is not `null`.
+	private var native: nullable NativeRegex = null
+
+	# Buffer of `regmatch_t` kept between searches to prevent many calls to `malloc`
+	#
+	# It is sized for the pattern of the last successful `compile`
+	# and dropped by `free_native_match` at each new compilation.
+	private var native_match_cache: nullable NativeMatchArray = null
+
+	# Is there a buffer currently allocated by `native_match`?
+	private var native_match_is_init = false
+
+	# Buffer of `re_nsub + 1` `regmatch_t` for the currently compiled pattern
+	#
+	# Allocated on the first call following a compilation, then reused.
+	private fun native_match: NativeMatchArray
+	do
+		var cache = native_match_cache
+		if cache == null then
+			cache = new NativeMatchArray.malloc(native.re_nsub+1)
+			native_match_cache = cache
+			native_match_is_init = true
+		end
+		return cache
+	end
+
+	# Release the buffer of `native_match`, if any
+	private fun free_native_match
+	do
+		var cache = native_match_cache
+		if cache != null then cache.free
+		native_match_cache = null
+		native_match_is_init = false
+	end
+
+	# `cflags` of the last successful `compile`
+	private var cflags_cache = 0
+
+	# `string` of the last successful `compile`
+	#
+	# `null` if there is no valid compiled pattern in `native`.
+	private var string_cache: nullable String = null
+
+	# Compile the regular expression, if needed
+	#
+	# Return `null` on success and an `Error` otherwise.
+	#
+	# This method is always called by the search services of `self`, but the user
+	# should call it to check for errors.
+	#
+	# ~~~
+	# assert "ab".to_re.compile == null
+	# assert "[ab".to_re.compile.message == "Unmatched [ or [^"
+	# ~~~
+	fun compile: nullable Error
+	do
+		var cflags = 0
+		if extended then cflags |= flag_extended
+		if ignore_case then cflags |= flag_icase
+		if optimize_is_in then cflags |= flag_nosub
+		if newline then cflags |= flag_newline
+
+		var native = self.native
+
+		# Nothing to do if the last successful compilation is still up to date.
+		# `string_cache` is `null` when no valid pattern is available.
+		if native != null and cflags == cflags_cache and string == string_cache then return null
+
+		if native == null then
+			# Initial allocation
+			native = new NativeRegex.malloc
+			self.native = native
+		else if string_cache != null then
+			# Release the previous pattern buffer before compiling over it
+			native.regfree
+			string_cache = null
+		end
+
+		# The match buffer was sized for the previous pattern
+		free_native_match
+
+		var res = native.regcomp(string.to_cstring, cflags)
+
+		# All is good
+		if res == 0 then
+			# We store these to know if we need to recompile or not
+			self.cflags_cache = cflags
+			self.string_cache = string
+			return null
+		end
+
+		# `string_cache` stays `null`: the next call will compile again
+		return new Error(get_error(res))
+	end
+
+	# Release all the native resources held by `self`
+	redef fun finalize
+	do
+		free_native_match
+
+		var native = self.native
+		if native == null then return
+
+		# Only a successfully compiled pattern has a buffer to release
+		if string_cache != null then native.regfree
+		native.free
+
+		self.native = null
+		self.string_cache = null
+	end
+
+	# Flags for `NativeRegex::regexec` according to `not_bol` and `not_eol`
+	private fun gather_eflags: Int
+	do
+		var eflags = 0
+		if not_bol then eflags |= flag_notbol
+		if not_eol then eflags |= flag_noteol
+		return eflags
+	end
+
+	# Message associated to `errcode` by the native library
+	#
+	# Require: `native` is allocated
+	private fun get_error(errcode: Int): String
+	do
+		var error_cstr = native.regerror(errcode)
+
+		# Keep our own copy, the native buffer is released right away
+		var error_str = error_cstr.to_s_with_copy
+		error_cstr.free
+
+		return error_str
+	end
+
+	# Does `text` contain a match of `self`?
+	#
+	# ~~~
+	# assert "ab".to_re.is_in("abcd")
+	# assert "ab".to_re.is_in("cdab")
+	# assert not "ab".to_re.is_in("acdb")
+	# assert "ab".to_re.is_in("ab")
+	# ~~~
+	redef fun is_in(text)
+	do
+		var comp_res = compile
+		assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
+
+		# Actually execute
+		var eflags = gather_eflags
+		var res = native.regexec_match_only(text.to_cstring, eflags)
+
+		# Got a match?
+		if res == 0 then return true
+
+		# Got no match, not an error?
+		if res.is_nomatch then return false
+
+		# Error, should be out of memory but we cover any possible error anyway
+		var error_str = get_error(res)
+		"Regex search failed with: {error_str}\n".output
+		abort
+	end
+
+	# Index of the first match of `self` in `text` at or after `from`, or -1
+	#
+	# require: not optimize_is_in
+	#
+	# ~~~
+	# assert "l".to_re.search_index_in("hello world", 0) == 2
+	# assert "el+o".to_re.search_index_in("hello world", 0) == 1
+	# assert "l+".to_re.search_index_in("hello world", 3) == 3
+	# assert "z".to_re.search_index_in("hello world", 0) == -1
+	# ~~~
+	redef fun search_index_in(text, from)
+	do
+		assert not optimize_is_in
+
+		var comp_res = compile
+		assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
+
+		# Actually execute
+		text = text.to_s
+		var cstr = text.substring_from(from).to_cstring
+		var eflags = gather_eflags
+		var match = self.native_match
+
+		var res = native.regexec(cstr, 1, match, eflags)
+
+		# Found one? The offset is relative to `from`
+		if res == 0 then return match.rm_so + from
+
+		# No more match?
+		if res.is_nomatch then return -1
+
+		# Error, should be out of memory but we cover any possible error anyway
+		var error_str = get_error(res)
+		"Regex search failed with: {error_str}\n".output
+		abort
+	end
+
+	# First match of `self` in `text` at or after `from`, or `null`
+	#
+	# The subexpressions of the match are stored in `Match::subs`.
+	#
+	# require: not optimize_is_in
+	#
+	# ~~~
+	# assert "l".to_re.search_in("hello world", 0).from == 2
+	# assert "el+o".to_re.search_in("hello world", 0).from == 1
+	# assert "l+".to_re.search_in("hello world", 3).from == 3
+	# assert "z".to_re.search_in("hello world", 0) == null
+	# assert "l(o)".to_re.search_in("hello world", 2).subs.first.from == 4
+	# ~~~
+	redef fun search_in(text, from)
+	do
+		assert not optimize_is_in
+
+		var comp_res = compile
+		assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
+
+		# Actually execute
+		text = text.to_s
+		var cstr = text.substring_from(from).to_cstring
+		var eflags = gather_eflags
+		var native_match = self.native_match
+
+		var nsub = native.re_nsub
+		var res = native.regexec(cstr, nsub+1, native_match, eflags)
+
+		# Found one?
+		if res == 0 then
+			# All native offsets are relative to `from`
+			var match = new Match(text,
+				from + native_match.rm_so,
+				native_match.rm_eo - native_match.rm_so)
+
+			# Add sub expressions
+			# NOTE(review): the offsets of a subexpression that did not take part
+			# in the match are not handled specially here, confirm what `Match` expects.
+			for i in [1..nsub] do
+				var sub = native_match[i]
+				match.subs.add new Match(text,
+					from + sub.rm_so,
+					sub.rm_eo - sub.rm_so)
+			end
+
+			return match
+		end
+
+		# No more match?
+		if res.is_nomatch then return null
+
+		# Error, should be out of memory but we cover any possible error anyway
+		var error_str = get_error(res)
+		"Regex search failed with: {error_str}\n".output
+		abort
+	end
+
+	# All the non-overlapping matches of `self` in `text`
+	#
+	# The subexpressions of each match are stored in `Match::subs`.
+	#
+	# require: not optimize_is_in
+	#
+	# ~~~
+	# assert "ab".to_re.search_all_in("abbab").join(", ") == "ab, ab"
+	# assert "ab".to_re.search_all_in("ababab").join(", ") == "ab, ab, ab"
+	# assert "b+".to_re.search_all_in("abbabaabbbbbcab").join(", ") == "bb, b, bbbbb, b"
+	# ~~~
+	redef fun search_all_in(text)
+	do
+		assert not optimize_is_in
+
+		var comp_res = compile
+		assert comp_res == null else "Regex compilation failed with: {comp_res.message}\n".output
+
+		# Actually execute
+		text = text.to_s
+		var cstr = text.to_cstring
+		var eflags = gather_eflags
+		var eflags_or_notbol = eflags | flag_notbol
+		var native_match = self.native_match
+		var matches = new Array[Match]
+
+		var nsub = native.re_nsub
+		var res = native.regexec(cstr, nsub+1, native_match, eflags)
+
+		# Offset of `cstr` in `text`, the native offsets are relative to it
+		var d = 0
+		while res == 0 do
+			var so = native_match.rm_so
+			var eo = native_match.rm_eo
+
+			var match = new Match(text, d + so, eo - so)
+			matches.add match
+
+			# Add sub expressions
+			# NOTE(review): the offsets of a subexpression that did not take part
+			# in the match are not handled specially here, confirm what `Match` expects.
+			for i in [1..nsub] do
+				var sub = native_match[i]
+				match.subs.add new Match(text,
+					d + sub.rm_so,
+					sub.rm_eo - sub.rm_so)
+			end
+
+			# Resume the search right after this match
+			var step = eo
+			if eo == so then
+				# An empty match does not consume anything: skip one character
+				# to make progress, unless the end of `text` is reached.
+				# As for the matches, offsets are used as indexes in `text`.
+				if d + eo >= text.length then return matches
+				step += 1
+			end
+
+			d += step
+			cstr = cstr.substring_from(step)
+			res = native.regexec(cstr, nsub+1, native_match, eflags_or_notbol)
+		end
+
+		# No more match?
+		if res.is_nomatch then return matches
+
+		# Error, should be out of memory but we cover any possible error anyway
+		var error_str = get_error(res)
+		"Regex search failed with: {error_str}\n".output
+		abort
+	end
+
+	# The source pattern between slashes, as in `/ab+a/`
+	redef fun to_s do return "/{string}/"
+end
+
+redef class Match
+	# Matches of the parenthesized subexpressions of the pattern
+	#
+	# ~~~
+	# var re = "c (d e+) f".to_re
+	# var match = "a b c d eee f g".search(re)
+	# assert match.subs.length == 1
+	# assert match.subs.first.to_s == "d eee"
+	# ~~~
+	var subs = new Array[Match] is lazy
+
+	# Access the `n`th expression of this match
+	#
+	# With `n == 0` the result is `self`, with a greater `n` it is
+	# the subexpression stored at `n-1` in `subs`.
+	#
+	# Require: `n >= 0 and n <= subs.length`
+	#
+	# ~~~
+	# var re = "c (d e+) f".to_re
+	# var match = "a b c d eee f g".search(re)
+	# assert match[0].to_s == "c d eee f"
+	# assert match[1].to_s == "d eee"
+	# ~~~
+	fun [](n: Int): Match
+	do
+		if n != 0 then
+			assert n > 0 and n <= subs.length
+			var sub_index = n - 1
+			return subs[sub_index]
+		end
+		return self
+	end
+end