X-Git-Url: http://nitlanguage.org diff --git a/lib/standard/string_search.nit b/lib/standard/string_search.nit deleted file mode 100644 index 437341f..0000000 --- a/lib/standard/string_search.nit +++ /dev/null @@ -1,416 +0,0 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# Copyright 2005-2008 Jean Privat -# -# This file is free software, which comes along with NIT. This software is -# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; -# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. You can modify it is you want, provided this header -# is kept unaltered, and a notification of the changes is added. -# You are allowed to redistribute it and sell it, alone or is a part of -# another product. - -# Basic string search, match and replace. -module string_search - -import string - -# Patterns are abstract string motifs (include `String` and `Char`). -interface Pattern - # Search `self` into `s` from a certain position. - # Return the position of the first character of the matching section. - # Return -1 if not found. - # - # assert 'l'.search_index_in("hello world", 0) == 2 - # assert 'l'.search_index_in("hello world", 3) == 3 - # assert 'z'.search_index_in("hello world", 0) == -1 - # - # This method is usually faster than `search_in` if what is - # required is only the index. - # Note: in most implementation, `search_in` is implemented with this method. - protected fun search_index_in(s: Text, from: Int): Int is abstract - - # Search `self` into `s` from a certain position. - # Return null if not found. - # - # assert 'l'.search_in("hello world", 0).from == 2 - # assert 'l'.search_in("hello world", 3).from == 3 - # assert 'z'.search_in("hello world", 0) == null - # - # If only the index of the first character if required, see `search_index_in`. - # - # Note: Is used by `String::search`, `String::search_from`, and others. - protected fun search_in(s: Text, from: Int): nullable Match is abstract - - # Search all `self` occurrences into `s`. - # - # assert 'l'.search_all_in("hello world").length == 3 - # assert 'z'.search_all_in("hello world").length == 0 - # - # Note: Is used by `String::search_all`. - protected fun search_all_in(s: Text): Array[Match] - do - var res = new Array[Match] # Result - var match = search_in(s, 0) - while match != null do - res.add(match) - match = search_in(s, match.after) - end - return res - end - - # Split `s` using `self` is separator. - # - # Returns an array of matches that are between each occurence of `self`. - # If self is not present, an array with a single match on `s` is retunred. - # - # assert 'l'.split_in("hello world").join("|") == "he||o wor|d" - # assert 'z'.split_in("hello world").join("|") == "hello world" - # - # Note: is used by `String::split` - protected fun split_in(s: Text): Array[Match] - do - var res = new Array[Match] # Result - var i = 0 # Cursor - var match = search_in(s, 0) - while match != null do - # Compute the splited part length - var len = match.from - i - res.add(new Match(s.to_s, i, len)) - i = match.after - match = search_in(s, i) - end - # Add the last part - res.add(new Match(s.to_s, i, s.length - i)) - return res - end - - # Is `self` in `s`? - protected fun is_in(s: Text): Bool do return search_index_in(s, 0) != -1 -end - -# BM_Pattern are pre-compiled string motif for the Boyer-Moore algorithm. -# (cf. A Fast String Searching Algorithm, with R.S. Boyer. Communications -# of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.) -# http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html -# -# var pat = new BM_Pattern("hello") -# assert "I say hello to the world!".search(pat).from == 6 -# assert "I say goodbye to the world!".search(pat) == null -class BM_Pattern - super Pattern - - redef fun to_s do return _motif - - # boyer-moore search gives the position of the first occurrence of a pattern starting at position `from` - redef fun search_index_in(s, from) - do - assert from >= 0 - var n = s.length - var m = _length - - var j = from - while j < n - m + 1 do - var i = m - 1 # Cursor in the pattern - while i >= 0 and _motif.chars[i] == s.chars[i + j] do i -= 1 - if i < 0 then - return j - else - var gs = _gs[i] # Good shift - var bc = bc(s.chars[i+j]) - m + 1 + i # Bad char - # Both are true, do move to the best - if gs > bc then - j += gs - else - j += bc - end - end - end - return -1 # found nothing - end - - # boyer-moore search. Return null if not found - redef fun search_in(s, from) - do - var to = search_index_in(s, from) - if to < 0 then - return null - else - return new Match(s.to_s, to, _length) - end - end - - init - do - _length = _motif.length - _gs = new Array[Int].with_capacity(_length) - compute_gs - compute_bc - end - - # searched motif - private var motif: String - - # length of the motif - private var length: Int is noinit - - private fun bc(e: Char): Int - do - if _bc_table.has_key(e) then - return _bc_table[e] - else - return _length - end - end - - # good shifts - private var gs: Array[Int] is noinit - - # bad characters - private var bc_table = new ArrayMap[Char, Int] - - private fun compute_bc - do - var x = _motif - var m = _length - var i = 0 - while i < m - 1 do - _bc_table[x.chars[i]] = m - i - 1 - i += 1 - end - end - - private fun suffixes: Array[Int] - do - var x = _motif - var m = _length - var suff = new Array[Int].filled_with(m, m) - - var f = 0 - var g = m - 1 - var i = m - 2 - while i >= 0 do - if i > g and suff[i+m-1-f] < i - g then - suff[i] = suff[i+m-1-f] - else - if i < g then g = i - f = i - while g >= 0 and x.chars[g] == x.chars[g + m - 1 - f] do g -= 1 - suff[i] = f - g - end - i -= 1 - end - return suff - end - - private fun compute_gs - do - var m = _length - var suff = suffixes - var i = 0 - while i < m do - _gs[i] = m - i += 1 - end - var j = 0 - i = m - 1 - while i >= -1 do - if i == -1 or suff[i] == i + 1 then - while j < m - 1 - i do - if _gs[j] == m then _gs[j] = m - 1 - i - j += 1 - end - end - i -= 1 - end - i = 0 - while i < m - 1 do - _gs[m - 1 - suff[i]] = m - 1 - i - i += 1 - end - end - - redef fun hash do return _motif.hash - redef fun ==(o) do return o isa BM_Pattern and o._motif == _motif -end - -# Matches are a part of a `Text` found by a `Pattern`. -class Match - # The base string matched - var string: String - - # The starting position in the string - var from: Int - - # The length of the matching part - var length: Int - - # The position of the first character just after the matching part. - # May be out of the base string - fun after: Int do return from + length - - # The contents of the matching part - redef fun to_s do return string.substring(from,length) - - init - do - assert positive_length: length >= 0 - assert valid_from: from >= 0 - assert valid_after: from + length <= string.length - end -end - -redef class Char - super Pattern - - redef fun search_index_in(s, from) - do - var stop = s.length - while from < stop do - if s.chars[from] == self then return from - from += 1 - end - return -1 - end - - redef fun search_in(s, from) - do - var pos = search_index_in(s, from) - if pos < 0 then - return null - else - return new Match(s.to_s, pos, 1) - end - end -end - -redef class Text - super Pattern - - redef fun search_index_in(s, from) - do - assert from >= 0 - var stop = s.length - length + 1 - while from < stop do - var i = length - 1 - while i >= 0 and self.chars[i] == s.chars[i + from] do i -= 1 - # Test if we found - if i < 0 then return from - # Not found so try next one - from += 1 - end - return -1 - end - - redef fun search_in(s, from) - do - var pos = search_index_in(s, from) - if pos < 0 then - return null - else - return new Match(s.to_s, pos, length) - end - end - - # Search the first occurence of the pattern `p`. - # Return null if not found. - # - # assert "I say hello to the world!".search("hello").from == 6 - # assert "I say goodbye to the world!".search("hello") == null - fun search(p: Pattern): nullable Match do return p.search_in(self, 0) - - # Search the first occurence of the pattern `p` after `from`. - # The search starts at `from`. - # Return null if not found. - # - # assert "I say hello to the world!".search_from("hello",4).from == 6 - # assert "I say hello to the world!".search_from("hello",7) == null - fun search_from(p: Pattern, from: Int): nullable Match do return p.search_in(self, from) - - # Search the last occurence of the text `t`. - # - # assert "bob".search_last("b").from == 2 - # assert "bob".search_last("bo").from == 0 - # assert "bob".search_last("ob").from == 1 - # assert "bobob".search_last("ob").from == 3 - # assert "bobbob".search_last("bb").from == 2 - # assert "bobbob".search_last("bob").from == 3 - # assert "bob".search_last("z") == null - # assert "".search_last("b") == null - fun search_last(t: Text): nullable Match do - return search_last_up_to(t, length) - end - - # Search the last occurence of the text `t` before `up_to`. - # - # assert "bobbob".search_last_up_to("b", 3).from == 2 - # assert "bobbob".search_last_up_to("b", 6).from == 5 - # assert "bobbob".search_last_up_to("b", 0) == null - fun search_last_up_to(t: Text, up_to: Int): nullable Match do - var i = up_to - t.length - - while i >= 0 do - if substring(i, t.length) == t then - return new Match(self.to_s, i, t.length) - end - i -= 1 - end - return null - end - - # Search all occurrences of p into self. - # - # var a = new Array[Int] - # for i in "hello world".search_all('o') do - # a.add(i.from) - # end - # assert a == [4, 7] - fun search_all(p: Pattern): Array[Match] do return p.search_all_in(self) - - # Split `self` using `p` as separator. - # - # assert "hello world".split('o') == ["hell", " w", "rld"] - fun split(p: Pattern): Array[String] - do - var matches = p.split_in(self) - var res = new Array[String].with_capacity(matches.length) - for m in matches do res.add(m.to_s) - return res - end - - # @deprecated alias for `split` - fun split_with(p: Pattern): Array[String] do return self.split(p) - - # Split `self` on the first `=` - # - # assert "hello".split_once_on('l') == ["he", "lo"] - # assert "a, b, c, d, e".split_once_on(", ") == ["a", "b, c, d, e"] - fun split_once_on(p: Pattern): Array[SELFTYPE] - do - var m = p.search_in(self, 0) - var res = new Array[SELFTYPE] - if m == null then - res.add self - else - res.add substring(0, m.from) - res.add substring_from(m.after) - end - return res - end - - # Replace all occurences of a pattern with a string - # - # assert "hlelo".replace("le", "el") == "hello" - # assert "hello".replace('l', "") == "heo" - fun replace(p: Pattern, string: SELFTYPE): String - do - return self.split_with(p).join(string) - end - - # Does `self` contains at least one instance of `pattern`? - # - # assert "hello".has('l') - # assert "hello".has("ll") - # assert not "hello".has("lll") - fun has(pattern: Pattern): Bool do return pattern.is_in(self) -end