1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Copyright 2005-2008 Jean Privat <jean@pryen.org>
5 # This file is free software, which comes along with NIT. This software is
6 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
7 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
8 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
9 # is kept unaltered, and a notification of the changes is added.
10 # You are allowed to redistribute it and sell it, alone or is a part of
13 # This module is about string search and matching.
14 # It includes some good features
19 # Patterns are string motifs.
21 # Search `self' into `s' from a certain position.
22 # Return the position of the first character of the matching section.
23 # Return -1 if not found.
24 fun search_index_in
(s
: String, from
: Int): Int is abstract
26 # Search `self' into `s' from a certain position.
27 # Return null if not found.
28 fun search_in
(s
: String, from
: Int): nullable Match is abstract
30 # Search all `self' occucences into `s'.
31 fun search_all_in
(s
: String): Array[Match]
33 var res
= new Array[Match] # Result
34 var match
= search_in
(s
, 0)
35 while match
!= null do
37 match
= search_in
(s
, match
.after
)
42 # Split `s' using `self' is separator.
43 fun split_in
(s
: String): Array[Match]
45 var res
= new Array[Match] # Result
47 var match
= search_in
(s
, 0)
48 while match
!= null do
49 # Compute the splited part length
50 var len
= match
.from
- i
51 res
.add
(new Match(s
, i
, len
))
53 match
= search_in
(s
, i
)
56 res
.add
(new Match(s
, i
, s
.length
- i
))
61 # BM_Pattern are precompiled string motif for the Boyer-Moore Fast String Searching Algorithm
62 # (cf. A Fast String Searching Algorithm, with R.S. Boyer.
63 # Communications of the Association for Computing Machinery, 20(10), 1977, pp. 762-772.)
64 # see also http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/index.html
67 redef fun to_s
do return _motif
69 # boyer-moore search gives the position of the first occurence of a pattern starting at position `from'
70 redef fun search_index_in
(s
, from
)
77 while j
< n
- m
+ 1 do
78 var i
= m
- 1 # Cursor in the pattern
79 while i
>= 0 and _motif
[i
] == s
[i
+ j
] do i
-= 1
83 var gs
= _gs
[i
] # Good shift
84 var bc
= bc
(s
[i
+j
]) - m
+ 1 + i
# Bad char
85 # Both are true, do move to the best
93 return -1 # found nothing
96 # boyer-moore search. Return null if not found
97 redef fun search_in
(s
, from
)
99 var to
= search_index_in
(s
, from
)
103 return new Match(s
, to
, _length
)
107 # Compile a new motif
111 _length
= motif
.length
112 _gs
= new Array[Int].with_capacity
(_length
)
113 _bc_table
= new ArrayMap[Char, Int]
121 # length of the motif
124 private fun bc
(e
: Char): Int
126 if _bc_table
.has_key
(e
) then
137 var _bc_table
: Map[Char, Int]
139 private fun compute_bc
145 _bc_table
[x
[i
]] = m
- i
- 1
150 private fun suffixes
: Array[Int]
154 var suff
= new Array[Int].filled_with
(m
, m
)
160 if i
> g
and suff
[i
+m-1-f
] < i
- g
then
161 suff
[i
] = suff
[i
+m-1-f
]
165 while g
>= 0 and x
[g
] == x
[g
+ m
- 1 - f
] do g
-= 1
173 private fun compute_gs
186 if i
== -1 or suff
[i
] == i
+ 1 then
187 while j
< m
- 1 - i
do
188 if _gs
[j
] == m
then _gs
[j
] = m
- 1 - i
196 _gs
[m
- 1 - suff
[i
]] = m
- 1 - i
202 # Matches are a part of a string.
204 # The base string matched
205 readable var _string
: String
207 # The starting position in the string
208 readable var _from
: Int
210 # The length of the mathching part
211 readable var _length
: Int
213 # The position of the first character just after the matching part.
214 # May be out of the base string
215 fun after
: Int do return _from
+ _length
217 # The contents of the mathing part
218 redef fun to_s
do return _string
.substring
(_from
, _length
)
220 # Matches `len' characters of `s' from `f'.
221 init(s
: String, f
: Int, len
: Int)
223 assert positive_length
: len
>= 0
224 assert valid_from
: f
>= 0
225 assert valid_after
: f
+ len
<= s
.length
234 redef fun search_index_in
(s
, from
)
238 if s
[from
] == self then return from
244 redef fun search_in
(s
, from
)
246 var pos
= search_index_in
(s
, from
)
250 return new Match(s
, pos
, 1)
257 redef fun search_index_in
(s
, from
)
260 var stop
= s
.length
- length
+ 1
263 while i
>= 0 and self[i
] == s
[i
+ from
] do i
-= 1
265 if i
< 0 then return from
266 # Not found so try next one
272 redef fun search_in
(s
, from
)
274 var pos
= search_index_in
(s
, from
)
278 return new Match(s
, pos
, length
)
282 # Like `search_from' but from the first chararter.
283 fun search
(p
: Pattern): nullable Match do return p
.search_in
(self, 0)
285 # Search the given pattern into self from a.
286 # The search starts at `from'.
287 # Return null if not found.
288 fun search_from
(p
: Pattern, from
: Int): nullable Match do return p
.search_in
(self, from
)
290 # Search all occurences of p into self.
292 # var a = new Array[Int]
293 # for i in "hello world".searches('o') do
297 fun search_all
(p
: Pattern): Array[Match] do return p
.search_all_in
(self)
299 # Split self using p is separator.
300 # "hello world".split('o') # -> ["hell", " w", "rld"]
301 fun split_with
(p
: Pattern): Array[String]
303 var matches
= p
.split_in
(self)
304 var res
= new Array[String].with_capacity
(matches
.length
)
305 for m
in matches
do res
.add
(m
.to_s
)
309 # Split self using '\n' is separator.
310 # "hello\nworld".split # -> ["hello","world"]
311 fun split
: Array[String] do return split_with
('\n')