1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Implementation of BKTree
17 # As proposed by W. A. Burkhard and R. M. Keller in
18 # "Some approaches to best-match file searching" the BKTree data structure
19 # is usefull to speed-up spell checking based on the Levenshtein distance between
22 # With a BKTree, the complexity to find mispelled words in a dictionnary is *O(log n)*
23 # instead of *O(n)* with a dummy list.
28 # # Create a new BKTree
29 # var tree = new BKTree
31 # # Index some words in the dictionnary for spell checking
41 # # Search suggestions in the BKTree
42 # assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
45 # See <https://dl.acm.org/citation.cfm?id=362003.362025>.
50 # A BKTree implementation
52 # See `add` to insert a new word.
53 # See `search` to find matches from a `key`.
56 # Default tolerance used to find matches
59 var default_tolerance
= 2 is writable
62 private var root
: nullable BKNode = null
64 # Add a `key` in the tree
65 fun add
(key
: String) do
68 self.root
= new BKNode(key
)
73 var dist
= node
.key
.levenshtein_distance
(key
)
75 while node
.has_key
(dist
) do
76 if dist
== 0 then return
78 dist
= node
.key
.levenshtein_distance
(key
)
80 node
[dist
] = new BKNode(key
)
83 # Search `key` with a distance of `tolerance` in `self`.
85 # If `tolerance` is null, the use `default_tolerance` instead.
86 fun search
(key
: String, tolerance
: nullable Int): Array[BKMatch] do
87 var res
= new Array[BKMatch]
90 if tolerance
== null then tolerance
= self.default_tolerance
91 search_recursive
(root
, res
, key
, tolerance
)
93 default_comparator
.sort
(res
)
97 private fun search_recursive
(node
: BKNode, res
: Array[BKMatch], key
: String, tolerance
: Int) do
98 var dist
= node
.key
.levenshtein_distance
(key
)
99 var min
= dist
- tolerance
100 var max
= dist
+ tolerance
102 if dist
< tolerance
then
103 res
.add
new BKMatch(dist
, node
.key
)
106 for odist
, child
in node
do
107 if odist
< min
or odist
> max
then continue
108 search_recursive
(child
, res
, key
, tolerance
)
113 # A node that goes in a BKTree
115 # Each child is mapped from `self` by its distance as an Int.
117 super HashMap[Int, BKNode]
119 # Key stored in `self`
122 redef fun to_s
do return "\{{key}\}"
125 # A match in a BKTree
127 # Used to order results returned by `BKTree::search`.
131 redef type OTHER: BKMatch
133 # Distance of `key` from the search query
139 redef fun ==(o
) do return o
isa BKMatch and distance
== o
.distance
and key
== o
.key
142 if distance
== o
.distance
then
145 return distance
<=> o
.distance
148 redef fun to_s
do return "\{{distance}: {key}\}"