lib/trees: introduce BKTree
[nit.git] / lib / trees / bktree.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Implementation of BKTree
16 #
17 # As proposed by W. A. Burkhard and R. M. Keller in
18 # "Some approaches to best-match file searching" the BKTree data structure
19 # is usefull to speed-up spell checking based on the Levenshtein distance between
20 # words.
21 #
22 # With a BKTree, the complexity to find mispelled words in a dictionnary is *O(log n)*
23 # instead of *O(n)* with a dummy list.
24 #
25 # Example:
26 #
27 # ~~~
28 # # Create a new BKTree
29 # var tree = new BKTree
30 #
31 # # Index some words in the dictionnary for spell checking
32 # tree.add("book")
33 # tree.add("books")
34 # tree.add("cake")
35 # tree.add("boo")
36 # tree.add("cape")
37 # tree.add("boon")
38 # tree.add("cook")
39 # tree.add("cart")
40 #
41 # # Search suggestions in the BKTree
42 # assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
43 # ~~~
44 #
45 # See <https://dl.acm.org/citation.cfm?id=362003.362025>.
46 module bktree
47
48 import abstract_tree
49
50 # A BKTree implementation
51 #
52 # See `add` to insert a new word.
53 # See `search` to find matches from a `key`.
54 class BKTree
55
56 # Default tolerance used to find matches
57 #
58 # Default is `2`.
59 var default_tolerance = 2 is writable
60
61 # Tree root
62 private var root: nullable BKNode = null
63
64 # Add a `key` in the tree
65 fun add(key: String) do
66 var root = self.root
67 if root == null then
68 self.root = new BKNode(key)
69 return
70 end
71
72 var node = root
73 var dist = node.key.levenshtein_distance(key)
74
75 while node.has_key(dist) do
76 if dist == 0 then return
77 node = node[dist]
78 dist = node.key.levenshtein_distance(key)
79 end
80 node[dist] = new BKNode(key)
81 end
82
83 # Search `key` with a distance of `tolerance` in `self`.
84 #
85 # If `tolerance` is null, the use `default_tolerance` instead.
86 fun search(key: String, tolerance: nullable Int): Array[BKMatch] do
87 var res = new Array[BKMatch]
88 var root = self.root
89 if root != null then
90 if tolerance == null then tolerance = self.default_tolerance
91 search_recursive(root, res, key, tolerance)
92 end
93 default_comparator.sort(res)
94 return res
95 end
96
97 private fun search_recursive(node: BKNode, res: Array[BKMatch], key: String, tolerance: Int) do
98 var dist = node.key.levenshtein_distance(key)
99 var min = dist - tolerance
100 var max = dist + tolerance
101
102 if dist < tolerance then
103 res.add new BKMatch(dist, node.key)
104 end
105
106 for odist, child in node do
107 if odist < min or odist > max then continue
108 search_recursive(child, res, key, tolerance)
109 end
110 end
111 end
112
113 # A node that goes in a BKTree
114 #
115 # Each child is mapped from `self` by its distance as an Int.
116 private class BKNode
117 super HashMap[Int, BKNode]
118
119 # Key stored in `self`
120 var key: String
121
122 redef fun to_s do return "\{{key}\}"
123 end
124
125 # A match in a BKTree
126 #
127 # Used to order results returned by `BKTree::search`.
128 class BKMatch
129 super Comparable
130
131 redef type OTHER: BKMatch
132
133 # Distance of `key` from the search query
134 var distance: Int
135
136 # Matched `key`
137 var key: String
138
139 redef fun ==(o) do return o isa BKMatch and distance == o.distance and key == o.key
140
141 redef fun <=>(o) do
142 if distance == o.distance then
143 return key <=> o.key
144 end
145 return distance <=> o.distance
146 end
147
148 redef fun to_s do return "\{{distance}: {key}\}"
149 end