lib/trees: introduce BKTree
authorAlexandre Terrasa <alexandre@moz-code.org>
Thu, 14 Jun 2018 17:30:04 +0000 (13:30 -0400)
committerAlexandre Terrasa <alexandre@moz-code.org>
Tue, 19 Jun 2018 15:29:56 +0000 (11:29 -0400)
Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>

lib/trees/bktree.nit [new file with mode: 0644]

diff --git a/lib/trees/bktree.nit b/lib/trees/bktree.nit
new file mode 100644 (file)
index 0000000..b7d0d48
--- /dev/null
@@ -0,0 +1,149 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Implementation of BKTree
+#
+# As proposed by W. A. Burkhard and R. M. Keller in
+# "Some approaches to best-match file searching" the BKTree data structure
+# is usefull to speed-up spell checking based on the Levenshtein distance between
+# words.
+#
+# With a BKTree, the complexity to find mispelled words in a dictionnary is *O(log n)*
+# instead of *O(n)* with a dummy list.
+#
+# Example:
+#
+# ~~~
+# # Create a new BKTree
+# var tree = new BKTree
+#
+# # Index some words in the dictionnary for spell checking
+# tree.add("book")
+# tree.add("books")
+# tree.add("cake")
+# tree.add("boo")
+# tree.add("cape")
+# tree.add("boon")
+# tree.add("cook")
+# tree.add("cart")
+#
+# # Search suggestions in the BKTree
+# assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
+# ~~~
+#
+# See <https://dl.acm.org/citation.cfm?id=362003.362025>.
+module bktree
+
+import abstract_tree
+
+# A BKTree implementation
+#
+# See `add` to insert a new word.
+# See `search` to find matches from a `key`.
+class BKTree
+
+       # Default tolerance used to find matches
+       #
+       # Default is `2`.
+       var default_tolerance = 2 is writable
+
+       # Tree root
+       private var root: nullable BKNode = null
+
+       # Add a `key` in the tree
+       fun add(key: String) do
+               var root = self.root
+               if root == null then
+                       self.root = new BKNode(key)
+                       return
+               end
+
+               var node = root
+               var dist = node.key.levenshtein_distance(key)
+
+               while node.has_key(dist) do
+                       if dist == 0 then return
+                       node = node[dist]
+                       dist = node.key.levenshtein_distance(key)
+               end
+               node[dist] = new BKNode(key)
+       end
+
+       # Search `key` with a distance of `tolerance` in `self`.
+       #
+       # If `tolerance` is null, the use `default_tolerance` instead.
+       fun search(key: String, tolerance: nullable Int): Array[BKMatch] do
+               var res = new Array[BKMatch]
+               var root = self.root
+               if root != null then
+                       if tolerance == null then tolerance = self.default_tolerance
+                       search_recursive(root, res, key, tolerance)
+               end
+               default_comparator.sort(res)
+               return res
+       end
+
+       private fun search_recursive(node: BKNode, res: Array[BKMatch], key: String, tolerance: Int) do
+               var dist = node.key.levenshtein_distance(key)
+               var min = dist - tolerance
+               var max = dist + tolerance
+
+               if dist < tolerance then
+                       res.add new BKMatch(dist, node.key)
+               end
+
+               for odist, child in node do
+                       if odist < min or odist > max then continue
+                       search_recursive(child, res, key, tolerance)
+               end
+       end
+end
+
+# A node that goes in a BKTree
+#
+# Each child is mapped from `self` by its distance as an Int.
+private class BKNode
+       super HashMap[Int, BKNode]
+
+       # Key stored in `self`
+       var key: String
+
+       redef fun to_s do return "\{{key}\}"
+end
+
+# A match in a BKTree
+#
+# Used to order results returned by `BKTree::search`.
+class BKMatch
+       super Comparable
+
+       redef type OTHER: BKMatch
+
+       # Distance of `key` from the search query
+       var distance: Int
+
+       # Matched `key`
+       var key: String
+
+       redef fun ==(o) do return o isa BKMatch and distance == o.distance and key == o.key
+
+       redef fun <=>(o) do
+               if distance == o.distance then
+                       return key <=> o.key
+               end
+               return distance <=> o.distance
+       end
+
+       redef fun to_s do return "\{{distance}: {key}\}"
+end