As proposed by W. A. Burkhard and R. M. Keller in "Some approaches to best-match file searching" the BKTree data structure is usefull to speed-up spell checking based on the Levenshtein distance between words.
With a BKTree, the complexity to find mispelled words in a dictionnary is O(log n) instead of O(n) with a dummy list.
Example:
var tree = new BKTree
# Index some words in the dictionnary for spell checking
tree.add("book")
tree.add("books")
tree.add("cake")
tree.add("boo")
tree.add("cape")
tree.add("boon")
tree.add("cook")
tree.add("cart")
# Search suggestions in the BKTree
assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
core :: union_find
union–find algorithm using an efficient disjoint-set data structure
# Implementation of BKTree
#
# As proposed by W. A. Burkhard and R. M. Keller in
# "Some approaches to best-match file searching" the BKTree data structure
# is usefull to speed-up spell checking based on the Levenshtein distance between
# words.
#
# With a BKTree, the complexity to find mispelled words in a dictionnary is *O(log n)*
# instead of *O(n)* with a dummy list.
#
# Example:
#
# ~~~
# # Create a new BKTree
# var tree = new BKTree
#
# # Index some words in the dictionnary for spell checking
# tree.add("book")
# tree.add("books")
# tree.add("cake")
# tree.add("boo")
# tree.add("cape")
# tree.add("boon")
# tree.add("cook")
# tree.add("cart")
#
# # Search suggestions in the BKTree
# assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
# ~~~
#
# See <https://dl.acm.org/citation.cfm?id=362003.362025>.
module bktree
import abstract_tree
# A BKTree implementation
#
# See `add` to insert a new word.
# See `search` to find matches from a `key`.
class BKTree
# Default tolerance used to find matches
#
# Default is `2`.
var default_tolerance = 2 is writable
# Tree root
private var root: nullable BKNode = null
# Add a `key` in the tree
fun add(key: String) do
var root = self.root
if root == null then
self.root = new BKNode(key)
return
end
var node = root
var dist = node.key.levenshtein_distance(key)
while node.has_key(dist) do
if dist == 0 then return
node = node[dist]
dist = node.key.levenshtein_distance(key)
end
node[dist] = new BKNode(key)
end
# Search `key` with a distance of `tolerance` in `self`.
#
# If `tolerance` is null, the use `default_tolerance` instead.
fun search(key: String, tolerance: nullable Int): Array[BKMatch] do
var res = new Array[BKMatch]
var root = self.root
if root != null then
if tolerance == null then tolerance = self.default_tolerance
search_recursive(root, res, key, tolerance)
end
default_comparator.sort(res)
return res
end
private fun search_recursive(node: BKNode, res: Array[BKMatch], key: String, tolerance: Int) do
var dist = node.key.levenshtein_distance(key)
var min = dist - tolerance
var max = dist + tolerance
if dist < tolerance then
res.add new BKMatch(dist, node.key)
end
for odist, child in node do
if odist < min or odist > max then continue
search_recursive(child, res, key, tolerance)
end
end
end
# A node that goes in a BKTree
#
# Each child is mapped from `self` by its distance as an Int.
private class BKNode
super HashMap[Int, BKNode]
# Key stored in `self`
var key: String
redef fun to_s do return "\{{key}\}"
end
# A match in a BKTree
#
# Used to order results returned by `BKTree::search`.
class BKMatch
super Comparable
redef type OTHER: BKMatch
# Distance of `key` from the search query
var distance: Int
# Matched `key`
var key: String
redef fun ==(o) do return o isa BKMatch and distance == o.distance and key == o.key
redef fun <=>(o) do
if distance == o.distance then
return key <=> o.key
end
return distance <=> o.distance
end
redef fun to_s do return "\{{distance}: {key}\}"
end
lib/trees/bktree.nit:15,1--149,3