Implementation of BKTree

As proposed by W. A. Burkhard and R. M. Keller in "Some approaches to best-match file searching" the BKTree data structure is usefull to speed-up spell checking based on the Levenshtein distance between words.

With a BKTree, the complexity to find mispelled words in a dictionnary is O(log n) instead of O(n) with a dummy list.

Example:

var tree = new BKTree

# Index some words in the dictionnary for spell checking
tree.add("book")
tree.add("books")
tree.add("cake")
tree.add("boo")
tree.add("cape")
tree.add("boon")
tree.add("cook")
tree.add("cart")

# Search suggestions in the BKTree
assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"

See https://dl.acm.org/citation.cfm?id=362003.362025.

Introduced classes

class BKMatch

trees :: BKMatch

A match in a BKTree
class BKTree

trees :: BKTree

A BKTree implementation

All class definitions

class BKMatch

trees $ BKMatch

A match in a BKTree
class BKTree

trees $ BKTree

A BKTree implementation
package_diagram trees::bktree bktree trees::abstract_tree abstract_tree trees::bktree->trees::abstract_tree core core trees::abstract_tree->core ...core ... ...core->core a_star-m a_star-m a_star-m->trees::bktree

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module circular_array

core :: circular_array

Efficient data structure to access both end of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module define several collection classes.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handle double linked lists
module math

core :: math

Mathematical operations
module native

core :: native

Native structures for text and bytes
module numeric

core :: numeric

Advanced services for Numeric types
module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O

Parents

module abstract_tree

trees :: abstract_tree

Introduce tree structures abstraction

Children

module a_star-m

a_star-m

# Implementation of BKTree
#
# As proposed by W. A. Burkhard and R. M. Keller in
# "Some approaches to best-match file searching" the BKTree data structure
# is usefull to speed-up spell checking based on the Levenshtein distance between
# words.
#
# With a BKTree, the complexity to find mispelled words in a dictionnary is *O(log n)*
# instead of *O(n)* with a dummy list.
#
# Example:
#
# ~~~
# # Create a new BKTree
# var tree = new BKTree
#
# # Index some words in the dictionnary for spell checking
# tree.add("book")
# tree.add("books")
# tree.add("cake")
# tree.add("boo")
# tree.add("cape")
# tree.add("boon")
# tree.add("cook")
# tree.add("cart")
#
# # Search suggestions in the BKTree
# assert tree.search("caqe").join(", ") == "\{1: cake\}, \{1: cape\}"
# ~~~
#
# See <https://dl.acm.org/citation.cfm?id=362003.362025>.
module bktree

import abstract_tree

# A BKTree implementation
#
# See `add` to insert a new word.
# See `search` to find matches from a `key`.
class BKTree

	# Default tolerance used to find matches
	#
	# Default is `2`.
	var default_tolerance = 2 is writable

	# Tree root
	private var root: nullable BKNode = null

	# Add a `key` in the tree
	fun add(key: String) do
		var root = self.root
		if root == null then
			self.root = new BKNode(key)
			return
		end

		var node = root
		var dist = node.key.levenshtein_distance(key)

		while node.has_key(dist) do
			if dist == 0 then return
			node = node[dist]
			dist = node.key.levenshtein_distance(key)
		end
		node[dist] = new BKNode(key)
	end

	# Search `key` with a distance of `tolerance` in `self`.
	#
	# If `tolerance` is null, the use `default_tolerance` instead.
	fun search(key: String, tolerance: nullable Int): Array[BKMatch] do
		var res = new Array[BKMatch]
		var root = self.root
		if root != null then
			if tolerance == null then tolerance = self.default_tolerance
			search_recursive(root, res, key, tolerance)
		end
		default_comparator.sort(res)
		return res
	end

	private fun search_recursive(node: BKNode, res: Array[BKMatch], key: String, tolerance: Int) do
		var dist = node.key.levenshtein_distance(key)
		var min = dist - tolerance
		var max = dist + tolerance

		if dist < tolerance then
			res.add new BKMatch(dist, node.key)
		end

		for odist, child in node do
			if odist < min or odist > max then continue
			search_recursive(child, res, key, tolerance)
		end
	end
end

# A node that goes in a BKTree
#
# Each child is mapped from `self` by its distance as an Int.
private class BKNode
	super HashMap[Int, BKNode]

	# Key stored in `self`
	var key: String

	redef fun to_s do return "\{{key}\}"
end

# A match in a BKTree
#
# Used to order results returned by `BKTree::search`.
class BKMatch
	super Comparable

	redef type OTHER: BKMatch

	# Distance of `key` from the search query
	var distance: Int

	# Matched `key`
	var key: String

	redef fun ==(o) do return o isa BKMatch and distance == o.distance and key == o.key

	redef fun <=>(o) do
		if distance == o.distance then
			return key <=> o.key
		end
		return distance <=> o.distance
	end

	redef fun to_s do return "\{{distance}: {key}\}"
end
lib/trees/bktree.nit:15,1--149,3