From: Jean Privat <jean@pryen.org>
Date: Mon, 13 Aug 2018 19:25:30 +0000 (-0400)
Subject: Merge: src/model/model_index: model index uses BKTree
X-Git-Url: http://nitlanguage.org?hp=72867cacdcf3691f4d133313561f8d229ffbcd6d

Merge: src/model/model_index: model index uses BKTree

This PR makes ModelIndex use a BKTree (#2718) to speed up Nitweb name similarity search queries on large code corpora.

Levenstein names comparison with 300k+ entities took a bit long (up to 10s for Nitweb with `contrib` and `src` for some queries). The use of BKTree speeds things up and brings those queries under 0.1s.


Pull-Request: #2724
Reviewed-by: Jean Privat <jean@pryen.org>
---

diff --git a/lib/core/collection/circular_array.nit b/lib/core/collection/circular_array.nit
index 9419853..1cc757b 100644
--- a/lib/core/collection/circular_array.nit
+++ b/lib/core/collection/circular_array.nit
@@ -19,11 +19,11 @@ import array
 
 # Efficient data structure to access both end of the sequence.
 #
-# A circular array offers efficient random access,
-# efficient manipulation for both ends of the structure (push, pop, ) and
+# A circular array offers efficient random access, efficient manipulation
+# at both ends of the structure (push, pop, shift and unshift) and
 # automatic amortized growth.
 #
-# Therefore it can be used as is or as and efficient queue (FIFO/LIFO)
+# Therefore it can be used as is or as an efficient queue (FIFO/LIFO).
 class CircularArray[E]
 	super Sequence[E]
 
diff --git a/lib/core/collection/union_find.nit b/lib/core/collection/union_find.nit
index 6c74ca2..864c509 100644
--- a/lib/core/collection/union_find.nit
+++ b/lib/core/collection/union_find.nit
@@ -13,7 +13,7 @@ module union_find
 
 import hash_collection
 
-# Data structure to keeps track of elements partitioned into disjoint subsets
+# Data structure to keep track of elements partitioned into disjoint subsets
 #
 #     var s = new DisjointSet[Int]
 #     s.add(1)
@@ -22,15 +22,15 @@ import hash_collection
 #     s.union(1,2)
 #     assert s.in_same_subset(1,2)
 #
-# `in_same_subset` is transitive, reflexive and symetric
+# `in_same_subset` is transitive, reflexive and symmetric
 #
 #     s.add(3)
 #     assert not s.in_same_subset(1,3)
 #     s.union(3,2)
 #     assert s.in_same_subset(1,3)
 #
-# Unkike theorical Disjoint-set data structures, the underling implementation is opaque
-# that makes the traditionnal `find` method unavailable for clients.
+# Unlike theoretical Disjoint-set data structures, the underling implementation is opaque
+# making the traditional `find` method unavailable for clients.
 # The methods `in_same_subset`, `to_partitions`, and their variations are offered instead.
 class DisjointSet[E]
 	super SimpleCollection[E]
@@ -172,7 +172,7 @@ class DisjointSet[E]
 		return to_subpartition(self)
 	end
 
-	# Construct a partitionning on `es`, a subset of elements
+	# Construct a partitioning on `es`, a subset of elements
 	#
 	#     var s = new DisjointSet[Int]
 	#     s.add_all([1,2,3,4,5,6])
@@ -237,7 +237,7 @@ private class DisjointSetNode
 	# If parent == self then the node is a root
 	var parent: DisjointSetNode = self
 
-	# The rank to no desequilibrate the structure.
+	# The rank to keep the structure balanced.
 	# The term rank is used instead of depth since
 	# path compression is used, see `DisjointSet::nfind`
 	var rank = 0