From: Jean Privat <jean@pryen.org>
Date: Wed, 20 Jun 2018 18:59:11 +0000 (-0400)
Subject: Merge: lib/trees: introduce BKTree
X-Git-Url: http://nitlanguage.org?hp=2859aa3bea53470ca3e09a20f4b3bce0a8d9859b

Merge: lib/trees: introduce BKTree

This PR introduces a new kind of tree: the BKTree.

This data structure can be used to speed-up the comparison of a string and a collection of strings with Levenshtein distance.

See https://dl.acm.org/citation.cfm?id=362003.362025 for more details.

Pull-Request: #2718
---

diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit
index 4a64f99..ad3d928 100644
--- a/lib/vsm/vsm.nit
+++ b/lib/vsm/vsm.nit
@@ -77,6 +77,17 @@ class Vector
 		return super
 	end
 
+	# Increment value for `obj` term
+	#
+	# If the term isn't already in the vector, the new value is 1.0.
+	fun inc(obj: nullable Object) do
+		if has_key(obj) then
+			self[obj] += 1.0
+		else
+			self[obj] = 1.0
+		end
+	end
+
 	# The norm of the vector.
 	#
 	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
@@ -112,10 +123,18 @@ end
 # Documents can then be matched to query vectors.
 class VSMIndex
 
+	# Kind of documents stored in this index
+	#
+	# Clients can redefine this type to specialize the index.
+	type DOC: Document
+
 	# Documents index
+	var documents = new HashSet[DOC]
+
+	# Inversed index
 	#
-	# TODO use a more efficient representation.
-	var documents = new HashSet[Document]
+	# Link documents to existing terms.
+	var inversed_index = new HashMap[nullable Object, Array[DOC]]
 
 	# Count for all terms in all indexed documents
 	#
@@ -137,12 +156,18 @@ class VSMIndex
 	#
 	# Returns an `IndexMatch` for each indexed document.
 	# Results are ordered by descending similarity.
-	fun match_vector(query: Vector): Array[IndexMatch] do
-		var matches = new Array[IndexMatch]
+	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+		var documents = new HashSet[DOC]
+		for term, count in query do
+			if inversed_index.has_key(term) then
+				documents.add_all inversed_index[term]
+			end
+		end
+		var matches = new Array[IndexMatch[DOC]]
 		for doc in documents do
 			var sim = query.cosine_similarity(doc.tfidf)
 			if sim == 0.0 then continue
-			matches.add new IndexMatch(doc, sim)
+			matches.add new IndexMatch[DOC](doc, sim)
 		end
 		sorter.sort(matches)
 		return matches
@@ -156,13 +181,13 @@ class VSMIndex
 	#
 	# When processing batch documents, use `auto_update = false` to disable
 	# the auto update of the index.
-	fun index_document(doc: Document, auto_update: nullable Bool) do
+	fun index_document(doc: DOC, auto_update: nullable Bool) do
 		for term, count in doc.terms_count do
-			if not terms_doc_count.has_key(term) then
-				terms_doc_count[term] = 1.0
-			else
-				terms_doc_count[term] += 1.0
+			terms_doc_count.inc(term)
+			if not inversed_index.has_key(term) then
+				inversed_index[term] = new Array[DOC]
 			end
+			inversed_index[term].add doc
 		end
 		documents.add doc
 		if auto_update == null or auto_update then update_index
@@ -196,7 +221,7 @@ class StringIndex
 	# Return the Document created.
 	#
 	# See `index_document`.
-	fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+	fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
 		var vector = parse_string(string)
 		var doc = new Document(title, uri, vector)
 		index_document(doc, auto_update)
@@ -206,7 +231,7 @@ class StringIndex
 	# Match the `query` string against all indexed documents
 	#
 	# See `match_vector`.
-	fun match_string(query: String): Array[IndexMatch] do
+	fun match_string(query: String): Array[IndexMatch[DOC]] do
 		var vector = parse_string(query)
 		var doc = new Document("", "", vector)
 		return match_vector(doc.terms_frequency)
@@ -221,12 +246,7 @@ class StringIndex
 		loop
 			var token = reader.read_word
 			if token == "" then break
-
-			if not vector.has_key(token) then
-				vector[token] = 1.0
-			else
-				vector[token] += 1.0
-			end
+			vector.inc(token)
 		end
 		return vector
 	end
@@ -241,7 +261,7 @@ class FileIndex
 	# Return the created document or null if `path` is not accepted by `accept_file`.
 	#
 	# See `index_document`.
-	fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+	fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
 		if not accept_file(path) then return null
 		var vector = parse_file(path)
 		var doc = new Document(path, path, vector)
@@ -347,17 +367,17 @@ class Document
 	# A high weight in tfâidf is reached by a high term frequency
 	# (in the given document) and a low document frequency of the term in the
 	# whole collection of documents
-	var tfidf = new Vector
+	var tfidf: Vector = terms_count is lazy
 
 	redef fun to_s do return "{title}"
 end
 
 # A match to a `request` in an `Index`
-class IndexMatch
+class IndexMatch[DOC: Document]
 	super Comparable
 
 	# Document matching the `request_vector`
-	var document: Document
+	var document: DOC
 
 	# Similarity between the `request` and the `doc`.
 	#
@@ -372,7 +392,7 @@ end
 class IndexMatchSorter
 	super DefaultComparator
 
-	redef type COMPARED: IndexMatch
+	redef type COMPARED: IndexMatch[Document]
 
 	redef fun compare(a, b) do
 		return b.similarity <=> a.similarity
diff --git a/src/indexing/code_index.nit b/src/indexing/code_index.nit
new file mode 100644
index 0000000..765fad0
--- /dev/null
+++ b/src/indexing/code_index.nit
@@ -0,0 +1,204 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An index that contains Nit code
+#
+# Model entities are indexed by their ANode.
+#
+# Vectorization is based on model usage such as:
+# * modules importation
+# * classes spcialization and refinement
+# * methods calls and refinements
+#
+# Example:
+# ~~~nitish
+# # Create the index
+# var index = new CodeIndex(toolcontext)
+# for mentity in mentities do
+#	index.index_mentity(mentity)
+# end
+#
+# # Match a piece of code
+# var matches = index.match_code("print \"Hello, World!\"")
+# for match in matches do
+#	print match
+# end
+# ~~~
+module code_index
+
+import vsm
+import semantize
+import parser_util
+
+# Index for Nit doc
+class CodeIndex
+	super VSMIndex
+
+	redef type DOC: CodeDocument
+
+	# ToolContext used to parse pieces of code
+	var toolcontext: ToolContext
+
+	# Index `mentity`
+	fun index_mentity(mentity: MEntity) do
+		var terms = vectorize_mentity(mentity)
+		var doc = new CodeDocument(mentity, terms)
+		index_document(doc, false)
+	end
+
+	# Match `code` with the index
+	fun match_code(code: String): Array[IndexMatch[DOC]] do
+		var node = parse_code(code)
+		if node == null then return new Array[IndexMatch[DOC]]
+		return match_node(node)
+	end
+
+	# Match `node` with the index
+	fun match_node(node: ANode): Array[IndexMatch[DOC]] do
+		var vector = vectorize_node(node)
+		return match_vector(vector)
+	end
+
+	# Parse a piece of code
+	private fun parse_code(code: String): nullable AModule do
+		# Try to parse code
+		var node = toolcontext.parse_something(code)
+		if not node isa AModule then return null
+
+		# Load code into model
+		var mbuilder = toolcontext.modelbuilder
+		mbuilder.load_rt_module(null, node, "tmp")
+		mbuilder.run_phases
+		return node
+	end
+
+	# Transform `node` in a Vector
+	private fun vectorize_node(node: ANode): Vector do
+		var visitor = new CodeIndexVisitor
+		visitor.enter_visit(node)
+		return visitor.vector
+	end
+
+	# Transform `mentity` in a Vector
+	private fun vectorize_mentity(mentity: MEntity): Vector do
+		var node = toolcontext.modelbuilder.mentity2node(mentity)
+		if node == null then return new Vector
+		return vectorize_node(node)
+	end
+end
+
+# A specific document for mentities code
+class CodeDocument
+	super Document
+	autoinit(mentity, terms_count)
+
+	# MEntity related to this document
+	var mentity: MEntity
+
+	redef var title = mentity.full_name is lazy
+
+	redef var uri = mentity.location.to_s is lazy
+end
+
+# Code index visitor
+#
+# Used to build a VSM Vector from a Nit ANode.
+private class CodeIndexVisitor
+	super Visitor
+
+	var vector = new Vector
+
+	redef fun visit(node) do
+		node.accept_code_index_visitor(self)
+	end
+end
+
+redef class ANode
+	private fun accept_code_index_visitor(v: CodeIndexVisitor) do
+		visit_all(v)
+	end
+end
+
+redef class AStdImport
+	redef fun accept_code_index_visitor(v) do
+		var mmodule = self.mmodule
+		if mmodule != null then
+			v.vector.inc "import#{mmodule.full_name}"
+		end
+	end
+end
+
+redef class AStdClassdef
+	redef fun accept_code_index_visitor(v) do
+		var mclassdef = self.mclassdef
+		if mclassdef != null then
+			if not mclassdef.is_intro then
+				v.vector.inc "redef#{mclassdef.full_name}"
+				v.vector.inc "redef#{mclassdef.mclass.full_name}"
+			end
+		end
+		visit_all(v)
+	end
+end
+
+redef class ASuperPropdef
+	redef fun accept_code_index_visitor(v) do
+		var mtype = self.n_type.mtype
+		if mtype isa MClassType then
+			v.vector.inc "super#{mtype.mclass.intro.full_name}"
+			v.vector.inc "super#{mtype.mclass.full_name}"
+		end
+	end
+end
+
+redef class APropdef
+	redef fun accept_code_index_visitor(v) do
+		var mpropdef = self.mpropdef
+		if mpropdef != null then
+			if not mpropdef.is_intro then
+				v.vector.inc "redef#{mpropdef.mproperty.intro.full_name}"
+				v.vector.inc "redef#{mpropdef.mproperty.full_name}"
+			end
+		end
+		visit_all(v)
+	end
+end
+
+redef class ASendExpr
+	redef fun accept_code_index_visitor(v) do
+		var callsite = self.callsite
+		if callsite != null then
+			var args = callsite.signaturemap.as(not null).map.length
+			v.vector.inc "call#{callsite.mpropdef.full_name}({args})"
+			v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})"
+			v.vector.inc "call#{callsite.mpropdef.mclassdef.full_name}({args})"
+			v.vector.inc "call#{callsite.mpropdef.mclassdef.mclass.full_name}({args})"
+		end
+		visit_all(v)
+	end
+end
+
+redef class ANewExpr
+	redef fun accept_code_index_visitor(v) do
+		var callsite = self.callsite
+		if callsite != null then
+			var args = callsite.signaturemap.as(not null).map.length
+			v.vector.inc "call#{callsite.mpropdef.full_name}({args})"
+			v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})"
+			v.vector.inc "new#{callsite.mpropdef.mclassdef.full_name}({args})"
+			v.vector.inc "new#{callsite.mpropdef.mclassdef.mclass.full_name}({args})"
+		end
+		visit_all(v)
+	end
+end
diff --git a/src/indexing/tests/test_code_index.nit b/src/indexing/tests/test_code_index.nit
new file mode 100644
index 0000000..b2a8f2e
--- /dev/null
+++ b/src/indexing/tests/test_code_index.nit
@@ -0,0 +1,74 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module test_code_index is test
+
+import code_index
+import frontend
+
+class TestCodeIndex
+	test
+
+	# CodeIndex used in tests
+	var test_index: CodeIndex is noinit
+
+	# Initialize test variables
+	#
+	# Must be called before test execution.
+	# FIXME should be before_all
+	fun build_test_env is before do
+		var test_path = "NIT_TESTING_PATH".environ.dirname
+		var test_src = test_path / "../../../tests/test_prog"
+
+		# build model
+		var toolcontext = new ToolContext
+		var model = new Model
+		var modelbuilder = new ModelBuilder(model, toolcontext)
+		var mmodules = modelbuilder.parse_full([test_src])
+		modelbuilder.run_phases
+		toolcontext.run_global_phases(mmodules)
+
+		# create index
+		var index = new CodeIndex(toolcontext)
+		for mmodule in mmodules do
+			index.index_mentity(mmodule)
+		end
+		test_index = index
+		modelbuilder.paths.add test_src
+	end
+
+	fun test_find1 is test do
+		var query = "import game\n"
+		var matches = test_index.match_code(query)
+		assert matches.first.document.mentity.full_name == "test_prog::test_prog"
+	end
+
+	fun test_find2 is test do
+		var query = "import game\nimport rpg\n"
+		var matches = test_index.match_code(query)
+		assert matches.first.document.mentity.full_name == "test_prog::game"
+	end
+
+	fun test_find3 is test do
+		var query = "import game\nclass MyGame\nsuper Game\nredef fun start_game do end\nend\n"
+		var matches = test_index.match_code(query)
+		assert matches.first.document.mentity.full_name == "test_prog::game_examples"
+	end
+
+	fun test_find_error is test do
+		var query = "error"
+		var matches = test_index.match_code(query)
+		assert matches.is_empty
+	end
+end