Merge: lib/trees: introduce BKTree
authorJean Privat <jean@pryen.org>
Wed, 20 Jun 2018 18:59:11 +0000 (14:59 -0400)
committerJean Privat <jean@pryen.org>
Wed, 20 Jun 2018 18:59:11 +0000 (14:59 -0400)
This PR introduces a new kind of tree: the BKTree.

This data structure can be used to speed-up the comparison of a string and a collection of strings with Levenshtein distance.

See https://dl.acm.org/citation.cfm?id=362003.362025 for more details.

Pull-Request: #2718

lib/vsm/vsm.nit
src/indexing/code_index.nit [new file with mode: 0644]
src/indexing/tests/test_code_index.nit [new file with mode: 0644]

index 4a64f99..ad3d928 100644 (file)
@@ -77,6 +77,17 @@ class Vector
                return super
        end
 
+       # Increment value for `obj` term
+       #
+       # If the term isn't already in the vector, the new value is 1.0.
+       fun inc(obj: nullable Object) do
+               if has_key(obj) then
+                       self[obj] += 1.0
+               else
+                       self[obj] = 1.0
+               end
+       end
+
        # The norm of the vector.
        #
        # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
@@ -112,10 +123,18 @@ end
 # Documents can then be matched to query vectors.
 class VSMIndex
 
+       # Kind of documents stored in this index
+       #
+       # Clients can redefine this type to specialize the index.
+       type DOC: Document
+
        # Documents index
+       var documents = new HashSet[DOC]
+
+       # Inversed index
        #
-       # TODO use a more efficient representation.
-       var documents = new HashSet[Document]
+       # Link documents to existing terms.
+       var inversed_index = new HashMap[nullable Object, Array[DOC]]
 
        # Count for all terms in all indexed documents
        #
@@ -137,12 +156,18 @@ class VSMIndex
        #
        # Returns an `IndexMatch` for each indexed document.
        # Results are ordered by descending similarity.
-       fun match_vector(query: Vector): Array[IndexMatch] do
-               var matches = new Array[IndexMatch]
+       fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+               var documents = new HashSet[DOC]
+               for term, count in query do
+                       if inversed_index.has_key(term) then
+                               documents.add_all inversed_index[term]
+                       end
+               end
+               var matches = new Array[IndexMatch[DOC]]
                for doc in documents do
                        var sim = query.cosine_similarity(doc.tfidf)
                        if sim == 0.0 then continue
-                       matches.add new IndexMatch(doc, sim)
+                       matches.add new IndexMatch[DOC](doc, sim)
                end
                sorter.sort(matches)
                return matches
@@ -156,13 +181,13 @@ class VSMIndex
        #
        # When processing batch documents, use `auto_update = false` to disable
        # the auto update of the index.
-       fun index_document(doc: Document, auto_update: nullable Bool) do
+       fun index_document(doc: DOC, auto_update: nullable Bool) do
                for term, count in doc.terms_count do
-                       if not terms_doc_count.has_key(term) then
-                               terms_doc_count[term] = 1.0
-                       else
-                               terms_doc_count[term] += 1.0
+                       terms_doc_count.inc(term)
+                       if not inversed_index.has_key(term) then
+                               inversed_index[term] = new Array[DOC]
                        end
+                       inversed_index[term].add doc
                end
                documents.add doc
                if auto_update == null or auto_update then update_index
@@ -196,7 +221,7 @@ class StringIndex
        # Return the Document created.
        #
        # See `index_document`.
-       fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+       fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
                var vector = parse_string(string)
                var doc = new Document(title, uri, vector)
                index_document(doc, auto_update)
@@ -206,7 +231,7 @@ class StringIndex
        # Match the `query` string against all indexed documents
        #
        # See `match_vector`.
-       fun match_string(query: String): Array[IndexMatch] do
+       fun match_string(query: String): Array[IndexMatch[DOC]] do
                var vector = parse_string(query)
                var doc = new Document("", "", vector)
                return match_vector(doc.terms_frequency)
@@ -221,12 +246,7 @@ class StringIndex
                loop
                        var token = reader.read_word
                        if token == "" then break
-
-                       if not vector.has_key(token) then
-                               vector[token] = 1.0
-                       else
-                               vector[token] += 1.0
-                       end
+                       vector.inc(token)
                end
                return vector
        end
@@ -241,7 +261,7 @@ class FileIndex
        # Return the created document or null if `path` is not accepted by `accept_file`.
        #
        # See `index_document`.
-       fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+       fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
                if not accept_file(path) then return null
                var vector = parse_file(path)
                var doc = new Document(path, path, vector)
@@ -347,17 +367,17 @@ class Document
        # A high weight in tf–idf is reached by a high term frequency
        # (in the given document) and a low document frequency of the term in the
        # whole collection of documents
-       var tfidf = new Vector
+       var tfidf: Vector = terms_count is lazy
 
        redef fun to_s do return "{title}"
 end
 
 # A match to a `request` in an `Index`
-class IndexMatch
+class IndexMatch[DOC: Document]
        super Comparable
 
        # Document matching the `request_vector`
-       var document: Document
+       var document: DOC
 
        # Similarity between the `request` and the `doc`.
        #
@@ -372,7 +392,7 @@ end
 class IndexMatchSorter
        super DefaultComparator
 
-       redef type COMPARED: IndexMatch
+       redef type COMPARED: IndexMatch[Document]
 
        redef fun compare(a, b) do
                return b.similarity <=> a.similarity
diff --git a/src/indexing/code_index.nit b/src/indexing/code_index.nit
new file mode 100644 (file)
index 0000000..765fad0
--- /dev/null
@@ -0,0 +1,204 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An index that contains Nit code
+#
+# Model entities are indexed by their ANode.
+#
+# Vectorization is based on model usage such as:
+# * modules importation
+# * classes spcialization and refinement
+# * methods calls and refinements
+#
+# Example:
+# ~~~nitish
+# # Create the index
+# var index = new CodeIndex(toolcontext)
+# for mentity in mentities do
+#      index.index_mentity(mentity)
+# end
+#
+# # Match a piece of code
+# var matches = index.match_code("print \"Hello, World!\"")
+# for match in matches do
+#      print match
+# end
+# ~~~
+module code_index
+
+import vsm
+import semantize
+import parser_util
+
+# Index for Nit doc
+class CodeIndex
+       super VSMIndex
+
+       redef type DOC: CodeDocument
+
+       # ToolContext used to parse pieces of code
+       var toolcontext: ToolContext
+
+       # Index `mentity`
+       fun index_mentity(mentity: MEntity) do
+               var terms = vectorize_mentity(mentity)
+               var doc = new CodeDocument(mentity, terms)
+               index_document(doc, false)
+       end
+
+       # Match `code` with the index
+       fun match_code(code: String): Array[IndexMatch[DOC]] do
+               var node = parse_code(code)
+               if node == null then return new Array[IndexMatch[DOC]]
+               return match_node(node)
+       end
+
+       # Match `node` with the index
+       fun match_node(node: ANode): Array[IndexMatch[DOC]] do
+               var vector = vectorize_node(node)
+               return match_vector(vector)
+       end
+
+       # Parse a piece of code
+       private fun parse_code(code: String): nullable AModule do
+               # Try to parse code
+               var node = toolcontext.parse_something(code)
+               if not node isa AModule then return null
+
+               # Load code into model
+               var mbuilder = toolcontext.modelbuilder
+               mbuilder.load_rt_module(null, node, "tmp")
+               mbuilder.run_phases
+               return node
+       end
+
+       # Transform `node` in a Vector
+       private fun vectorize_node(node: ANode): Vector do
+               var visitor = new CodeIndexVisitor
+               visitor.enter_visit(node)
+               return visitor.vector
+       end
+
+       # Transform `mentity` in a Vector
+       private fun vectorize_mentity(mentity: MEntity): Vector do
+               var node = toolcontext.modelbuilder.mentity2node(mentity)
+               if node == null then return new Vector
+               return vectorize_node(node)
+       end
+end
+
+# A specific document for mentities code
+class CodeDocument
+       super Document
+       autoinit(mentity, terms_count)
+
+       # MEntity related to this document
+       var mentity: MEntity
+
+       redef var title = mentity.full_name is lazy
+
+       redef var uri = mentity.location.to_s is lazy
+end
+
+# Code index visitor
+#
+# Used to build a VSM Vector from a Nit ANode.
+private class CodeIndexVisitor
+       super Visitor
+
+       var vector = new Vector
+
+       redef fun visit(node) do
+               node.accept_code_index_visitor(self)
+       end
+end
+
+redef class ANode
+       private fun accept_code_index_visitor(v: CodeIndexVisitor) do
+               visit_all(v)
+       end
+end
+
+redef class AStdImport
+       redef fun accept_code_index_visitor(v) do
+               var mmodule = self.mmodule
+               if mmodule != null then
+                       v.vector.inc "import#{mmodule.full_name}"
+               end
+       end
+end
+
+redef class AStdClassdef
+       redef fun accept_code_index_visitor(v) do
+               var mclassdef = self.mclassdef
+               if mclassdef != null then
+                       if not mclassdef.is_intro then
+                               v.vector.inc "redef#{mclassdef.full_name}"
+                               v.vector.inc "redef#{mclassdef.mclass.full_name}"
+                       end
+               end
+               visit_all(v)
+       end
+end
+
+redef class ASuperPropdef
+       redef fun accept_code_index_visitor(v) do
+               var mtype = self.n_type.mtype
+               if mtype isa MClassType then
+                       v.vector.inc "super#{mtype.mclass.intro.full_name}"
+                       v.vector.inc "super#{mtype.mclass.full_name}"
+               end
+       end
+end
+
+redef class APropdef
+       redef fun accept_code_index_visitor(v) do
+               var mpropdef = self.mpropdef
+               if mpropdef != null then
+                       if not mpropdef.is_intro then
+                               v.vector.inc "redef#{mpropdef.mproperty.intro.full_name}"
+                               v.vector.inc "redef#{mpropdef.mproperty.full_name}"
+                       end
+               end
+               visit_all(v)
+       end
+end
+
+redef class ASendExpr
+       redef fun accept_code_index_visitor(v) do
+               var callsite = self.callsite
+               if callsite != null then
+                       var args = callsite.signaturemap.as(not null).map.length
+                       v.vector.inc "call#{callsite.mpropdef.full_name}({args})"
+                       v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})"
+                       v.vector.inc "call#{callsite.mpropdef.mclassdef.full_name}({args})"
+                       v.vector.inc "call#{callsite.mpropdef.mclassdef.mclass.full_name}({args})"
+               end
+               visit_all(v)
+       end
+end
+
+redef class ANewExpr
+       redef fun accept_code_index_visitor(v) do
+               var callsite = self.callsite
+               if callsite != null then
+                       var args = callsite.signaturemap.as(not null).map.length
+                       v.vector.inc "call#{callsite.mpropdef.full_name}({args})"
+                       v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})"
+                       v.vector.inc "new#{callsite.mpropdef.mclassdef.full_name}({args})"
+                       v.vector.inc "new#{callsite.mpropdef.mclassdef.mclass.full_name}({args})"
+               end
+               visit_all(v)
+       end
+end
diff --git a/src/indexing/tests/test_code_index.nit b/src/indexing/tests/test_code_index.nit
new file mode 100644 (file)
index 0000000..b2a8f2e
--- /dev/null
@@ -0,0 +1,74 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module test_code_index is test
+
+import code_index
+import frontend
+
+class TestCodeIndex
+       test
+
+       # CodeIndex used in tests
+       var test_index: CodeIndex is noinit
+
+       # Initialize test variables
+       #
+       # Must be called before test execution.
+       # FIXME should be before_all
+       fun build_test_env is before do
+               var test_path = "NIT_TESTING_PATH".environ.dirname
+               var test_src = test_path / "../../../tests/test_prog"
+
+               # build model
+               var toolcontext = new ToolContext
+               var model = new Model
+               var modelbuilder = new ModelBuilder(model, toolcontext)
+               var mmodules = modelbuilder.parse_full([test_src])
+               modelbuilder.run_phases
+               toolcontext.run_global_phases(mmodules)
+
+               # create index
+               var index = new CodeIndex(toolcontext)
+               for mmodule in mmodules do
+                       index.index_mentity(mmodule)
+               end
+               test_index = index
+               modelbuilder.paths.add test_src
+       end
+
+       fun test_find1 is test do
+               var query = "import game\n"
+               var matches = test_index.match_code(query)
+               assert matches.first.document.mentity.full_name == "test_prog::test_prog"
+       end
+
+       fun test_find2 is test do
+               var query = "import game\nimport rpg\n"
+               var matches = test_index.match_code(query)
+               assert matches.first.document.mentity.full_name == "test_prog::game"
+       end
+
+       fun test_find3 is test do
+               var query = "import game\nclass MyGame\nsuper Game\nredef fun start_game do end\nend\n"
+               var matches = test_index.match_code(query)
+               assert matches.first.document.mentity.full_name == "test_prog::game_examples"
+       end
+
+       fun test_find_error is test do
+               var query = "error"
+               var matches = test_index.match_code(query)
+               assert matches.is_empty
+       end
+end