From: Jean Privat Date: Wed, 20 Jun 2018 18:59:11 +0000 (-0400) Subject: Merge: lib/trees: introduce BKTree X-Git-Url: http://nitlanguage.org?hp=2859aa3bea53470ca3e09a20f4b3bce0a8d9859b Merge: lib/trees: introduce BKTree This PR introduces a new kind of tree: the BKTree. This data structure can be used to speed-up the comparison of a string and a collection of strings with Levenshtein distance. See https://dl.acm.org/citation.cfm?id=362003.362025 for more details. Pull-Request: #2718 --- diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit index 4a64f99..ad3d928 100644 --- a/lib/vsm/vsm.nit +++ b/lib/vsm/vsm.nit @@ -77,6 +77,17 @@ class Vector return super end + # Increment value for `obj` term + # + # If the term isn't already in the vector, the new value is 1.0. + fun inc(obj: nullable Object) do + if has_key(obj) then + self[obj] += 1.0 + else + self[obj] = 1.0 + end + end + # The norm of the vector. # # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt` @@ -112,10 +123,18 @@ end # Documents can then be matched to query vectors. class VSMIndex + # Kind of documents stored in this index + # + # Clients can redefine this type to specialize the index. + type DOC: Document + # Documents index + var documents = new HashSet[DOC] + + # Inversed index # - # TODO use a more efficient representation. - var documents = new HashSet[Document] + # Link documents to existing terms. + var inversed_index = new HashMap[nullable Object, Array[DOC]] # Count for all terms in all indexed documents # @@ -137,12 +156,18 @@ class VSMIndex # # Returns an `IndexMatch` for each indexed document. # Results are ordered by descending similarity. - fun match_vector(query: Vector): Array[IndexMatch] do - var matches = new Array[IndexMatch] + fun match_vector(query: Vector): Array[IndexMatch[DOC]] do + var documents = new HashSet[DOC] + for term, count in query do + if inversed_index.has_key(term) then + documents.add_all inversed_index[term] + end + end + var matches = new Array[IndexMatch[DOC]] for doc in documents do var sim = query.cosine_similarity(doc.tfidf) if sim == 0.0 then continue - matches.add new IndexMatch(doc, sim) + matches.add new IndexMatch[DOC](doc, sim) end sorter.sort(matches) return matches @@ -156,13 +181,13 @@ class VSMIndex # # When processing batch documents, use `auto_update = false` to disable # the auto update of the index. - fun index_document(doc: Document, auto_update: nullable Bool) do + fun index_document(doc: DOC, auto_update: nullable Bool) do for term, count in doc.terms_count do - if not terms_doc_count.has_key(term) then - terms_doc_count[term] = 1.0 - else - terms_doc_count[term] += 1.0 + terms_doc_count.inc(term) + if not inversed_index.has_key(term) then + inversed_index[term] = new Array[DOC] end + inversed_index[term].add doc end documents.add doc if auto_update == null or auto_update then update_index @@ -196,7 +221,7 @@ class StringIndex # Return the Document created. # # See `index_document`. - fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do + fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do var vector = parse_string(string) var doc = new Document(title, uri, vector) index_document(doc, auto_update) @@ -206,7 +231,7 @@ class StringIndex # Match the `query` string against all indexed documents # # See `match_vector`. - fun match_string(query: String): Array[IndexMatch] do + fun match_string(query: String): Array[IndexMatch[DOC]] do var vector = parse_string(query) var doc = new Document("", "", vector) return match_vector(doc.terms_frequency) @@ -221,12 +246,7 @@ class StringIndex loop var token = reader.read_word if token == "" then break - - if not vector.has_key(token) then - vector[token] = 1.0 - else - vector[token] += 1.0 - end + vector.inc(token) end return vector end @@ -241,7 +261,7 @@ class FileIndex # Return the created document or null if `path` is not accepted by `accept_file`. # # See `index_document`. - fun index_file(path: String, auto_update: nullable Bool): nullable Document do + fun index_file(path: String, auto_update: nullable Bool): nullable DOC do if not accept_file(path) then return null var vector = parse_file(path) var doc = new Document(path, path, vector) @@ -347,17 +367,17 @@ class Document # A high weight in tf–idf is reached by a high term frequency # (in the given document) and a low document frequency of the term in the # whole collection of documents - var tfidf = new Vector + var tfidf: Vector = terms_count is lazy redef fun to_s do return "{title}" end # A match to a `request` in an `Index` -class IndexMatch +class IndexMatch[DOC: Document] super Comparable # Document matching the `request_vector` - var document: Document + var document: DOC # Similarity between the `request` and the `doc`. # @@ -372,7 +392,7 @@ end class IndexMatchSorter super DefaultComparator - redef type COMPARED: IndexMatch + redef type COMPARED: IndexMatch[Document] redef fun compare(a, b) do return b.similarity <=> a.similarity diff --git a/src/indexing/code_index.nit b/src/indexing/code_index.nit new file mode 100644 index 0000000..765fad0 --- /dev/null +++ b/src/indexing/code_index.nit @@ -0,0 +1,204 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# An index that contains Nit code +# +# Model entities are indexed by their ANode. +# +# Vectorization is based on model usage such as: +# * modules importation +# * classes spcialization and refinement +# * methods calls and refinements +# +# Example: +# ~~~nitish +# # Create the index +# var index = new CodeIndex(toolcontext) +# for mentity in mentities do +# index.index_mentity(mentity) +# end +# +# # Match a piece of code +# var matches = index.match_code("print \"Hello, World!\"") +# for match in matches do +# print match +# end +# ~~~ +module code_index + +import vsm +import semantize +import parser_util + +# Index for Nit doc +class CodeIndex + super VSMIndex + + redef type DOC: CodeDocument + + # ToolContext used to parse pieces of code + var toolcontext: ToolContext + + # Index `mentity` + fun index_mentity(mentity: MEntity) do + var terms = vectorize_mentity(mentity) + var doc = new CodeDocument(mentity, terms) + index_document(doc, false) + end + + # Match `code` with the index + fun match_code(code: String): Array[IndexMatch[DOC]] do + var node = parse_code(code) + if node == null then return new Array[IndexMatch[DOC]] + return match_node(node) + end + + # Match `node` with the index + fun match_node(node: ANode): Array[IndexMatch[DOC]] do + var vector = vectorize_node(node) + return match_vector(vector) + end + + # Parse a piece of code + private fun parse_code(code: String): nullable AModule do + # Try to parse code + var node = toolcontext.parse_something(code) + if not node isa AModule then return null + + # Load code into model + var mbuilder = toolcontext.modelbuilder + mbuilder.load_rt_module(null, node, "tmp") + mbuilder.run_phases + return node + end + + # Transform `node` in a Vector + private fun vectorize_node(node: ANode): Vector do + var visitor = new CodeIndexVisitor + visitor.enter_visit(node) + return visitor.vector + end + + # Transform `mentity` in a Vector + private fun vectorize_mentity(mentity: MEntity): Vector do + var node = toolcontext.modelbuilder.mentity2node(mentity) + if node == null then return new Vector + return vectorize_node(node) + end +end + +# A specific document for mentities code +class CodeDocument + super Document + autoinit(mentity, terms_count) + + # MEntity related to this document + var mentity: MEntity + + redef var title = mentity.full_name is lazy + + redef var uri = mentity.location.to_s is lazy +end + +# Code index visitor +# +# Used to build a VSM Vector from a Nit ANode. +private class CodeIndexVisitor + super Visitor + + var vector = new Vector + + redef fun visit(node) do + node.accept_code_index_visitor(self) + end +end + +redef class ANode + private fun accept_code_index_visitor(v: CodeIndexVisitor) do + visit_all(v) + end +end + +redef class AStdImport + redef fun accept_code_index_visitor(v) do + var mmodule = self.mmodule + if mmodule != null then + v.vector.inc "import#{mmodule.full_name}" + end + end +end + +redef class AStdClassdef + redef fun accept_code_index_visitor(v) do + var mclassdef = self.mclassdef + if mclassdef != null then + if not mclassdef.is_intro then + v.vector.inc "redef#{mclassdef.full_name}" + v.vector.inc "redef#{mclassdef.mclass.full_name}" + end + end + visit_all(v) + end +end + +redef class ASuperPropdef + redef fun accept_code_index_visitor(v) do + var mtype = self.n_type.mtype + if mtype isa MClassType then + v.vector.inc "super#{mtype.mclass.intro.full_name}" + v.vector.inc "super#{mtype.mclass.full_name}" + end + end +end + +redef class APropdef + redef fun accept_code_index_visitor(v) do + var mpropdef = self.mpropdef + if mpropdef != null then + if not mpropdef.is_intro then + v.vector.inc "redef#{mpropdef.mproperty.intro.full_name}" + v.vector.inc "redef#{mpropdef.mproperty.full_name}" + end + end + visit_all(v) + end +end + +redef class ASendExpr + redef fun accept_code_index_visitor(v) do + var callsite = self.callsite + if callsite != null then + var args = callsite.signaturemap.as(not null).map.length + v.vector.inc "call#{callsite.mpropdef.full_name}({args})" + v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})" + v.vector.inc "call#{callsite.mpropdef.mclassdef.full_name}({args})" + v.vector.inc "call#{callsite.mpropdef.mclassdef.mclass.full_name}({args})" + end + visit_all(v) + end +end + +redef class ANewExpr + redef fun accept_code_index_visitor(v) do + var callsite = self.callsite + if callsite != null then + var args = callsite.signaturemap.as(not null).map.length + v.vector.inc "call#{callsite.mpropdef.full_name}({args})" + v.vector.inc "call#{callsite.mpropdef.mproperty.full_name}({args})" + v.vector.inc "new#{callsite.mpropdef.mclassdef.full_name}({args})" + v.vector.inc "new#{callsite.mpropdef.mclassdef.mclass.full_name}({args})" + end + visit_all(v) + end +end diff --git a/src/indexing/tests/test_code_index.nit b/src/indexing/tests/test_code_index.nit new file mode 100644 index 0000000..b2a8f2e --- /dev/null +++ b/src/indexing/tests/test_code_index.nit @@ -0,0 +1,74 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +module test_code_index is test + +import code_index +import frontend + +class TestCodeIndex + test + + # CodeIndex used in tests + var test_index: CodeIndex is noinit + + # Initialize test variables + # + # Must be called before test execution. + # FIXME should be before_all + fun build_test_env is before do + var test_path = "NIT_TESTING_PATH".environ.dirname + var test_src = test_path / "../../../tests/test_prog" + + # build model + var toolcontext = new ToolContext + var model = new Model + var modelbuilder = new ModelBuilder(model, toolcontext) + var mmodules = modelbuilder.parse_full([test_src]) + modelbuilder.run_phases + toolcontext.run_global_phases(mmodules) + + # create index + var index = new CodeIndex(toolcontext) + for mmodule in mmodules do + index.index_mentity(mmodule) + end + test_index = index + modelbuilder.paths.add test_src + end + + fun test_find1 is test do + var query = "import game\n" + var matches = test_index.match_code(query) + assert matches.first.document.mentity.full_name == "test_prog::test_prog" + end + + fun test_find2 is test do + var query = "import game\nimport rpg\n" + var matches = test_index.match_code(query) + assert matches.first.document.mentity.full_name == "test_prog::game" + end + + fun test_find3 is test do + var query = "import game\nclass MyGame\nsuper Game\nredef fun start_game do end\nend\n" + var matches = test_index.match_code(query) + assert matches.first.document.mentity.full_name == "test_prog::game_examples" + end + + fun test_find_error is test do + var query = "error" + var matches = test_index.match_code(query) + assert matches.is_empty + end +end