From b5223fcf73ae72c3f5937b06c6d0991c99351def Mon Sep 17 00:00:00 2001 From: Alexandre Terrasa Date: Fri, 22 Sep 2017 16:37:19 -0400 Subject: [PATCH] lib/vsm: introduce an indexing process based on VSM Signed-off-by: Alexandre Terrasa --- lib/vsm/examples/example_vsm.nit | 65 +++++++++ lib/vsm/vsm.nit | 272 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+) create mode 100644 lib/vsm/examples/example_vsm.nit diff --git a/lib/vsm/examples/example_vsm.nit b/lib/vsm/examples/example_vsm.nit new file mode 100644 index 0000000..5f1192b --- /dev/null +++ b/lib/vsm/examples/example_vsm.nit @@ -0,0 +1,65 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example using a `FileIndex` +# +# This example shows of to index files from the system and retrieve them +# with text queries. +module example_vsm + +import vsm +import config + +redef class Config + + # --whitelist-exts + var opt_white_exts = new OptionArray("Allowed file extensions (default is [])", + "-w", "--whitelist-exts") + + # --blacklist-exts + var opt_black_exts = new OptionArray("Allowed file extensions (default is [])", + "-b", "--blacklist-exts") + + redef init do + opts.add_option(opt_white_exts, opt_black_exts) + end +end + +var config = new Config +config.tool_description = "usage: example_vsm " +config.parse_options(args) + +if args.length < 1 then + config.usage + exit 1 +end + +var index = new FileIndex +index.whitelist_exts = config.opt_white_exts.value +index.blacklist_exts = config.opt_black_exts.value + +print "Building index..." +index.index_files(args, true) +print "Indexed {index.documents.length} documents" + +loop + print "\nEnter query:" + printn "> " + var input = sys.stdin.read_line + var matches = index.match_string(input) + printn "" + for match in matches do + print match + end +end diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit index e34eb85..41d5152 100644 --- a/lib/vsm/vsm.nit +++ b/lib/vsm/vsm.nit @@ -100,3 +100,275 @@ class Vector return "[{join(", ", ":")}]" end end + +# A Document index based on VSM +# +# Using VSMIndex you can index documents associated with their vector. +# Documents can then be matched to query vectors. +class VSMIndex + + # Documents index + # + # TODO use a more efficient representation. + var documents = new HashSet[Document] + + # Count for all terms in all indexed documents + # + # Used to compute the `inverse_doc_frequency`. + var terms_doc_count = new Vector + + # Inverse document frequency + # + # The inverse document frequency is a measure of how much information a term + # provides, that is, whether the term is common or rare across all documents. + var inverse_doc_frequency = new Vector + + # Used to sort matches + # + # See `IndexMatch`. + var sorter = new IndexMatchSorter + + # Match `query` vector to all index document vectors + # + # Returns an `IndexMatch` for each indexed document. + # Results are ordered by descending similarity. + fun match_vector(query: Vector): Array[IndexMatch] do + var matches = new Array[IndexMatch] + for doc in documents do + var sim = query.cosine_similarity(doc.tfidf) + if sim == 0.0 then continue + matches.add new IndexMatch(doc, sim) + end + sorter.sort(matches) + return matches + end + + # Index a document + # + # With each new document, the `inverse_doc_frequency` must be updated. + # By default, the method `update_index` is called after each call to + # `index_document`. + # + # When processing batch documents, use `auto_update = false` to disable + # the auto update of the index. + fun index_document(doc: Document, auto_update: nullable Bool) do + for term, count in doc.terms_count do + if not terms_doc_count.has_key(term) then + terms_doc_count[term] = 1.0 + else + terms_doc_count[term] += 1.0 + end + end + documents.add doc + if auto_update == null or auto_update then update_index + end + + # Update the index + # + # Recompute the `inverse_doc_frequency` values. + # Must be called manually after indexing new document with the option + # `auto_update = false`. + fun update_index do + for doc in documents do + for term, ccount in doc.terms_count do + inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log + end + end + for doc in documents do + for term, freq in doc.terms_frequency do + doc.tfidf[term] = freq * inverse_doc_frequency[term] + end + end + end +end + +# A VSM index to store strings +class StringIndex + super VSMIndex + + # Index a new Document from `title`, `uri` and string `string`. + # + # Return the Document created. + # + # See `index_document`. + fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do + var vector = parse_string(string) + var doc = new Document(title, uri, vector) + index_document(doc, auto_update) + return doc + end + + # Match the `query` string against all indexed documents + # + # See `match_vector`. + fun match_string(query: String): Array[IndexMatch] do + var vector = parse_string(query) + return match_vector(vector) + end + + # Parse the `string` as a Vector + # + # Returns a vector containing the terms of `string`. + fun parse_string(string: String): Vector do + var reader = new StringReader(string) + var vector = new Vector + loop + var token = reader.read_word + if token == "" then break + + if not vector.has_key(token) then + vector[token] = 1.0 + else + vector[token] += 1.0 + end + end + return vector + end +end + +# A VSMIndex to index files +class FileIndex + super StringIndex + + # Index a file from its `path`. + # + # Return the created document or null if `path` is not accepted by `accept_file`. + # + # See `index_document`. + fun index_file(path: String, auto_update: nullable Bool): nullable Document do + if not accept_file(path) then return null + var vector = parse_file(path) + var doc = new Document(path, path, vector) + index_document(doc, auto_update) + return doc + end + + # Index multiple files + # + # The recursive method `index_dir` will be called for each directory found + # in `paths`. + # + # See `index_file` + fun index_files(paths: Collection[String], auto_update: nullable Bool) do + for path in paths do + if path.to_path.is_dir then + index_dir(path, false) + else + index_file(path, false) + end + end + if auto_update != null and auto_update then update_index + end + + # Index all files in `dir` recursively + # + # See `index_file`. + fun index_dir(dir: String, auto_update: nullable Bool) do + if not dir.to_path.is_dir then return + for file in dir.files do + var path = dir / file + if path.to_path.is_dir then + index_dir(path, false) + else + index_file(path, false) + end + end + if auto_update != null and auto_update then update_index + end + + # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`? + fun accept_file(path: String): Bool do + var ext = path.file_extension + if ext != null then + ext = ext.to_lower + if blacklist_exts.has(ext) then return false + if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false + end + return whitelist_exts.is_empty + end + + # Parse the `file` content as a Vector + # + # See `parse_string`. + fun parse_file(file: String): Vector do + return parse_string(file.to_path.read_all) + end + + # File extensions white list + # + # If not empty, only files with these extensions will be indexed. + # + # If an extension is in both `whitelist_exts` and `blacklist_exts`, the + # blacklist will prevail and the file will be ignored. + var whitelist_exts = new Array[String] is writable + + # File extensions black list + # + # Files with these extensions will not be indexed. + var blacklist_exts = new Array[String] is writable +end + +# A Document to add in a VSMIndex +class Document + + # Document title + var title: String + + # Document URI + var uri: String + + # Count of all terms found in the document + # + # Used to compute the document `terms_frequency`. + var terms_count: Vector + + # Frequency of each term found in the document + # + # Used to match the document against the `VSMIndex::inverse_doc_frequency`. + var terms_frequency: Vector is lazy do + var all_terms = 0.0 + for t, c in terms_count do all_terms += c + + var vector = new Vector + for t, c in terms_count do + vector[t] = c / all_terms + end + return vector + end + + # Term frequency–Inverse document frequency for each term + # + # A high weight in tf–idf is reached by a high term frequency + # (in the given document) and a low document frequency of the term in the + # whole collection of documents + var tfidf = new Vector + + redef fun to_s do return "{title}" +end + +# A match to a `request` in an `Index` +class IndexMatch + super Comparable + + # Document matching the `request_vector` + var document: Document + + # Similarity between the `request` and the `doc`. + # + # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0 + # means perfect similarity. + var similarity: Float + + redef fun to_s do return "{document} ({similarity})" +end + +# Sort matches by similarity +class IndexMatchSorter + super DefaultComparator + + redef type COMPARED: IndexMatch + + redef fun compare(a, b) do + return b.similarity <=> a.similarity + end +end -- 1.7.9.5