From b5223fcf73ae72c3f5937b06c6d0991c99351def Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa <alexandre@moz-code.org>
Date: Fri, 22 Sep 2017 16:37:19 -0400
Subject: [PATCH] lib/vsm: introduce an indexing process based on VSM

Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>
---
 lib/vsm/examples/example_vsm.nit |   65 +++++++++
 lib/vsm/vsm.nit                  |  272 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 337 insertions(+)
 create mode 100644 lib/vsm/examples/example_vsm.nit
diff --git a/lib/vsm/examples/example_vsm.nit b/lib/vsm/examples/example_vsm.nit
new file mode 100644
index 0000000..5f1192b
--- /dev/null
+++ b/lib/vsm/examples/example_vsm.nit
@@ -0,0 +1,65 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example using a `FileIndex`
+#
+# This example shows of to index files from the system and retrieve them
+# with text queries.
+module example_vsm
+
+import vsm
+import config
+
+redef class Config
+
+	# --whitelist-exts
+	var opt_white_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-w", "--whitelist-exts")
+
+	# --blacklist-exts
+	var opt_black_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-b", "--blacklist-exts")
+
+	redef init do
+		opts.add_option(opt_white_exts, opt_black_exts)
+	end
+end
+
+var config = new Config
+config.tool_description = "usage: example_vsm <files>"
+config.parse_options(args)
+
+if args.length < 1 then
+	config.usage
+	exit 1
+end
+
+var index = new FileIndex
+index.whitelist_exts = config.opt_white_exts.value
+index.blacklist_exts = config.opt_black_exts.value
+
+print "Building index..."
+index.index_files(args, true)
+print "Indexed {index.documents.length} documents"
+
+loop
+	print "\nEnter query:"
+	printn "> "
+	var input = sys.stdin.read_line
+	var matches = index.match_string(input)
+	printn ""
+	for match in matches do
+		print match
+	end
+end
diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit
index e34eb85..41d5152 100644
--- a/lib/vsm/vsm.nit
+++ b/lib/vsm/vsm.nit
@@ -100,3 +100,275 @@ class Vector
 		return "[{join(", ", ":")}]"
 	end
 end
+
+# A Document index based on VSM
+#
+# Using VSMIndex you can index documents associated with their vector.
+# Documents can then be matched to query vectors.
+class VSMIndex
+
+	# Documents index
+	#
+	# TODO use a more efficient representation.
+	var documents = new HashSet[Document]
+
+	# Count for all terms in all indexed documents
+	#
+	# Used to compute the `inverse_doc_frequency`.
+	var terms_doc_count = new Vector
+
+	# Inverse document frequency
+	#
+	# The inverse document frequency is a measure of how much information a term
+	# provides, that is, whether the term is common or rare across all documents.
+	var inverse_doc_frequency = new Vector
+
+	# Used to sort matches
+	#
+	# See `IndexMatch`.
+	var sorter = new IndexMatchSorter
+
+	# Match `query` vector to all index document vectors
+	#
+	# Returns an `IndexMatch` for each indexed document.
+	# Results are ordered by descending similarity.
+	fun match_vector(query: Vector): Array[IndexMatch] do
+		var matches = new Array[IndexMatch]
+		for doc in documents do
+			var sim = query.cosine_similarity(doc.tfidf)
+			if sim == 0.0 then continue
+			matches.add new IndexMatch(doc, sim)
+		end
+		sorter.sort(matches)
+		return matches
+	end
+
+	# Index a document
+	#
+	# With each new document, the `inverse_doc_frequency` must be updated.
+	# By default, the method `update_index` is called after each call to
+	# `index_document`.
+	#
+	# When processing batch documents, use `auto_update = false` to disable
+	# the auto update of the index.
+	fun index_document(doc: Document, auto_update: nullable Bool) do
+		for term, count in doc.terms_count do
+			if not terms_doc_count.has_key(term) then
+				terms_doc_count[term] = 1.0
+			else
+				terms_doc_count[term] += 1.0
+			end
+		end
+		documents.add doc
+		if auto_update == null or auto_update then update_index
+	end
+
+	# Update the index
+	#
+	# Recompute the `inverse_doc_frequency` values.
+	# Must be called manually after indexing new document with the option
+	# `auto_update = false`.
+	fun update_index do
+		for doc in documents do
+			for term, ccount in doc.terms_count do
+				inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
+			end
+		end
+		for doc in documents do
+			for term, freq in doc.terms_frequency do
+				doc.tfidf[term] = freq * inverse_doc_frequency[term]
+			end
+		end
+	end
+end
+
+# A VSM index to store strings
+class StringIndex
+	super VSMIndex
+
+	# Index a new Document from `title`, `uri` and string `string`.
+	#
+	# Return the Document created.
+	#
+	# See `index_document`.
+	fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+		var vector = parse_string(string)
+		var doc = new Document(title, uri, vector)
+		index_document(doc, auto_update)
+		return doc
+	end
+
+	# Match the `query` string against all indexed documents
+	#
+	# See `match_vector`.
+	fun match_string(query: String): Array[IndexMatch] do
+		var vector = parse_string(query)
+		return match_vector(vector)
+	end
+
+	# Parse the `string` as a Vector
+	#
+	# Returns a vector containing the terms of `string`.
+	fun parse_string(string: String): Vector do
+		var reader = new StringReader(string)
+		var vector = new Vector
+		loop
+			var token = reader.read_word
+			if token == "" then break
+
+			if not vector.has_key(token) then
+				vector[token] = 1.0
+			else
+				vector[token] += 1.0
+			end
+		end
+		return vector
+	end
+end
+
+# A VSMIndex to index files
+class FileIndex
+	super StringIndex
+
+	# Index a file from its `path`.
+	#
+	# Return the created document or null if `path` is not accepted by `accept_file`.
+	#
+	# See `index_document`.
+	fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+		if not accept_file(path) then return null
+		var vector = parse_file(path)
+		var doc = new Document(path, path, vector)
+		index_document(doc, auto_update)
+		return doc
+	end
+
+	# Index multiple files
+	#
+	# The recursive method `index_dir` will be called for each directory found
+	# in `paths`.
+	#
+	# See `index_file`
+	fun index_files(paths: Collection[String], auto_update: nullable Bool) do
+		for path in paths do
+			if path.to_path.is_dir then
+				index_dir(path, false)
+			else
+				index_file(path, false)
+			end
+		end
+		if auto_update != null and auto_update then update_index
+	end
+
+	# Index all files in `dir` recursively
+	#
+	# See `index_file`.
+	fun index_dir(dir: String, auto_update: nullable Bool) do
+		if not dir.to_path.is_dir then return
+		for file in dir.files do
+			var path = dir / file
+			if path.to_path.is_dir then
+				index_dir(path, false)
+			else
+				index_file(path, false)
+			end
+		end
+		if auto_update != null and auto_update then update_index
+	end
+
+	# Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
+	fun accept_file(path: String): Bool do
+		var ext = path.file_extension
+		if ext != null then
+			ext = ext.to_lower
+			if blacklist_exts.has(ext) then return false
+			if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
+		end
+		return whitelist_exts.is_empty
+	end
+
+	# Parse the `file` content as a Vector
+	#
+	# See `parse_string`.
+	fun parse_file(file: String): Vector do
+		return parse_string(file.to_path.read_all)
+	end
+
+	# File extensions white list
+	#
+	# If not empty, only files with these extensions will be indexed.
+	#
+	# If an extension is in both `whitelist_exts` and `blacklist_exts`, the
+	# blacklist will prevail and the file will be ignored.
+	var whitelist_exts = new Array[String] is writable
+
+	# File extensions black list
+	#
+	# Files with these extensions will not be indexed.
+	var blacklist_exts = new Array[String] is writable
+end
+
+# A Document to add in a VSMIndex
+class Document
+
+	# Document title
+	var title: String
+
+	# Document URI
+	var uri: String
+
+	# Count of all terms found in the document
+	#
+	# Used to compute the document `terms_frequency`.
+	var terms_count: Vector
+
+	# Frequency of each term found in the document
+	#
+	# Used to match the document against the `VSMIndex::inverse_doc_frequency`.
+	var terms_frequency: Vector is lazy do
+		var all_terms = 0.0
+		for t, c in terms_count do all_terms += c
+
+		var vector = new Vector
+		for t, c in terms_count do
+			vector[t] = c / all_terms
+		end
+		return vector
+	end
+
+	# Term frequencyâInverse document frequency for each term
+	#
+	# A high weight in tfâidf is reached by a high term frequency
+	# (in the given document) and a low document frequency of the term in the
+	# whole collection of documents
+	var tfidf = new Vector
+
+	redef fun to_s do return "{title}"
+end
+
+# A match to a `request` in an `Index`
+class IndexMatch
+	super Comparable
+
+	# Document matching the `request_vector`
+	var document: Document
+
+	# Similarity between the `request` and the `doc`.
+	#
+	# Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
+	# means perfect similarity.
+	var similarity: Float
+
+	redef fun to_s do return "{document} ({similarity})"
+end
+
+# Sort matches by similarity
+class IndexMatchSorter
+	super DefaultComparator
+
+	redef type COMPARED: IndexMatch
+
+	redef fun compare(a, b) do
+		return b.similarity <=> a.similarity
+	end
+end
-- 
1.7.9.5