Vector Space Model (VSM) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers, such as, for example, index terms.
It is used in information filtering, information retrieval, indexing and relevancy rankings.
serialization :: serialization_core
Abstract services to serialize Nit objects to different formatscore :: union_find
union–find algorithm using an efficient disjoint-set data structurenlp :: nlp_server
# Vector Space Model
#
# Vector Space Model (VSM) is an algebraic model for representing text documents
# (and any objects, in general) as vectors of identifiers, such as, for example,
# index terms.
#
# It is used in information filtering, information retrieval, indexing and
# relevancy rankings.
module vsm
import counter
# A n-dimensions vector
#
# *n-dimensions* vectors are used to represent a text document or an object.
class Vector
	super HashMap[nullable Object, Float]
	# Cosine similarity of `self` and `other`.
	#
	# Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
	# two vectors are orthogonal and 1.0 means that they are identical.
	#
	# ~~~
	# var v1 = new Vector
	# v1["x"] = 1.0
	# v1["y"] = 2.0
	# v1["z"] = 3.0
	#
	# var v2 = new Vector
	# v2["x"] = 1.0
	# v2["y"] = 2.0
	# v2["z"] = 3.0
	#
	# var v3 = new Vector
	# v3["a"] = 1.0
	# v3["b"] = 2.0
	# v3["c"] = 3.0
	#
	# print v1.cosine_similarity(v2)
	# assert v1.cosine_similarity(v2) == 1.0
	# print v1.cosine_similarity(v3)
	# assert v1.cosine_similarity(v3) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Collect terms
		var terms = new HashSet[nullable Object]
		for k in self.keys do terms.add k
		for k in other.keys do terms.add k
		# Get dot product of two vectors
		var dot = 0.0
		for term in terms do
			dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
		end
		var cos = dot.to_f / (self.norm * other.norm)
		if cos.is_nan then return 0.0
		return cos
	end
	redef fun [](k) do
		if not has_key(k) then return 0.0
		return super
	end
	# Increment value for `obj` term
	#
	# If the term isn't already in the vector, the new value is 1.0.
	fun inc(obj: nullable Object) do
		if has_key(obj) then
			self[obj] += 1.0
		else
			self[obj] = 1.0
		end
	end
	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# ~~~
	# var v = new Vector
	# v["x"] = 1.0
	# v["y"] = 1.0
	# v["z"] = 1.0
	# v["t"] = 1.0
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1.0
	# v["y"] = 2.0
	# v["z"] = 3.0
	# v["t"] = 0.0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	fun norm: Float do
		var sum = 0.0
		for v in self.values do sum += v.pow(2.0)
		return sum.to_f.sqrt
	end
	redef fun to_s do
		return "[{join(", ", ":")}]"
	end
end
# A Document index based on VSM
#
# Using VSMIndex you can index documents associated with their vector.
# Documents can then be matched to query vectors.
class VSMIndex
	# Kind of documents stored in this index
	#
	# Clients can redefine this type to specialize the index.
	type DOC: Document
	# Documents index
	var documents = new HashSet[DOC]
	# Inversed index
	#
	# Link documents to existing terms.
	var inversed_index = new HashMap[nullable Object, Array[DOC]]
	# Count for all terms in all indexed documents
	#
	# Used to compute the `inverse_doc_frequency`.
	var terms_doc_count = new Vector
	# Inverse document frequency
	#
	# The inverse document frequency is a measure of how much information a term
	# provides, that is, whether the term is common or rare across all documents.
	var inverse_doc_frequency = new Vector
	# Used to sort matches
	#
	# See `IndexMatch`.
	var sorter = new IndexMatchSorter
	# Match `query` vector to all index document vectors
	#
	# Returns an `IndexMatch` for each indexed document.
	# Results are ordered by descending similarity.
	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
		var documents = new HashSet[DOC]
		for term, count in query do
			if inversed_index.has_key(term) then
				documents.add_all inversed_index[term]
			end
		end
		var matches = new Array[IndexMatch[DOC]]
		for doc in documents do
			var sim = query.cosine_similarity(doc.tfidf)
			if sim == 0.0 then continue
			matches.add new IndexMatch[DOC](doc, sim)
		end
		sorter.sort(matches)
		return matches
	end
	# Index a document
	#
	# With each new document, the `inverse_doc_frequency` must be updated.
	# By default, the method `update_index` is called after each call to
	# `index_document`.
	#
	# When processing batch documents, use `auto_update = false` to disable
	# the auto update of the index.
	fun index_document(doc: DOC, auto_update: nullable Bool) do
		for term, count in doc.terms_count do
			terms_doc_count.inc(term)
			if not inversed_index.has_key(term) then
				inversed_index[term] = new Array[DOC]
			end
			inversed_index[term].add doc
		end
		documents.add doc
		if auto_update == null or auto_update then update_index
	end
	# Update the index
	#
	# Recompute the `inverse_doc_frequency` values.
	# Must be called manually after indexing new document with the option
	# `auto_update = false`.
	fun update_index do
		for doc in documents do
			for term, ccount in doc.terms_count do
				inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
			end
		end
		for doc in documents do
			for term, freq in doc.terms_frequency do
				doc.tfidf[term] = freq * inverse_doc_frequency[term]
			end
		end
	end
end
# A VSM index to store strings
class StringIndex
	super VSMIndex
	# Index a new Document from `title`, `uri` and string `string`.
	#
	# Return the Document created.
	#
	# See `index_document`.
	fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
		var vector = parse_string(string)
		var doc = new Document(title, uri, vector)
		index_document(doc, auto_update)
		return doc
	end
	# Match the `query` string against all indexed documents
	#
	# See `match_vector`.
	fun match_string(query: String): Array[IndexMatch[DOC]] do
		var vector = parse_string(query)
		var doc = new Document("", "", vector)
		return match_vector(doc.terms_frequency)
	end
	# Parse the `string` as a Vector
	#
	# Returns a vector containing the terms of `string`.
	fun parse_string(string: String): Vector do
		var reader = new StringReader(string)
		var vector = new Vector
		loop
			var token = reader.read_word
			if token == "" then break
			vector.inc(token)
		end
		return vector
	end
end
# A VSMIndex to index files
class FileIndex
	super StringIndex
	# Index a file from its `path`.
	#
	# Return the created document or null if `path` is not accepted by `accept_file`.
	#
	# See `index_document`.
	fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
		if not accept_file(path) then return null
		var vector = parse_file(path)
		var doc = new Document(path, path, vector)
		index_document(doc, auto_update)
		return doc
	end
	# Index multiple files
	#
	# The recursive method `index_dir` will be called for each directory found
	# in `paths`.
	#
	# See `index_file`
	fun index_files(paths: Collection[String], auto_update: nullable Bool) do
		for path in paths do
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end
	# Index all files in `dir` recursively
	#
	# See `index_file`.
	fun index_dir(dir: String, auto_update: nullable Bool) do
		if not dir.to_path.is_dir then return
		for file in dir.files do
			var path = dir / file
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end
	# Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
	fun accept_file(path: String): Bool do
		var ext = path.file_extension
		if ext != null then
			ext = ext.to_lower
			if blacklist_exts.has(ext) then return false
			if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
		end
		return whitelist_exts.is_empty
	end
	# Parse the `file` content as a Vector
	#
	# See `parse_string`.
	fun parse_file(file: String): Vector do
		return parse_string(file.to_path.read_all)
	end
	# File extensions white list
	#
	# If not empty, only files with these extensions will be indexed.
	#
	# If an extension is in both `whitelist_exts` and `blacklist_exts`, the
	# blacklist will prevail and the file will be ignored.
	var whitelist_exts = new Array[String] is writable
	# File extensions black list
	#
	# Files with these extensions will not be indexed.
	var blacklist_exts = new Array[String] is writable
end
# A Document to add in a VSMIndex
class Document
	# Document title
	var title: String
	# Document URI
	var uri: String
	# Count of all terms found in the document
	#
	# Used to compute the document `terms_frequency`.
	var terms_count: Vector
	# Frequency of each term found in the document
	#
	# Used to match the document against the `VSMIndex::inverse_doc_frequency`.
	var terms_frequency: Vector is lazy do
		var all_terms = 0.0
		for t, c in terms_count do all_terms += c
		var vector = new Vector
		for t, c in terms_count do
			vector[t] = c / all_terms
		end
		return vector
	end
	# Term frequency–Inverse document frequency for each term
	#
	# A high weight in tf–idf is reached by a high term frequency
	# (in the given document) and a low document frequency of the term in the
	# whole collection of documents
	var tfidf: Vector = terms_count is lazy
	redef fun to_s do return "{title}"
end
# A match to a `request` in an `Index`
class IndexMatch[DOC: Document]
	super Comparable
	# Document matching the `request_vector`
	var document: DOC
	# Similarity between the `request` and the `doc`.
	#
	# Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
	# means perfect similarity.
	var similarity: Float
	redef fun to_s do return "{document} ({similarity})"
end
# Sort matches by similarity
class IndexMatchSorter
	super DefaultComparator
	redef type COMPARED: IndexMatch[Document]
	redef fun compare(a, b) do
		return b.similarity <=> a.similarity
	end
end
lib/vsm/vsm.nit:15,1--400,3