# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vector Space Model
#
# Vector Space Model (VSM) is an algebraic model for representing text documents
# (and any objects, in general) as vectors of identifiers, such as, for example,
# index terms.
#
# It is used in information filtering, information retrieval, indexing and
# relevancy rankings.
module vsm

import counter

# A n-dimensions vector
#
# *n-dimensions* vectors are used to represent a text document or an object.
class Vector
	super HashMap[nullable Object, Float]

	# Cosine similarity of `self` and `other`.
	#
	# Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
	# two vectors are orthogonal and 1.0 means that they are identical.
	#
	# ~~~
	# var v1 = new Vector
	# v1["x"] = 1.0
	# v1["y"] = 2.0
	# v1["z"] = 3.0
	#
	# var v2 = new Vector
	# v2["x"] = 1.0
	# v2["y"] = 2.0
	# v2["z"] = 3.0
	#
	# var v3 = new Vector
	# v3["a"] = 1.0
	# v3["b"] = 2.0
	# v3["c"] = 3.0
	#
	# print v1.cosine_similarity(v2)
	# assert v1.cosine_similarity(v2) == 1.0
	# print v1.cosine_similarity(v3)
	# assert v1.cosine_similarity(v3) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Collect terms
		var terms = new HashSet[nullable Object]
		for k in self.keys do terms.add k
		for k in other.keys do terms.add k

		# Get dot product of two vectors
		var dot = 0.0
		for term in terms do
			dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
		end
		var cos = dot.to_f / (self.norm * other.norm)
		if cos.is_nan then return 0.0
		return cos
	end

	redef fun [](k) do
		if not has_key(k) then return 0.0
		return super
	end

	# Increment value for `obj` term
	#
	# If the term isn't already in the vector, the new value is 1.0.
	fun inc(obj: nullable Object) do
		if has_key(obj) then
			self[obj] += 1.0
		else
			self[obj] = 1.0
		end
	end

	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# ~~~
	# var v = new Vector
	# v["x"] = 1.0
	# v["y"] = 1.0
	# v["z"] = 1.0
	# v["t"] = 1.0
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1.0
	# v["y"] = 2.0
	# v["z"] = 3.0
	# v["t"] = 0.0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	fun norm: Float do
		var sum = 0.0
		for v in self.values do sum += v.pow(2.0)
		return sum.to_f.sqrt
	end

	redef fun to_s do
		return "[{join(", ", ":")}]"
	end
end

# A Document index based on VSM
#
# Using VSMIndex you can index documents associated with their vector.
# Documents can then be matched to query vectors.
class VSMIndex

	# Kind of documents stored in this index
	#
	# Clients can redefine this type to specialize the index.
	type DOC: Document

	# Documents index
	var documents = new HashSet[DOC]

	# Inversed index
	#
	# Link documents to existing terms.
	var inversed_index = new HashMap[nullable Object, Array[DOC]]

	# Count for all terms in all indexed documents
	#
	# Used to compute the `inverse_doc_frequency`.
	var terms_doc_count = new Vector

	# Inverse document frequency
	#
	# The inverse document frequency is a measure of how much information a term
	# provides, that is, whether the term is common or rare across all documents.
	var inverse_doc_frequency = new Vector

	# Used to sort matches
	#
	# See `IndexMatch`.
	var sorter = new IndexMatchSorter

	# Match `query` vector to all index document vectors
	#
	# Returns an `IndexMatch` for each indexed document.
	# Results are ordered by descending similarity.
	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
		var documents = new HashSet[DOC]
		for term, count in query do
			if inversed_index.has_key(term) then
				documents.add_all inversed_index[term]
			end
		end
		var matches = new Array[IndexMatch[DOC]]
		for doc in documents do
			var sim = query.cosine_similarity(doc.tfidf)
			if sim == 0.0 then continue
			matches.add new IndexMatch[DOC](doc, sim)
		end
		sorter.sort(matches)
		return matches
	end

	# Index a document
	#
	# With each new document, the `inverse_doc_frequency` must be updated.
	# By default, the method `update_index` is called after each call to
	# `index_document`.
	#
	# When processing batch documents, use `auto_update = false` to disable
	# the auto update of the index.
	fun index_document(doc: DOC, auto_update: nullable Bool) do
		for term, count in doc.terms_count do
			terms_doc_count.inc(term)
			if not inversed_index.has_key(term) then
				inversed_index[term] = new Array[DOC]
			end
			inversed_index[term].add doc
		end
		documents.add doc
		if auto_update == null or auto_update then update_index
	end

	# Update the index
	#
	# Recompute the `inverse_doc_frequency` values.
	# Must be called manually after indexing new document with the option
	# `auto_update = false`.
	fun update_index do
		for doc in documents do
			for term, ccount in doc.terms_count do
				inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
			end
		end
		for doc in documents do
			for term, freq in doc.terms_frequency do
				doc.tfidf[term] = freq * inverse_doc_frequency[term]
			end
		end
	end
end

# A VSM index to store strings
class StringIndex
	super VSMIndex

	# Index a new Document from `title`, `uri` and string `string`.
	#
	# Return the Document created.
	#
	# See `index_document`.
	fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
		var vector = parse_string(string)
		var doc = new Document(title, uri, vector)
		index_document(doc, auto_update)
		return doc
	end

	# Match the `query` string against all indexed documents
	#
	# See `match_vector`.
	fun match_string(query: String): Array[IndexMatch[DOC]] do
		var vector = parse_string(query)
		var doc = new Document("", "", vector)
		return match_vector(doc.terms_frequency)
	end

	# Parse the `string` as a Vector
	#
	# Returns a vector containing the terms of `string`.
	fun parse_string(string: String): Vector do
		var reader = new StringReader(string)
		var vector = new Vector
		loop
			var token = reader.read_word
			if token == "" then break
			vector.inc(token)
		end
		return vector
	end
end

# A VSMIndex to index files
class FileIndex
	super StringIndex

	# Index a file from its `path`.
	#
	# Return the created document or null if `path` is not accepted by `accept_file`.
	#
	# See `index_document`.
	fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
		if not accept_file(path) then return null
		var vector = parse_file(path)
		var doc = new Document(path, path, vector)
		index_document(doc, auto_update)
		return doc
	end

	# Index multiple files
	#
	# The recursive method `index_dir` will be called for each directory found
	# in `paths`.
	#
	# See `index_file`
	fun index_files(paths: Collection[String], auto_update: nullable Bool) do
		for path in paths do
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end

	# Index all files in `dir` recursively
	#
	# See `index_file`.
	fun index_dir(dir: String, auto_update: nullable Bool) do
		if not dir.to_path.is_dir then return
		for file in dir.files do
			var path = dir / file
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end

	# Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
	fun accept_file(path: String): Bool do
		var ext = path.file_extension
		if ext != null then
			ext = ext.to_lower
			if blacklist_exts.has(ext) then return false
			if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
		end
		return whitelist_exts.is_empty
	end

	# Parse the `file` content as a Vector
	#
	# See `parse_string`.
	fun parse_file(file: String): Vector do
		return parse_string(file.to_path.read_all)
	end

	# File extensions white list
	#
	# If not empty, only files with these extensions will be indexed.
	#
	# If an extension is in both `whitelist_exts` and `blacklist_exts`, the
	# blacklist will prevail and the file will be ignored.
	var whitelist_exts = new Array[String] is writable

	# File extensions black list
	#
	# Files with these extensions will not be indexed.
	var blacklist_exts = new Array[String] is writable
end

# A Document to add in a VSMIndex
class Document

	# Document title
	var title: String

	# Document URI
	var uri: String

	# Count of all terms found in the document
	#
	# Used to compute the document `terms_frequency`.
	var terms_count: Vector

	# Frequency of each term found in the document
	#
	# Used to match the document against the `VSMIndex::inverse_doc_frequency`.
	var terms_frequency: Vector is lazy do
		var all_terms = 0.0
		for t, c in terms_count do all_terms += c

		var vector = new Vector
		for t, c in terms_count do
			vector[t] = c / all_terms
		end
		return vector
	end

	# Term frequency–Inverse document frequency for each term
	#
	# A high weight in tf–idf is reached by a high term frequency
	# (in the given document) and a low document frequency of the term in the
	# whole collection of documents
	var tfidf: Vector = terms_count is lazy

	redef fun to_s do return "{title}"
end

# A match to a `request` in an `Index`
class IndexMatch[DOC: Document]
	super Comparable

	# Document matching the `request_vector`
	var document: DOC

	# Similarity between the `request` and the `doc`.
	#
	# Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
	# means perfect similarity.
	var similarity: Float

	redef fun to_s do return "{document} ({similarity})"
end

# Sort matches by similarity
class IndexMatchSorter
	super DefaultComparator

	redef type COMPARED: IndexMatch[Document]

	redef fun compare(a, b) do
		return b.similarity <=> a.similarity
	end
end