X-Git-Url: http://nitlanguage.org

diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit
index 4a64f99..ad3d928 100644
--- a/lib/vsm/vsm.nit
+++ b/lib/vsm/vsm.nit
@@ -77,6 +77,17 @@ class Vector
 		return super
 	end
 
+	# Increment value for `obj` term
+	#
+	# If the term isn't already in the vector, the new value is 1.0.
+	fun inc(obj: nullable Object) do
+		if has_key(obj) then
+			self[obj] += 1.0
+		else
+			self[obj] = 1.0
+		end
+	end
+
 	# The norm of the vector.
 	#
 	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
@@ -112,10 +123,18 @@ end
 # Documents can then be matched to query vectors.
 class VSMIndex
 
+	# Kind of documents stored in this index
+	#
+	# Clients can redefine this type to specialize the index.
+	type DOC: Document
+
 	# Documents index
+	var documents = new HashSet[DOC]
+
+	# Inversed index
 	#
-	# TODO use a more efficient representation.
-	var documents = new HashSet[Document]
+	# Link documents to existing terms.
+	var inversed_index = new HashMap[nullable Object, Array[DOC]]
 
 	# Count for all terms in all indexed documents
 	#
@@ -137,12 +156,18 @@ class VSMIndex
 	#
 	# Returns an `IndexMatch` for each indexed document.
 	# Results are ordered by descending similarity.
-	fun match_vector(query: Vector): Array[IndexMatch] do
-		var matches = new Array[IndexMatch]
+	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+		var documents = new HashSet[DOC]
+		for term, count in query do
+			if inversed_index.has_key(term) then
+				documents.add_all inversed_index[term]
+			end
+		end
+		var matches = new Array[IndexMatch[DOC]]
 		for doc in documents do
 			var sim = query.cosine_similarity(doc.tfidf)
 			if sim == 0.0 then continue
-			matches.add new IndexMatch(doc, sim)
+			matches.add new IndexMatch[DOC](doc, sim)
 		end
 		sorter.sort(matches)
 		return matches
@@ -156,13 +181,13 @@ class VSMIndex
 	#
 	# When processing batch documents, use `auto_update = false` to disable
 	# the auto update of the index.
-	fun index_document(doc: Document, auto_update: nullable Bool) do
+	fun index_document(doc: DOC, auto_update: nullable Bool) do
 		for term, count in doc.terms_count do
-			if not terms_doc_count.has_key(term) then
-				terms_doc_count[term] = 1.0
-			else
-				terms_doc_count[term] += 1.0
+			terms_doc_count.inc(term)
+			if not inversed_index.has_key(term) then
+				inversed_index[term] = new Array[DOC]
 			end
+			inversed_index[term].add doc
 		end
 		documents.add doc
 		if auto_update == null or auto_update then update_index
@@ -196,7 +221,7 @@ class StringIndex
 	# Return the Document created.
 	#
 	# See `index_document`.
-	fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+	fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
 		var vector = parse_string(string)
 		var doc = new Document(title, uri, vector)
 		index_document(doc, auto_update)
@@ -206,7 +231,7 @@ class StringIndex
 	# Match the `query` string against all indexed documents
 	#
 	# See `match_vector`.
-	fun match_string(query: String): Array[IndexMatch] do
+	fun match_string(query: String): Array[IndexMatch[DOC]] do
 		var vector = parse_string(query)
 		var doc = new Document("", "", vector)
 		return match_vector(doc.terms_frequency)
@@ -221,12 +246,7 @@ class StringIndex
 		loop
 			var token = reader.read_word
 			if token == "" then break
-
-			if not vector.has_key(token) then
-				vector[token] = 1.0
-			else
-				vector[token] += 1.0
-			end
+			vector.inc(token)
 		end
 		return vector
 	end
@@ -241,7 +261,7 @@ class FileIndex
 	# Return the created document or null if `path` is not accepted by `accept_file`.
 	#
 	# See `index_document`.
-	fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+	fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
 		if not accept_file(path) then return null
 		var vector = parse_file(path)
 		var doc = new Document(path, path, vector)
@@ -347,17 +367,17 @@ class Document
 	# A high weight in tfâidf is reached by a high term frequency
 	# (in the given document) and a low document frequency of the term in the
 	# whole collection of documents
-	var tfidf = new Vector
+	var tfidf: Vector = terms_count is lazy
 
 	redef fun to_s do return "{title}"
 end
 
 # A match to a `request` in an `Index`
-class IndexMatch
+class IndexMatch[DOC: Document]
 	super Comparable
 
 	# Document matching the `request_vector`
-	var document: Document
+	var document: DOC
 
 	# Similarity between the `request` and the `doc`.
 	#
@@ -372,7 +392,7 @@ end
 class IndexMatchSorter
 	super DefaultComparator
 
-	redef type COMPARED: IndexMatch
+	redef type COMPARED: IndexMatch[Document]
 
 	redef fun compare(a, b) do
 		return b.similarity <=> a.similarity