lib/vsm: default tfidf values are extracted from term frequencies

[nit.git] / lib / vsm / vsm.nit
diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit

index d589e89..ad3d928 100644 (file)
--- a/lib/vsm/vsm.nit
+++ b/lib/vsm/vsm.nit
@@ -77,6 +77,17 @@ class Vector
                 return super
         end
  
+       # Increment the value stored for the term `obj` by 1.0.
+       #
+       # If `obj` is not yet a key of the vector, its value is initialized to 1.0.
+       fun inc(obj: nullable Object) do
+               if has_key(obj) then
+                       self[obj] += 1.0
+               else
+                       self[obj] = 1.0
+               end
+       end
+
         # The norm of the vector.
         #
         # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
@@ -120,6 +131,11 @@ class VSMIndex
         # Documents index
         var documents = new HashSet[DOC]
  
+       # Inverted index
+       #
+       # Maps each indexed term to the documents that contain it.
+       var inversed_index = new HashMap[nullable Object, Array[DOC]]
+
         # Count for all terms in all indexed documents
         #
         # Used to compute the `inverse_doc_frequency`.
@@ -141,6 +157,12 @@ class VSMIndex
         # Returns an `IndexMatch` for each indexed document.
         # Results are ordered by descending similarity.
         fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+               var documents = new HashSet[DOC]
+               for term, count in query do
+                       if inversed_index.has_key(term) then
+                               documents.add_all inversed_index[term]
+                       end
+               end
                 var matches = new Array[IndexMatch[DOC]]
                 for doc in documents do
                         var sim = query.cosine_similarity(doc.tfidf)
@@ -161,11 +183,11 @@ class VSMIndex
         # the auto update of the index.
         fun index_document(doc: DOC, auto_update: nullable Bool) do
                 for term, count in doc.terms_count do
-                       if not terms_doc_count.has_key(term) then
-                               terms_doc_count[term] = 1.0
-                       else
-                               terms_doc_count[term] += 1.0
+                       terms_doc_count.inc(term)
+                       if not inversed_index.has_key(term) then
+                               inversed_index[term] = new Array[DOC]
                         end
+                       inversed_index[term].add doc
                 end
                 documents.add doc
                 if auto_update == null or auto_update then update_index
@@ -224,12 +246,7 @@ class StringIndex
                 loop
                         var token = reader.read_word
                         if token == "" then break
-
-                       if not vector.has_key(token) then
-                               vector[token] = 1.0
-                       else
-                               vector[token] += 1.0
-                       end
+                       vector.inc(token)
                 end
                 return vector
         end
@@ -350,7 +367,7 @@ class Document
         # A high weight in tf–idf is reached by a high term frequency
         # (in the given document) and a low document frequency of the term in the
         # whole collection of documents
-       var tfidf = new Vector
+       var tfidf: Vector = terms_count is lazy
  
         redef fun to_s do return "{title}"
  end