X-Git-Url: http://nitlanguage.org diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit index d589e89..e7ebc2e 100644 --- a/lib/vsm/vsm.nit +++ b/lib/vsm/vsm.nit @@ -77,6 +77,17 @@ class Vector return super end + # Increment value for `obj` term + # + # If the term isn't already in the vector, the new value is 1.0. + fun inc(obj: nullable Object) do + if has_key(obj) then + self[obj] += 1.0 + else + self[obj] = 1.0 + end + end + # The norm of the vector. # # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt` @@ -120,6 +131,11 @@ class VSMIndex # Documents index var documents = new HashSet[DOC] + # Inversed index + # + # Link documents to existing terms. + var inversed_index = new HashMap[nullable Object, Array[DOC]] + # Count for all terms in all indexed documents # # Used to compute the `inverse_doc_frequency`. @@ -141,6 +157,12 @@ class VSMIndex # Returns an `IndexMatch` for each indexed document. # Results are ordered by descending similarity. fun match_vector(query: Vector): Array[IndexMatch[DOC]] do + var documents = new HashSet[DOC] + for term, count in query do + if inversed_index.has_key(term) then + documents.add_all inversed_index[term] + end + end var matches = new Array[IndexMatch[DOC]] for doc in documents do var sim = query.cosine_similarity(doc.tfidf) @@ -161,11 +183,11 @@ class VSMIndex # the auto update of the index. fun index_document(doc: DOC, auto_update: nullable Bool) do for term, count in doc.terms_count do - if not terms_doc_count.has_key(term) then - terms_doc_count[term] = 1.0 - else - terms_doc_count[term] += 1.0 + terms_doc_count.inc(term) + if not inversed_index.has_key(term) then + inversed_index[term] = new Array[DOC] end + inversed_index[term].add doc end documents.add doc if auto_update == null or auto_update then update_index @@ -224,12 +246,7 @@ class StringIndex loop var token = reader.read_word if token == "" then break - - if not vector.has_key(token) then - vector[token] = 1.0 - else - vector[token] += 1.0 - end + vector.inc(token) end return vector end