return super
end
+ # Increment value for `obj` term
+ #
+ # If the term isn't already in the vector, the new value is 1.0.
+ # Otherwise the stored count is increased by 1.0.
+ fun inc(obj: nullable Object) do
+ if has_key(obj) then
+ self[obj] += 1.0
+ else
+ self[obj] = 1.0
+ end
+ end
+
# The norm of the vector.
#
# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
# Documents index
#
# Every document added through `index_document` is kept in this set.
var documents = new HashSet[DOC]
+ # Inverted index
+ #
+ # Maps each term to the documents that contain it, so `match_vector`
+ # only has to score documents sharing at least one term with the query.
+ var inversed_index = new HashMap[nullable Object, Array[DOC]]
+
# Count for all terms in all indexed documents
#
# Used to compute the `inverse_doc_frequency`.
# Returns an `IndexMatch` for each document sharing a term with the query.
# Results are ordered by descending similarity.
fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+ # Candidate set: gather only documents containing at least one query
+ # term through `inversed_index`, instead of scanning every document.
+ # NOTE(review): this local `documents` shadows the index-wide
+ # `documents` attribute — confirm the shadowing is intentional.
+ var documents = new HashSet[DOC]
+ for term, count in query do
+ if inversed_index.has_key(term) then
+ documents.add_all inversed_index[term]
+ end
+ end
var matches = new Array[IndexMatch[DOC]]
for doc in documents do
# Cosine similarity between the query and the document's tf–idf vector.
var sim = query.cosine_similarity(doc.tfidf)
# the auto update of the index.
fun index_document(doc: DOC, auto_update: nullable Bool) do
for term, count in doc.terms_count do
- if not terms_doc_count.has_key(term) then
- terms_doc_count[term] = 1.0
- else
- terms_doc_count[term] += 1.0
+ # One more document contains `term` (see `Vector::inc`).
+ terms_doc_count.inc(term)
+ # Register `doc` in the inverted index entry for `term`.
+ if not inversed_index.has_key(term) then
+ inversed_index[term] = new Array[DOC]
end
+ inversed_index[term].add doc
end
documents.add doc
# A null `auto_update` defaults to true: refresh the index unless the
# caller explicitly passed false.
if auto_update == null or auto_update then update_index
loop
# Read the next whitespace-delimited word; an empty string means
# the reader is exhausted.
var token = reader.read_word
if token == "" then break
-
- if not vector.has_key(token) then
- vector[token] = 1.0
- else
- vector[token] += 1.0
- end
+ # Count one occurrence of `token` (see `Vector::inc`).
+ vector.inc(token)
end
return vector
end
# A high weight in tf–idf is reached by a high term frequency
# (in the given document) and a low document frequency of the term in the
# whole collection of documents
- var tfidf = new Vector
+ # NOTE(review): `terms_count is lazy` aliases the raw counts vector (no
+ # copy is taken) — mutating `tfidf` in place would clobber `terms_count`;
+ # confirm `update_index` replaces the vector rather than mutating it.
+ var tfidf: Vector = terms_count is lazy
# Display the document by its `title`.
redef fun to_s do return "{title}"
end