vsm :: VSMIndex :: defaultinit
# A Document index based on VSM
#
# Using VSMIndex you can index documents associated with their vector.
# Documents can then be matched to query vectors.
class VSMIndex

	# Kind of documents stored in this index
	#
	# Clients can redefine this type to specialize the index.
	type DOC: Document

	# Indexed documents
	var documents = new HashSet[DOC]

	# Inverted index
	#
	# Maps each known term to the documents whose `terms_count` contains it.
	var inversed_index = new HashMap[nullable Object, Array[DOC]]

	# For each term, the number of indexed documents containing it
	#
	# Used to compute the `inverse_doc_frequency`.
	var terms_doc_count = new Vector

	# Inverse document frequency
	#
	# The inverse document frequency is a measure of how much information a term
	# provides, that is, whether the term is common or rare across all documents.
	#
	# Recomputed by `update_index`.
	var inverse_doc_frequency = new Vector

	# Used to sort matches
	#
	# See `IndexMatch`.
	var sorter = new IndexMatchSorter

	# Match `query` vector against the indexed document vectors
	#
	# Only documents sharing at least one term with `query` are considered, and
	# documents whose cosine similarity with `query` is `0.0` are discarded.
	# Returns an `IndexMatch` for each remaining document.
	# Results are sorted with `sorter` (descending similarity).
	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
		# Collect candidates through the inverted index; a set is used because
		# a document is listed once per term it shares with the query.
		var candidates = new HashSet[DOC]
		for term, count in query do
			if inversed_index.has_key(term) then
				candidates.add_all inversed_index[term]
			end
		end

		var matches = new Array[IndexMatch[DOC]]
		for doc in candidates do
			var sim = query.cosine_similarity(doc.tfidf)
			if sim == 0.0 then continue
			matches.add new IndexMatch[DOC](doc, sim)
		end
		sorter.sort(matches)
		return matches
	end

	# Index a document
	#
	# With each new document, the `inverse_doc_frequency` must be updated.
	# By default, the method `update_index` is called after each call to
	# `index_document`.
	#
	# When processing batch documents, use `auto_update = false` to disable
	# the auto update of the index.
	#
	# Indexing a document that is already in `documents` does not count its
	# terms again; `update_index` is still called according to `auto_update`.
	fun index_document(doc: DOC, auto_update: nullable Bool) do
		# `documents` is a set: registering the same document twice would
		# increment `terms_doc_count` twice while `documents.length` stays the
		# same, skewing the ratio used by `update_index`.
		if not documents.has(doc) then
			for term, count in doc.terms_count do
				terms_doc_count.inc(term)
				if not inversed_index.has_key(term) then
					inversed_index[term] = new Array[DOC]
				end
				inversed_index[term].add doc
			end
			documents.add doc
		end
		# A `null` flag means "not specified" and is treated like `true`.
		if auto_update == null or auto_update then update_index
	end

	# Update the index
	#
	# Recompute the `inverse_doc_frequency` values, then the `tfidf` vector of
	# every indexed document.
	# Must be called manually after indexing new documents with the option
	# `auto_update = false`.
	fun update_index do
		# The inverse document frequency of a term only depends on index-wide
		# counts, so it is computed once per distinct term instead of once per
		# (document, term) pair.
		var doc_count = documents.length.to_f
		for term, count in terms_doc_count do
			inverse_doc_frequency[term] = (doc_count / count).log
		end

		# Refresh each document's weights from its term frequencies.
		for doc in documents do
			for term, freq in doc.terms_frequency do
				doc.tfidf[term] = freq * inverse_doc_frequency[term]
			end
		end
	end
end
lib/vsm/vsm.nit:120,1--213,3