return cos
end
+ redef fun [](k) do
+ if not has_key(k) then return 0.0
+ return super
+ end
+
+ # Increment value for `obj` term
+ #
+ # If the term isn't already in the vector, the new value is 1.0.
+ fun inc(obj: nullable Object) do
+ if has_key(obj) then
+ self[obj] += 1.0
+ else
+ self[obj] = 1.0
+ end
+ end
+
# The norm of the vector.
#
# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
# Documents can then be matched to query vectors.
class VSMIndex
+ # Kind of documents stored in this index
+ #
+ # Clients can redefine this type to specialize the index.
+ type DOC: Document
+
# Documents index
+ var documents = new HashSet[DOC]
+
+ # Inversed index
#
- # TODO use a more efficient representation.
- var documents = new HashSet[Document]
+ # Link documents to existing terms.
+ var inversed_index = new HashMap[nullable Object, Array[DOC]]
# Count for all terms in all indexed documents
#
#
# Returns an `IndexMatch` for each indexed document.
# Results are ordered by descending similarity.
- fun match_vector(query: Vector): Array[IndexMatch] do
- var matches = new Array[IndexMatch]
+ fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+ var documents = new HashSet[DOC]
+ for term, count in query do
+ if inversed_index.has_key(term) then
+ documents.add_all inversed_index[term]
+ end
+ end
+ var matches = new Array[IndexMatch[DOC]]
for doc in documents do
var sim = query.cosine_similarity(doc.tfidf)
if sim == 0.0 then continue
- matches.add new IndexMatch(doc, sim)
+ matches.add new IndexMatch[DOC](doc, sim)
end
sorter.sort(matches)
return matches
#
# When processing batch documents, use `auto_update = false` to disable
# the auto update of the index.
- fun index_document(doc: Document, auto_update: nullable Bool) do
+ fun index_document(doc: DOC, auto_update: nullable Bool) do
for term, count in doc.terms_count do
- if not terms_doc_count.has_key(term) then
- terms_doc_count[term] = 1.0
- else
- terms_doc_count[term] += 1.0
+ terms_doc_count.inc(term)
+ if not inversed_index.has_key(term) then
+ inversed_index[term] = new Array[DOC]
end
+ inversed_index[term].add doc
end
documents.add doc
if auto_update == null or auto_update then update_index
# Return the Document created.
#
# See `index_document`.
- fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+ fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
var vector = parse_string(string)
var doc = new Document(title, uri, vector)
index_document(doc, auto_update)
# Match the `query` string against all indexed documents
#
# See `match_vector`.
- fun match_string(query: String): Array[IndexMatch] do
+ fun match_string(query: String): Array[IndexMatch[DOC]] do
var vector = parse_string(query)
- return match_vector(vector)
+ var doc = new Document("", "", vector)
+ return match_vector(doc.terms_frequency)
end
# Parse the `string` as a Vector
loop
var token = reader.read_word
if token == "" then break
-
- if not vector.has_key(token) then
- vector[token] = 1.0
- else
- vector[token] += 1.0
- end
+ vector.inc(token)
end
return vector
end
# Return the created document or null if `path` is not accepted by `accept_file`.
#
# See `index_document`.
- fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+ fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
if not accept_file(path) then return null
var vector = parse_file(path)
var doc = new Document(path, path, vector)
# A high weight in tf–idf is reached by a high term frequency
# (in the given document) and a low document frequency of the term in the
# whole collection of documents
- var tfidf = new Vector
+ var tfidf: Vector = terms_count is lazy
redef fun to_s do return "{title}"
end
# A match to a `request` in an `Index`
-class IndexMatch
+class IndexMatch[DOC: Document]
super Comparable
# Document matching the `request_vector`
- var document: Document
+ var document: DOC
# Similarity between the `request` and the `doc`.
#
class IndexMatchSorter
super DefaultComparator
- redef type COMPARED: IndexMatch
+ redef type COMPARED: IndexMatch[Document]
redef fun compare(a, b) do
return b.similarity <=> a.similarity