return cos
end
+ redef fun [](k) do
+ if not has_key(k) then return 0.0
+ return super
+ end
+
+ # Increment value for `obj` term
+ #
+ # If the term isn't already in the vector, the new value is 1.0.
+ fun inc(obj: nullable Object) do
+ if has_key(obj) then
+ self[obj] += 1.0
+ else
+ self[obj] = 1.0
+ end
+ end
+
# The norm of the vector.
#
# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
return "[{join(", ", ":")}]"
end
end
+
+# A Document index based on VSM
+#
+# Using VSMIndex you can index documents associated with their vector.
+# Documents can then be matched to query vectors.
+class VSMIndex
+
+ # Kind of documents stored in this index
+ #
+ # Clients can redefine this type to specialize the index.
+ type DOC: Document
+
+ # Documents index
+ var documents = new HashSet[DOC]
+
+ # Inversed index
+ #
+ # Link documents to existing terms.
+ var inversed_index = new HashMap[nullable Object, Array[DOC]]
+
+ # Count for all terms in all indexed documents
+ #
+ # Used to compute the `inverse_doc_frequency`.
+ var terms_doc_count = new Vector
+
+ # Inverse document frequency
+ #
+ # The inverse document frequency is a measure of how much information a term
+ # provides, that is, whether the term is common or rare across all documents.
+ var inverse_doc_frequency = new Vector
+
+ # Used to sort matches
+ #
+ # See `IndexMatch`.
+ var sorter = new IndexMatchSorter
+
+ # Match `query` vector to all index document vectors
+ #
+ # Returns an `IndexMatch` for each indexed document.
+ # Results are ordered by descending similarity.
+ fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+ var documents = new HashSet[DOC]
+ for term, count in query do
+ if inversed_index.has_key(term) then
+ documents.add_all inversed_index[term]
+ end
+ end
+ var matches = new Array[IndexMatch[DOC]]
+ for doc in documents do
+ var sim = query.cosine_similarity(doc.tfidf)
+ if sim == 0.0 then continue
+ matches.add new IndexMatch[DOC](doc, sim)
+ end
+ sorter.sort(matches)
+ return matches
+ end
+
+ # Index a document
+ #
+ # With each new document, the `inverse_doc_frequency` must be updated.
+ # By default, the method `update_index` is called after each call to
+ # `index_document`.
+ #
+ # When processing batch documents, use `auto_update = false` to disable
+ # the auto update of the index.
+ fun index_document(doc: DOC, auto_update: nullable Bool) do
+ for term, count in doc.terms_count do
+ terms_doc_count.inc(term)
+ if not inversed_index.has_key(term) then
+ inversed_index[term] = new Array[DOC]
+ end
+ inversed_index[term].add doc
+ end
+ documents.add doc
+ if auto_update == null or auto_update then update_index
+ end
+
+ # Update the index
+ #
+ # Recompute the `inverse_doc_frequency` values.
+ # Must be called manually after indexing new document with the option
+ # `auto_update = false`.
+ fun update_index do
+ for doc in documents do
+ for term, ccount in doc.terms_count do
+ inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
+ end
+ end
+ for doc in documents do
+ for term, freq in doc.terms_frequency do
+ doc.tfidf[term] = freq * inverse_doc_frequency[term]
+ end
+ end
+ end
+end
+
+# A VSM index to store strings
+class StringIndex
+ super VSMIndex
+
+ # Index a new Document from `title`, `uri` and string `string`.
+ #
+ # Return the Document created.
+ #
+ # See `index_document`.
+ fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
+ var vector = parse_string(string)
+ var doc = new Document(title, uri, vector)
+ index_document(doc, auto_update)
+ return doc
+ end
+
+ # Match the `query` string against all indexed documents
+ #
+ # See `match_vector`.
+ fun match_string(query: String): Array[IndexMatch[DOC]] do
+ var vector = parse_string(query)
+ var doc = new Document("", "", vector)
+ return match_vector(doc.terms_frequency)
+ end
+
+ # Parse the `string` as a Vector
+ #
+ # Returns a vector containing the terms of `string`.
+ fun parse_string(string: String): Vector do
+ var reader = new StringReader(string)
+ var vector = new Vector
+ loop
+ var token = reader.read_word
+ if token == "" then break
+ vector.inc(token)
+ end
+ return vector
+ end
+end
+
+# A VSMIndex to index files
+class FileIndex
+ super StringIndex
+
+ # Index a file from its `path`.
+ #
+ # Return the created document or null if `path` is not accepted by `accept_file`.
+ #
+ # See `index_document`.
+ fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
+ if not accept_file(path) then return null
+ var vector = parse_file(path)
+ var doc = new Document(path, path, vector)
+ index_document(doc, auto_update)
+ return doc
+ end
+
+ # Index multiple files
+ #
+ # The recursive method `index_dir` will be called for each directory found
+ # in `paths`.
+ #
+ # See `index_file`
+ fun index_files(paths: Collection[String], auto_update: nullable Bool) do
+ for path in paths do
+ if path.to_path.is_dir then
+ index_dir(path, false)
+ else
+ index_file(path, false)
+ end
+ end
+ if auto_update != null and auto_update then update_index
+ end
+
+ # Index all files in `dir` recursively
+ #
+ # See `index_file`.
+ fun index_dir(dir: String, auto_update: nullable Bool) do
+ if not dir.to_path.is_dir then return
+ for file in dir.files do
+ var path = dir / file
+ if path.to_path.is_dir then
+ index_dir(path, false)
+ else
+ index_file(path, false)
+ end
+ end
+ if auto_update != null and auto_update then update_index
+ end
+
+ # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
+ fun accept_file(path: String): Bool do
+ var ext = path.file_extension
+ if ext != null then
+ ext = ext.to_lower
+ if blacklist_exts.has(ext) then return false
+ if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
+ end
+ return whitelist_exts.is_empty
+ end
+
+ # Parse the `file` content as a Vector
+ #
+ # See `parse_string`.
+ fun parse_file(file: String): Vector do
+ return parse_string(file.to_path.read_all)
+ end
+
+ # File extensions white list
+ #
+ # If not empty, only files with these extensions will be indexed.
+ #
+ # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
+ # blacklist will prevail and the file will be ignored.
+ var whitelist_exts = new Array[String] is writable
+
+ # File extensions black list
+ #
+ # Files with these extensions will not be indexed.
+ var blacklist_exts = new Array[String] is writable
+end
+
+# A Document to add in a VSMIndex
+class Document
+
+ # Document title
+ var title: String
+
+ # Document URI
+ var uri: String
+
+ # Count of all terms found in the document
+ #
+ # Used to compute the document `terms_frequency`.
+ var terms_count: Vector
+
+ # Frequency of each term found in the document
+ #
+ # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
+ var terms_frequency: Vector is lazy do
+ var all_terms = 0.0
+ for t, c in terms_count do all_terms += c
+
+ var vector = new Vector
+ for t, c in terms_count do
+ vector[t] = c / all_terms
+ end
+ return vector
+ end
+
+ # Term frequency–Inverse document frequency for each term
+ #
+ # A high weight in tf–idf is reached by a high term frequency
+ # (in the given document) and a low document frequency of the term in the
+ # whole collection of documents
+ var tfidf: Vector = terms_count is lazy
+
+ redef fun to_s do return "{title}"
+end
+
+# A match to a `request` in an `Index`
+class IndexMatch[DOC: Document]
+ super Comparable
+
+ # Document matching the `request_vector`
+ var document: DOC
+
+ # Similarity between the `request` and the `doc`.
+ #
+ # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
+ # means perfect similarity.
+ var similarity: Float
+
+ redef fun to_s do return "{document} ({similarity})"
+end
+
+# Sort matches by similarity
+class IndexMatchSorter
+ super DefaultComparator
+
+ redef type COMPARED: IndexMatch[Document]
+
+ redef fun compare(a, b) do
+ return b.similarity <=> a.similarity
+ end
+end