A Document index based on VSM

Using VSMIndex you can index documents associated with their vector. Documents can then be matched to query vectors.

Introduced properties

type DOC: Document

vsm :: VSMIndex :: DOC

Kind of documents stored in this index
fun documents: HashSet[DOC]

vsm :: VSMIndex :: documents

Documents index
protected fun documents=(documents: HashSet[DOC])

vsm :: VSMIndex :: documents=

Documents index
fun index_document(doc: DOC, auto_update: nullable Bool)

vsm :: VSMIndex :: index_document

Index a document
fun inverse_doc_frequency: Vector

vsm :: VSMIndex :: inverse_doc_frequency

Inverse document frequency
protected fun inverse_doc_frequency=(inverse_doc_frequency: Vector)

vsm :: VSMIndex :: inverse_doc_frequency=

Inverse document frequency
fun inversed_index: HashMap[nullable Object, Array[DOC]]

vsm :: VSMIndex :: inversed_index

Inversed index
protected fun inversed_index=(inversed_index: HashMap[nullable Object, Array[DOC]])

vsm :: VSMIndex :: inversed_index=

Inversed index
fun match_vector(query: Vector): Array[IndexMatch[DOC]]

vsm :: VSMIndex :: match_vector

Match query vector to all index document vectors
fun sorter: IndexMatchSorter

vsm :: VSMIndex :: sorter

Used to sort matches
protected fun sorter=(sorter: IndexMatchSorter)

vsm :: VSMIndex :: sorter=

Used to sort matches
fun terms_doc_count: Vector

vsm :: VSMIndex :: terms_doc_count

Count for all terms in all indexed documents
protected fun terms_doc_count=(terms_doc_count: Vector)

vsm :: VSMIndex :: terms_doc_count=

Count for all terms in all indexed documents
fun update_index

vsm :: VSMIndex :: update_index

Update the index

Redefined properties

redef type SELF: VSMIndex

vsm $ VSMIndex :: SELF

Type of this instance, automatically specialized in every class

All properties

fun !=(other: nullable Object): Bool

core :: Object :: !=

Have self and other different values?
fun ==(other: nullable Object): Bool

core :: Object :: ==

Have self and other the same value?
type CLASS: Class[SELF]

core :: Object :: CLASS

The type of the class of self.
type DOC: Document

vsm :: VSMIndex :: DOC

Kind of documents stored in this index
type SELF: Object

core :: Object :: SELF

Type of this instance, automatically specialized in every class
protected fun class_factory(name: String): CLASS

core :: Object :: class_factory

Implementation used by get_class to create the specific class.
fun class_name: String

core :: Object :: class_name

The class name of the object.
fun documents: HashSet[DOC]

vsm :: VSMIndex :: documents

Documents index
protected fun documents=(documents: HashSet[DOC])

vsm :: VSMIndex :: documents=

Documents index
fun get_class: CLASS

core :: Object :: get_class

The meta-object representing the dynamic type of self.
fun hash: Int

core :: Object :: hash

The hash code of the object.
fun index_document(doc: DOC, auto_update: nullable Bool)

vsm :: VSMIndex :: index_document

Index a document
init init

core :: Object :: init

fun inspect: String

core :: Object :: inspect

Developer readable representation of self.
protected fun inspect_head: String

core :: Object :: inspect_head

Return "CLASSNAME:#OBJECTID".
fun inverse_doc_frequency: Vector

vsm :: VSMIndex :: inverse_doc_frequency

Inverse document frequency
protected fun inverse_doc_frequency=(inverse_doc_frequency: Vector)

vsm :: VSMIndex :: inverse_doc_frequency=

Inverse document frequency
fun inversed_index: HashMap[nullable Object, Array[DOC]]

vsm :: VSMIndex :: inversed_index

Inversed index
protected fun inversed_index=(inversed_index: HashMap[nullable Object, Array[DOC]])

vsm :: VSMIndex :: inversed_index=

Inversed index
intern fun is_same_instance(other: nullable Object): Bool

core :: Object :: is_same_instance

Return true if self and other are the same instance (i.e. same identity).
fun is_same_serialized(other: nullable Object): Bool

core :: Object :: is_same_serialized

Is self the same as other in a serialization context?
intern fun is_same_type(other: Object): Bool

core :: Object :: is_same_type

Return true if self and other have the same dynamic type.
fun match_vector(query: Vector): Array[IndexMatch[DOC]]

vsm :: VSMIndex :: match_vector

Match query vector to all index document vectors
intern fun object_id: Int

core :: Object :: object_id

An internal hash code for the object based on its identity.
fun output

core :: Object :: output

Display self on stdout (debug only).
intern fun output_class_name

core :: Object :: output_class_name

Display class name on stdout (debug only).
fun serialization_hash: Int

core :: Object :: serialization_hash

Hash value used for serialization
fun sorter: IndexMatchSorter

vsm :: VSMIndex :: sorter

Used to sort matches
protected fun sorter=(sorter: IndexMatchSorter)

vsm :: VSMIndex :: sorter=

Used to sort matches
intern fun sys: Sys

core :: Object :: sys

Return the global sys object, the only instance of the Sys class.
fun terms_doc_count: Vector

vsm :: VSMIndex :: terms_doc_count

Count for all terms in all indexed documents
protected fun terms_doc_count=(terms_doc_count: Vector)

vsm :: VSMIndex :: terms_doc_count=

Count for all terms in all indexed documents
abstract fun to_jvalue(env: JniEnv): JValue

core :: Object :: to_jvalue

fun to_s: String

core :: Object :: to_s

User readable representation of self.
fun update_index

vsm :: VSMIndex :: update_index

Update the index
package_diagram vsm::VSMIndex VSMIndex core::Object Object vsm::VSMIndex->core::Object vsm::StringIndex StringIndex vsm::StringIndex->vsm::VSMIndex nlp::NLPIndex NLPIndex nlp::NLPIndex->vsm::StringIndex vsm::FileIndex FileIndex vsm::FileIndex->vsm::StringIndex nlp::NLPIndex... ... nlp::NLPIndex...->nlp::NLPIndex vsm::FileIndex... ... vsm::FileIndex...->vsm::FileIndex

Parents

interface Object

core :: Object

The root of the class hierarchy.

Children

class StringIndex

vsm :: StringIndex

A VSM index to store strings

Descendants

class FileIndex

vsm :: FileIndex

A VSMIndex to index files
class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex using a NLPProcessor
class NLPIndex

nlp :: NLPIndex

A StringIndex using a NLPProcessor to parse and vectorize strings

Class definitions

vsm $ VSMIndex
# A Document index based on VSM
#
# Using VSMIndex you can index documents associated with their vector.
# Documents can then be matched to query vectors.
class VSMIndex

	# Kind of documents stored in this index
	#
	# Clients can redefine this type to specialize the index.
	type DOC: Document

	# Documents index
	#
	# Set of all the documents added through `index_document`.
	var documents = new HashSet[DOC]

	# Inversed index
	#
	# Link each known term to the documents that contain it.
	var inversed_index = new HashMap[nullable Object, Array[DOC]]

	# Count for all terms in all indexed documents
	#
	# For each term, the number of times a document containing it was indexed.
	# Used to compute the `inverse_doc_frequency`.
	var terms_doc_count = new Vector

	# Inverse document frequency
	#
	# The inverse document frequency is a measure of how much information a term
	# provides, that is, whether the term is common or rare across all documents.
	#
	# Recomputed by `update_index`.
	var inverse_doc_frequency = new Vector

	# Used to sort matches
	#
	# See `IndexMatch`.
	var sorter = new IndexMatchSorter

	# Match `query` vector to the indexed document vectors
	#
	# Only the documents sharing at least one term with `query` are compared.
	# Returns an `IndexMatch` for each of them whose cosine similarity with
	# `query` is not `0.0`.
	# Results are ordered by `sorter`.
	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
		# Use the inversed index to select the candidates, the set removes the
		# documents reached through more than one term.
		var candidates = new HashSet[DOC]
		for term in query.keys do
			var postings = inversed_index.get_or_null(term)
			if postings != null then candidates.add_all postings
		end

		var matches = new Array[IndexMatch[DOC]]
		for doc in candidates do
			var sim = query.cosine_similarity(doc.tfidf)
			# Documents with a null similarity are not reported
			if sim == 0.0 then continue
			matches.add new IndexMatch[DOC](doc, sim)
		end
		sorter.sort(matches)
		return matches
	end

	# Index a document
	#
	# With each new document, the `inverse_doc_frequency` must be updated.
	# By default, the method `update_index` is called after each call to
	# `index_document` (a `null` `auto_update` is treated as `true`).
	#
	# When processing batch documents, use `auto_update = false` to disable
	# the auto update of the index.
	#
	# NOTE(review): indexing the same `doc` twice increments `terms_doc_count`
	# and appends to `inversed_index` a second time while `documents` keeps a
	# single entry. Callers should avoid re-indexing a document.
	fun index_document(doc: DOC, auto_update: nullable Bool) do
		for term in doc.terms_count.keys do
			terms_doc_count.inc(term)
			# Create the postings list on the first occurrence of `term`
			var postings = inversed_index.get_or_null(term)
			if postings == null then
				postings = new Array[DOC]
				inversed_index[term] = postings
			end
			postings.add doc
		end
		documents.add doc
		if auto_update == null or auto_update then update_index
	end

	# Update the index
	#
	# Recompute the `inverse_doc_frequency` values then the `tfidf` of each
	# indexed document.
	# Must be called manually after indexing new documents with the option
	# `auto_update = false`.
	fun update_index do
		# The number of documents does not change during the update
		var doc_count = documents.length.to_f

		# idf(term) = log(number of documents / terms_doc_count[term])
		for doc in documents do
			for term in doc.terms_count.keys do
				inverse_doc_frequency[term] = (doc_count / terms_doc_count[term]).log
			end
		end

		# tfidf(term, doc) = frequency of term in doc * idf(term)
		for doc in documents do
			for term, freq in doc.terms_frequency do
				doc.tfidf[term] = freq * inverse_doc_frequency[term]
			end
		end
	end
end
lib/vsm/vsm.nit:120,1--213,3