Vector Space Model

Vector Space Model (VSM) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers, such as, for example, index terms.

It is used in information filtering, information retrieval, indexing and relevancy rankings.

Introduced classes

class Document

vsm :: Document

A Document to add in a VSMIndex

class FileIndex

vsm :: FileIndex

A VSMIndex to index files

class IndexMatch[DOC: Document]

vsm :: IndexMatch

A match to a request in an Index

class IndexMatchSorter

vsm :: IndexMatchSorter

Sort matches by similarity

class StringIndex

vsm :: StringIndex

A VSM index to store strings

class VSMIndex

vsm :: VSMIndex

A Document index based on VSM

class Vector

vsm :: Vector

A n-dimensions vector

All class definitions

class Document

vsm $ Document

A Document to add in a VSMIndex

class FileIndex

vsm $ FileIndex

A VSMIndex to index files

class IndexMatch[DOC: Document]

vsm $ IndexMatch

A match to a request in an Index

class IndexMatchSorter

vsm $ IndexMatchSorter

Sort matches by similarity

class StringIndex

vsm $ StringIndex

A VSM index to store strings

class VSMIndex

vsm $ VSMIndex

A Document index based on VSM

class Vector

vsm $ Vector

A n-dimensions vector

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.

module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters

module array

core :: array

This module introduces the standard array structure.

module bitset

core :: bitset

Services to handle BitSet

module bytes

core :: bytes

Services for byte streams and arrays

module circular_array

core :: circular_array

Efficient data structure to access both end of the sequence.

module codec_base

core :: codec_base

Base for codecs to use with streams

module codecs

core :: codecs

Group module for all codec-related manipulations

module collection

core :: collection

This module define several collection classes.

module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.

module environ

core :: environ

Access to the environment variables of the process

module error

core :: error

Standard error-management infrastructure.

module exec

core :: exec

Invocation and management of operating system sub-processes.

module file

core :: file

File manipulations (create, read, write, etc.)

module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision

module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints

module flat

core :: flat

All the array-based text representations

module gc

core :: gc

Access to the Nit internal garbage collection mechanism

module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.

module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O

module kernel

core :: kernel

Most basic classes and methods.

module list

core :: list

This module handle double linked lists

module math

core :: math

Mathematical operations

module meta

meta :: meta

Simple user-defined meta-level to manipulate types of instances as object.

module native

core :: native

Native structures for text and bytes

module numeric

core :: numeric

Advanced services for Numeric types

module poset

poset :: poset

Pre order sets and partial order set (ie hierarchies)

module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers

module range

core :: range

Module for range of discrete objects.

module re

core :: re

Regular expression support for all services based on Pattern

module ropes

core :: ropes

Tree-based representation of a String.

module serialization_core

serialization :: serialization_core

Abstract services to serialize Nit objects to different formats

module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.

module stream

core :: stream

Input and output streams of characters

module string_search

core :: string_search

Basic string search, match and replace.

module text

core :: text

All the classes and methods related to the manipulation of text entities

module time

core :: time

Management of time and dates

module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure

module utf8

core :: utf8

Codec for UTF-8 I/O

Parents

module counter

counter :: counter

Simple numerical statistical analysis and presentation

Children

module example_vsm

vsm :: example_vsm

Example using a FileIndex

module nlp

nlp :: nlp

Natural Language Processor based on the StanfordNLP core.

Descendants

module a_star-m

a_star-m

module nlp_index

nlp :: nlp_index

Example showing how to use a NLPFileIndex.

module nlp_server

nlp :: nlp_server

# Vector Space Model
#
# Vector Space Model (VSM) is an algebraic model for representing text documents
# (and any objects, in general) as vectors of identifiers, such as, for example,
# index terms.
#
# It is used in information filtering, information retrieval, indexing and
# relevancy rankings.
module vsm

import counter

# A n-dimensions vector
#
# *n-dimensions* vectors are used to represent a text document or an object.
class Vector
	super HashMap[nullable Object, Float]

	# Cosine similarity of `self` and `other`.
	#
	# Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
	# two vectors are orthogonal and 1.0 means that they are identical.
	#
	# ~~~
	# var v1 = new Vector
	# v1["x"] = 1.0
	# v1["y"] = 2.0
	# v1["z"] = 3.0
	#
	# var v2 = new Vector
	# v2["x"] = 1.0
	# v2["y"] = 2.0
	# v2["z"] = 3.0
	#
	# var v3 = new Vector
	# v3["a"] = 1.0
	# v3["b"] = 2.0
	# v3["c"] = 3.0
	#
	# print v1.cosine_similarity(v2)
	# assert v1.cosine_similarity(v2) == 1.0
	# print v1.cosine_similarity(v3)
	# assert v1.cosine_similarity(v3) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Collect terms
		var terms = new HashSet[nullable Object]
		for k in self.keys do terms.add k
		for k in other.keys do terms.add k

		# Get dot product of two vectors
		var dot = 0.0
		for term in terms do
			dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
		end
		var cos = dot.to_f / (self.norm * other.norm)
		if cos.is_nan then return 0.0
		return cos
	end

	redef fun [](k) do
		if not has_key(k) then return 0.0
		return super
	end

	# Increment value for `obj` term
	#
	# If the term isn't already in the vector, the new value is 1.0.
	fun inc(obj: nullable Object) do
		if has_key(obj) then
			self[obj] += 1.0
		else
			self[obj] = 1.0
		end
	end

	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# ~~~
	# var v = new Vector
	# v["x"] = 1.0
	# v["y"] = 1.0
	# v["z"] = 1.0
	# v["t"] = 1.0
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1.0
	# v["y"] = 2.0
	# v["z"] = 3.0
	# v["t"] = 0.0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	fun norm: Float do
		var sum = 0.0
		for v in self.values do sum += v.pow(2.0)
		return sum.to_f.sqrt
	end

	redef fun to_s do
		return "[{join(", ", ":")}]"
	end
end

# A Document index based on VSM
#
# Using VSMIndex you can index documents associated with their vector.
# Documents can then be matched to query vectors.
class VSMIndex

	# Kind of documents stored in this index
	#
	# Clients can redefine this type to specialize the index.
	type DOC: Document

	# Documents index
	var documents = new HashSet[DOC]

	# Inversed index
	#
	# Link documents to existing terms.
	var inversed_index = new HashMap[nullable Object, Array[DOC]]

	# Count for all terms in all indexed documents
	#
	# Used to compute the `inverse_doc_frequency`.
	var terms_doc_count = new Vector

	# Inverse document frequency
	#
	# The inverse document frequency is a measure of how much information a term
	# provides, that is, whether the term is common or rare across all documents.
	var inverse_doc_frequency = new Vector

	# Used to sort matches
	#
	# See `IndexMatch`.
	var sorter = new IndexMatchSorter

	# Match `query` vector to all index document vectors
	#
	# Returns an `IndexMatch` for each indexed document.
	# Results are ordered by descending similarity.
	fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
		var documents = new HashSet[DOC]
		for term, count in query do
			if inversed_index.has_key(term) then
				documents.add_all inversed_index[term]
			end
		end
		var matches = new Array[IndexMatch[DOC]]
		for doc in documents do
			var sim = query.cosine_similarity(doc.tfidf)
			if sim == 0.0 then continue
			matches.add new IndexMatch[DOC](doc, sim)
		end
		sorter.sort(matches)
		return matches
	end

	# Index a document
	#
	# With each new document, the `inverse_doc_frequency` must be updated.
	# By default, the method `update_index` is called after each call to
	# `index_document`.
	#
	# When processing batch documents, use `auto_update = false` to disable
	# the auto update of the index.
	fun index_document(doc: DOC, auto_update: nullable Bool) do
		for term, count in doc.terms_count do
			terms_doc_count.inc(term)
			if not inversed_index.has_key(term) then
				inversed_index[term] = new Array[DOC]
			end
			inversed_index[term].add doc
		end
		documents.add doc
		if auto_update == null or auto_update then update_index
	end

	# Update the index
	#
	# Recompute the `inverse_doc_frequency` values.
	# Must be called manually after indexing new document with the option
	# `auto_update = false`.
	fun update_index do
		for doc in documents do
			for term, ccount in doc.terms_count do
				inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
			end
		end
		for doc in documents do
			for term, freq in doc.terms_frequency do
				doc.tfidf[term] = freq * inverse_doc_frequency[term]
			end
		end
	end
end

# A VSM index to store strings
class StringIndex
	super VSMIndex

	# Index a new Document from `title`, `uri` and string `string`.
	#
	# Return the Document created.
	#
	# See `index_document`.
	fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
		var vector = parse_string(string)
		var doc = new Document(title, uri, vector)
		index_document(doc, auto_update)
		return doc
	end

	# Match the `query` string against all indexed documents
	#
	# See `match_vector`.
	fun match_string(query: String): Array[IndexMatch[DOC]] do
		var vector = parse_string(query)
		var doc = new Document("", "", vector)
		return match_vector(doc.terms_frequency)
	end

	# Parse the `string` as a Vector
	#
	# Returns a vector containing the terms of `string`.
	fun parse_string(string: String): Vector do
		var reader = new StringReader(string)
		var vector = new Vector
		loop
			var token = reader.read_word
			if token == "" then break
			vector.inc(token)
		end
		return vector
	end
end

# A VSMIndex to index files
class FileIndex
	super StringIndex

	# Index a file from its `path`.
	#
	# Return the created document or null if `path` is not accepted by `accept_file`.
	#
	# See `index_document`.
	fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
		if not accept_file(path) then return null
		var vector = parse_file(path)
		var doc = new Document(path, path, vector)
		index_document(doc, auto_update)
		return doc
	end

	# Index multiple files
	#
	# The recursive method `index_dir` will be called for each directory found
	# in `paths`.
	#
	# See `index_file`
	fun index_files(paths: Collection[String], auto_update: nullable Bool) do
		for path in paths do
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end

	# Index all files in `dir` recursively
	#
	# See `index_file`.
	fun index_dir(dir: String, auto_update: nullable Bool) do
		if not dir.to_path.is_dir then return
		for file in dir.files do
			var path = dir / file
			if path.to_path.is_dir then
				index_dir(path, false)
			else
				index_file(path, false)
			end
		end
		if auto_update != null and auto_update then update_index
	end

	# Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
	fun accept_file(path: String): Bool do
		var ext = path.file_extension
		if ext != null then
			ext = ext.to_lower
			if blacklist_exts.has(ext) then return false
			if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
		end
		return whitelist_exts.is_empty
	end

	# Parse the `file` content as a Vector
	#
	# See `parse_string`.
	fun parse_file(file: String): Vector do
		return parse_string(file.to_path.read_all)
	end

	# File extensions white list
	#
	# If not empty, only files with these extensions will be indexed.
	#
	# If an extension is in both `whitelist_exts` and `blacklist_exts`, the
	# blacklist will prevail and the file will be ignored.
	var whitelist_exts = new Array[String] is writable

	# File extensions black list
	#
	# Files with these extensions will not be indexed.
	var blacklist_exts = new Array[String] is writable
end

# A Document to add in a VSMIndex
class Document

	# Document title
	var title: String

	# Document URI
	var uri: String

	# Count of all terms found in the document
	#
	# Used to compute the document `terms_frequency`.
	var terms_count: Vector

	# Frequency of each term found in the document
	#
	# Used to match the document against the `VSMIndex::inverse_doc_frequency`.
	var terms_frequency: Vector is lazy do
		var all_terms = 0.0
		for t, c in terms_count do all_terms += c

		var vector = new Vector
		for t, c in terms_count do
			vector[t] = c / all_terms
		end
		return vector
	end

	# Term frequency–Inverse document frequency for each term
	#
	# A high weight in tf–idf is reached by a high term frequency
	# (in the given document) and a low document frequency of the term in the
	# whole collection of documents
	var tfidf: Vector = terms_count is lazy

	redef fun to_s do return "{title}"
end

# A match to a `request` in an `Index`
class IndexMatch[DOC: Document]
	super Comparable

	# Document matching the `request_vector`
	var document: DOC

	# Similarity between the `request` and the `doc`.
	#
	# Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
	# means perfect similarity.
	var similarity: Float

	redef fun to_s do return "{document} ({similarity})"
end

# Sort matches by similarity
class IndexMatchSorter
	super DefaultComparator

	redef type COMPARED: IndexMatch[Document]

	redef fun compare(a, b) do
		return b.similarity <=> a.similarity
	end
end

lib/vsm/vsm.nit:15,1--400,3

module vsm

Summary

Vector Space Model

Introduced classes

Document

FileIndex

IndexMatch

IndexMatchSorter

StringIndex

VSMIndex

Vector

Vector Space Model

Introduced classes

class Document

class FileIndex

class IndexMatch[DOC: Document]

class IndexMatchSorter

class StringIndex

class VSMIndex

class Vector

Summary

All class definitions

Document

FileIndex

IndexMatch

IndexMatchSorter

StringIndex

VSMIndex

Vector

All class definitions

class Document

class FileIndex

class IndexMatch[DOC: Document]

class IndexMatchSorter

class StringIndex

class VSMIndex

class Vector

Summary

Ancestors

abstract_collection

abstract_text

array

bitset

bytes

circular_array

codec_base

codecs

collection

core

environ

error

exec

file

fixed_ints

fixed_ints_text

flat

gc

hash_collection

iso8859_1

kernel

list

math

meta

native

numeric

poset

protocol

queue

range

re

ropes

serialization_core

sorter

stream

string_search

text

time

union_find

utf8

Parents