A StringIndex using a NLPProcessor to parse and vectorize strings

Introduced properties

fun accept_token(token: NLPToken): Bool

nlp :: NLPIndex :: accept_token

Is token accepted by this index?
fun blacklist_pos: Array[String]

nlp :: NLPIndex :: blacklist_pos

Part-Of-Speech blacklist
fun blacklist_pos=(blacklist_pos: Array[String])

nlp :: NLPIndex :: blacklist_pos=

Part-Of-Speech blacklist
init defaultinit(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: defaultinit

fun nlp_processor: NLPProcessor

nlp :: NLPIndex :: nlp_processor

NLP Processor used to tokenize, lemmatize and POS tag documents
protected fun nlp_processor=(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: nlp_processor=

NLP Processor used to tokenize, lemmatize and POS tag documents
fun stoplist: Array[String]

nlp :: NLPIndex :: stoplist

List of lemmas that must not be indexed
fun stoplist=(stoplist: Array[String])

nlp :: NLPIndex :: stoplist=

List of lemmas that must not be indexed
fun whitelist_pos: Array[String]

nlp :: NLPIndex :: whitelist_pos

Part-Of-Speech whitelist
fun whitelist_pos=(whitelist_pos: Array[String])

nlp :: NLPIndex :: whitelist_pos=

Part-Of-Speech whitelist

Redefined properties

redef type SELF: NLPIndex

nlp $ NLPIndex :: SELF

Type of this instance, automatically specialized in every class
redef fun parse_string(string: String): Vector

nlp $ NLPIndex :: parse_string

Parse the string as a Vector

All properties

fun !=(other: nullable Object): Bool

core :: Object :: !=

Have self and other different values?
fun ==(other: nullable Object): Bool

core :: Object :: ==

Have self and other the same value?
type CLASS: Class[SELF]

core :: Object :: CLASS

The type of the class of self.
type DOC: Document

vsm :: VSMIndex :: DOC

Kind of documents stored in this index
type SELF: Object

core :: Object :: SELF

Type of this instance, automatically specialized in every class
fun accept_token(token: NLPToken): Bool

nlp :: NLPIndex :: accept_token

Is token accepted by this index?
fun blacklist_pos: Array[String]

nlp :: NLPIndex :: blacklist_pos

Part-Of-Speech blacklist
fun blacklist_pos=(blacklist_pos: Array[String])

nlp :: NLPIndex :: blacklist_pos=

Part-Of-Speech blacklist
protected fun class_factory(name: String): CLASS

core :: Object :: class_factory

Implementation used by get_class to create the specific class.
fun class_name: String

core :: Object :: class_name

The class name of the object.
init defaultinit(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: defaultinit

fun documents: HashSet[DOC]

vsm :: VSMIndex :: documents

Documents index
protected fun documents=(documents: HashSet[DOC])

vsm :: VSMIndex :: documents=

Documents index
fun get_class: CLASS

core :: Object :: get_class

The meta-object representing the dynamic type of self.
fun hash: Int

core :: Object :: hash

The hash code of the object.
fun index_document(doc: DOC, auto_update: nullable Bool)

vsm :: VSMIndex :: index_document

Index a document
fun index_string(title: String, uri: String, string: String, auto_update: nullable Bool): DOC

vsm :: StringIndex :: index_string

Index a new Document from `title`, `uri` and the string `string`.
init init

core :: Object :: init

fun inspect: String

core :: Object :: inspect

Developer readable representation of self.
protected fun inspect_head: String

core :: Object :: inspect_head

Return "CLASSNAME:#OBJECTID".
fun inverse_doc_frequency: Vector

vsm :: VSMIndex :: inverse_doc_frequency

Inverse document frequency
protected fun inverse_doc_frequency=(inverse_doc_frequency: Vector)

vsm :: VSMIndex :: inverse_doc_frequency=

Inverse document frequency
fun inversed_index: HashMap[nullable Object, Array[DOC]]

vsm :: VSMIndex :: inversed_index

Inversed index
protected fun inversed_index=(inversed_index: HashMap[nullable Object, Array[DOC]])

vsm :: VSMIndex :: inversed_index=

Inversed index
intern fun is_same_instance(other: nullable Object): Bool

core :: Object :: is_same_instance

Return true if self and other are the same instance (i.e. same identity).
fun is_same_serialized(other: nullable Object): Bool

core :: Object :: is_same_serialized

Is self the same as other in a serialization context?
intern fun is_same_type(other: Object): Bool

core :: Object :: is_same_type

Return true if self and other have the same dynamic type.
fun match_string(query: String): Array[IndexMatch[DOC]]

vsm :: StringIndex :: match_string

Match the query string against all indexed documents
fun match_vector(query: Vector): Array[IndexMatch[DOC]]

vsm :: VSMIndex :: match_vector

Match query vector to all index document vectors
fun nlp_processor: NLPProcessor

nlp :: NLPIndex :: nlp_processor

NLP Processor used to tokenize, lemmatize and POS tag documents
protected fun nlp_processor=(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: nlp_processor=

NLP Processor used to tokenize, lemmatize and POS tag documents
intern fun object_id: Int

core :: Object :: object_id

An internal hash code for the object based on its identity.
fun output

core :: Object :: output

Display self on stdout (debug only).
intern fun output_class_name

core :: Object :: output_class_name

Display class name on stdout (debug only).
fun parse_string(string: String): Vector

vsm :: StringIndex :: parse_string

Parse the string as a Vector
fun serialization_hash: Int

core :: Object :: serialization_hash

Hash value used for serialization
fun sorter: IndexMatchSorter

vsm :: VSMIndex :: sorter

Used to sort matches
protected fun sorter=(sorter: IndexMatchSorter)

vsm :: VSMIndex :: sorter=

Used to sort matches
fun stoplist: Array[String]

nlp :: NLPIndex :: stoplist

List of lemmas that must not be indexed
fun stoplist=(stoplist: Array[String])

nlp :: NLPIndex :: stoplist=

List of lemmas that must not be indexed
intern fun sys: Sys

core :: Object :: sys

Return the global sys object, the only instance of the Sys class.
fun terms_doc_count: Vector

vsm :: VSMIndex :: terms_doc_count

Count for all terms in all indexed documents
protected fun terms_doc_count=(terms_doc_count: Vector)

vsm :: VSMIndex :: terms_doc_count=

Count for all terms in all indexed documents
abstract fun to_jvalue(env: JniEnv): JValue

core :: Object :: to_jvalue

fun to_s: String

core :: Object :: to_s

User readable representation of self.
fun update_index

vsm :: VSMIndex :: update_index

Update the index
fun whitelist_pos: Array[String]

nlp :: NLPIndex :: whitelist_pos

Part-Of-Speech whitelist
fun whitelist_pos=(whitelist_pos: Array[String])

nlp :: NLPIndex :: whitelist_pos=

Part-Of-Speech whitelist
package_diagram nlp::NLPIndex NLPIndex vsm::StringIndex StringIndex nlp::NLPIndex->vsm::StringIndex vsm::VSMIndex VSMIndex vsm::StringIndex->vsm::VSMIndex ...vsm::VSMIndex ... ...vsm::VSMIndex->vsm::VSMIndex nlp::NLPFileIndex NLPFileIndex nlp::NLPFileIndex->nlp::NLPIndex

Ancestors

interface Object

core :: Object

The root of the class hierarchy.
class VSMIndex

vsm :: VSMIndex

A Document index based on VSM

Parents

class StringIndex

vsm :: StringIndex

A VSM index to store strings

Children

class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex using a NLPProcessor

Class definitions

nlp $ NLPIndex
# A `StringIndex` that relies on a `NLPProcessor` to parse and vectorize strings
#
# Strings are turned into term vectors whose keys are the lemmas of the
# tokens accepted by `accept_token`.
class NLPIndex
	super StringIndex

	# NLP Processor used to tokenize, lemmatize and POS tag documents
	var nlp_processor: NLPProcessor

	# Part-Of-Speech whitelist
	#
	# When this list is not empty, the index accepts only the POS tags it contains.
	var whitelist_pos = new Array[String] is writable

	# Part-Of-Speech blacklist
	#
	# Tokens whose POS tag is in this list are rejected.
	var blacklist_pos = new Array[String] is writable

	# List of lemmas that must not be indexed
	var stoplist = new Array[String] is writable

	# Parse `string` as a Vector of lemma counts
	#
	# A string that is empty once trimmed gives an empty vector without
	# calling the `nlp_processor`.
	# Otherwise, each accepted token adds 1.0 to the entry of its lemma.
	redef fun parse_string(string) do
		var res = new Vector
		if string.trim.is_empty then return res
		var parsed = nlp_processor.process(string)
		for sentence in parsed.sentences do
			for token in sentence.tokens do
				# Skip the tokens filtered out by the POS lists and the stoplist
				if not accept_token(token) then continue
				var term = token.lemma
				if res.has_key(term) then
					res[term] += 1.0
				else
					# First occurrence of this lemma
					res[term] = 1.0
				end
			end
		end
		return res
	end

	# Is `token` accepted by this index?
	#
	# A token is rejected when one of the following holds:
	#
	# * `whitelist_pos` is not empty and does not contain the token POS tag
	# * `blacklist_pos` contains the token POS tag
	# * `stoplist` contains the token lemma
	fun accept_token(token: NLPToken): Bool do
		var tag = token.pos
		var allowed = whitelist_pos
		# An empty whitelist means "no restriction on POS tags"
		if not allowed.is_empty then
			if not allowed.has(tag) then return false
		end
		if blacklist_pos.has(tag) then return false
		return not stoplist.has(token.lemma)
	end
end
lib/nlp/nlp.nit:23,1--71,3