A StringIndex using a NLPProcessor to parse and vectorize strings

Introduced properties

fun accept_token(token: NLPToken): Bool

nlp :: NLPIndex :: accept_token

Is token accepted by this index?

fun blacklist_pos: Array[String]

nlp :: NLPIndex :: blacklist_pos

Part-Of-Speech blacklist

fun blacklist_pos=(blacklist_pos: Array[String])

nlp :: NLPIndex :: blacklist_pos=

Part-Of-Speech blacklist

init defaultinit(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: defaultinit

fun nlp_processor: NLPProcessor

nlp :: NLPIndex :: nlp_processor

NLP Processor used to tokenize, lemmatize and POS tag documents

protected fun nlp_processor=(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: nlp_processor=

NLP Processor used to tokenize, lemmatize and POS tag documents

fun stoplist: Array[String]

nlp :: NLPIndex :: stoplist

List of lemmas that must not be indexed

fun stoplist=(stoplist: Array[String])

nlp :: NLPIndex :: stoplist=

List of lemmas that must not be indexed

fun whitelist_pos: Array[String]

nlp :: NLPIndex :: whitelist_pos

Part-Of-Speech whitelist

fun whitelist_pos=(whitelist_pos: Array[String])

nlp :: NLPIndex :: whitelist_pos=

Part-Of-Speech whitelist

Redefined properties

redef type SELF: NLPIndex

nlp $ NLPIndex :: SELF

Type of this instance, automatically specialized in every class

redef fun parse_string(string: String): Vector

nlp $ NLPIndex :: parse_string

Parse the string as a Vector

All properties

fun !=(other: nullable Object): Bool

core :: Object :: !=

Have self and other different values?

fun ==(other: nullable Object): Bool

core :: Object :: ==

Have self and other the same value?

type CLASS: Class[SELF]

core :: Object :: CLASS

The type of the class of self.

type DOC: Document

vsm :: VSMIndex :: DOC

Kind of documents stored in this index

type SELF: Object

core :: Object :: SELF

Type of this instance, automatically specialized in every class

fun accept_token(token: NLPToken): Bool

nlp :: NLPIndex :: accept_token

Is token accepted by this index?

fun blacklist_pos: Array[String]

nlp :: NLPIndex :: blacklist_pos

Part-Of-Speech blacklist

fun blacklist_pos=(blacklist_pos: Array[String])

nlp :: NLPIndex :: blacklist_pos=

Part-Of-Speech blacklist

protected fun class_factory(name: String): CLASS

core :: Object :: class_factory

Implementation used by get_class to create the specific class.

fun class_name: String

core :: Object :: class_name

The class name of the object.

init defaultinit(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: defaultinit

init defaultinit

core :: Object :: defaultinit

init defaultinit

vsm :: VSMIndex :: defaultinit

init defaultinit

vsm :: StringIndex :: defaultinit

fun documents: HashSet[DOC]

vsm :: VSMIndex :: documents

Documents index

protected fun documents=(documents: HashSet[DOC])

vsm :: VSMIndex :: documents=

Documents index

fun get_class: CLASS

core :: Object :: get_class

The meta-object representing the dynamic type of self.

fun hash: Int

core :: Object :: hash

The hash code of the object.

fun index_document(doc: DOC, auto_update: nullable Bool)

vsm :: VSMIndex :: index_document

Index a document

fun index_string(title: String, uri: String, string: String, auto_update: nullable Bool): DOC

vsm :: StringIndex :: index_string

Index a new Document from `title`, `uri` and `string`.

init init

core :: Object :: init

fun inspect: String

core :: Object :: inspect

Developer readable representation of self.

protected fun inspect_head: String

core :: Object :: inspect_head

Return "CLASSNAME:#OBJECTID".

fun inverse_doc_frequency: Vector

vsm :: VSMIndex :: inverse_doc_frequency

Inverse document frequency

protected fun inverse_doc_frequency=(inverse_doc_frequency: Vector)

vsm :: VSMIndex :: inverse_doc_frequency=

Inverse document frequency

fun inversed_index: HashMap[nullable Object, Array[DOC]]

vsm :: VSMIndex :: inversed_index

Inversed index

protected fun inversed_index=(inversed_index: HashMap[nullable Object, Array[DOC]])

vsm :: VSMIndex :: inversed_index=

Inversed index

intern fun is_same_instance(other: nullable Object): Bool

core :: Object :: is_same_instance

Return true if self and other are the same instance (i.e. same identity).

fun is_same_serialized(other: nullable Object): Bool

core :: Object :: is_same_serialized

Is self the same as other in a serialization context?

intern fun is_same_type(other: Object): Bool

core :: Object :: is_same_type

Return true if self and other have the same dynamic type.

fun match_string(query: String): Array[IndexMatch[DOC]]

vsm :: StringIndex :: match_string

Match the query string against all indexed documents

fun match_vector(query: Vector): Array[IndexMatch[DOC]]

vsm :: VSMIndex :: match_vector

Match query vector to all index document vectors

fun nlp_processor: NLPProcessor

nlp :: NLPIndex :: nlp_processor

NLP Processor used to tokenize, lemmatize and POS tag documents

protected fun nlp_processor=(nlp_processor: NLPProcessor)

nlp :: NLPIndex :: nlp_processor=

NLP Processor used to tokenize, lemmatize and POS tag documents

intern fun object_id: Int

core :: Object :: object_id

An internal hash code for the object based on its identity.

fun output

core :: Object :: output

Display self on stdout (debug only).

intern fun output_class_name

core :: Object :: output_class_name

Display class name on stdout (debug only).

fun parse_string(string: String): Vector

vsm :: StringIndex :: parse_string

Parse the string as a Vector

fun serialization_hash: Int

core :: Object :: serialization_hash

Hash value used for serialization

fun sorter: IndexMatchSorter

vsm :: VSMIndex :: sorter

Used to sort matches

protected fun sorter=(sorter: IndexMatchSorter)

vsm :: VSMIndex :: sorter=

Used to sort matches

fun stoplist: Array[String]

nlp :: NLPIndex :: stoplist

List of lemmas that must not be indexed

fun stoplist=(stoplist: Array[String])

nlp :: NLPIndex :: stoplist=

List of lemmas that must not be indexed

intern fun sys: Sys

core :: Object :: sys

Return the global sys object, the only instance of the Sys class.

fun terms_doc_count: Vector

vsm :: VSMIndex :: terms_doc_count

Count for all terms in all indexed documents

protected fun terms_doc_count=(terms_doc_count: Vector)

vsm :: VSMIndex :: terms_doc_count=

Count for all terms in all indexed documents

abstract fun to_jvalue(env: JniEnv): JValue

core :: Object :: to_jvalue

fun to_s: String

core :: Object :: to_s

User readable representation of self.

fun update_index

vsm :: VSMIndex :: update_index

Update the index

fun whitelist_pos: Array[String]

nlp :: NLPIndex :: whitelist_pos

Part-Of-Speech whitelist

fun whitelist_pos=(whitelist_pos: Array[String])

nlp :: NLPIndex :: whitelist_pos=

Part-Of-Speech whitelist

Ancestors

interface Object

The root of the class hierarchy.

class VSMIndex

vsm :: VSMIndex

A Document index based on VSM

Parents

class StringIndex

vsm :: StringIndex

A VSM index to store strings

Children

class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex that uses an NLPProcessor

Class definitions

nlp $ NLPIndex

# A `StringIndex` that relies on an `NLPProcessor` to turn strings into vectors
#
# Strings are processed by `nlp_processor`, and the lemma of every token
# accepted by `accept_token` is counted in the resulting `Vector`.
class NLPIndex
	super StringIndex

	# Processor applied to each string handed to `parse_string`
	#
	# Used to tokenize, lemmatize and POS tag documents.
	var nlp_processor: NLPProcessor

	# Build the `Vector` of lemma counts for `string`
	#
	# The vector is empty when `string` is empty once trimmed.
	# Otherwise, each token accepted by `accept_token` adds `1.0` to the
	# entry keyed by its lemma.
	redef fun parse_string(string) do
		var counts = new Vector
		if not string.trim.is_empty then
			var parsed = nlp_processor.process(string)
			for sentence in parsed.sentences do
				for token in sentence.tokens do
					if accept_token(token) then
						var key = token.lemma
						if counts.has_key(key) then
							# Lemma already seen: bump its count
							counts[key] += 1.0
						else
							# First occurrence of this lemma
							counts[key] = 1.0
						end
					end
				end
			end
		end
		return counts
	end

	# Should `token` be indexed?
	#
	# A token is rejected when:
	#
	# * `whitelist_pos` is not empty and does not contain the token POS tag,
	# * `blacklist_pos` contains the token POS tag,
	# * `stoplist` contains the token lemma.
	#
	# Any other token is accepted.
	fun accept_token(token: NLPToken): Bool do
		var tag = token.pos
		if whitelist_pos.not_empty then
			# The whitelist only filters once it has at least one entry
			if not whitelist_pos.has(tag) then return false
		end
		if blacklist_pos.has(tag) then return false
		return not stoplist.has(token.lemma)
	end

	# Part-Of-Speech whitelist
	#
	# When not empty, the index accepts only the POS tags contained in this list.
	var whitelist_pos = new Array[String] is writable

	# Part-Of-Speech blacklist
	#
	# Tokens whose POS tag is contained in this list are rejected.
	var blacklist_pos = new Array[String] is writable

	# Lemmas that must never be indexed
	var stoplist = new Array[String] is writable
end

lib/nlp/nlp.nit:23,1--71,3