Natural Language Processor based on the StanfordNLP core.


Introduced classes

class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex based on a NLPProcessor
class NLPIndex

nlp :: NLPIndex

A StringIndex using a NLPProcessor to parse and vectorize strings

All class definitions

class NLPFileIndex

nlp $ NLPFileIndex

A FileIndex based on a NLPProcessor
class NLPIndex

nlp $ NLPIndex

A StringIndex using a NLPProcessor to parse and vectorize strings
package_diagram nlp::nlp nlp nlp::stanford stanford nlp::nlp->nlp::stanford vsm vsm nlp::nlp->vsm opts opts nlp::stanford->opts dom dom nlp::stanford->dom curl curl nlp::stanford->curl pthreads pthreads nlp::stanford->pthreads counter counter vsm->counter config config vsm->config ...opts ... ...opts->opts ...dom ... ...dom->dom ...curl ... ...curl->curl ...pthreads ... ...pthreads->pthreads ...counter ... ...counter->counter ...config ... ...config->config nlp::nlp_index nlp_index nlp::nlp_index->nlp::nlp nlp::nlp_server nlp_server nlp::nlp_server->nlp::nlp a_star-m a_star-m a_star-m->nlp::nlp_index a_star-m->nlp::nlp_server a_star-m... ... a_star-m...->a_star-m


module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module caching

serialization :: caching

Services for caching serialization engines
module circular_array

core :: circular_array

Efficient data structure to access both ends of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module defines several collection classes.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module counter

counter :: counter

Simple numerical statistical analysis and presentation
module curl

curl :: curl

Data transfer powered by the native curl library
module dom

dom :: dom

Easy XML DOM parser
module engine_tools

serialization :: engine_tools

Advanced services for serialization engines
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module inspect

serialization :: inspect

Refine Serializable::inspect to show more useful information
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handles doubly linked lists
module math

core :: math

Mathematical operations
module meta

meta :: meta

Simple user-defined meta-level to manipulate types of instances as object.
module native

core :: native

Native structures for text and bytes
module native_curl

curl :: native_curl

Binding of C libCurl which allows us to interact with the network.
module numeric

core :: numeric

Advanced services for Numeric types
module opts

opts :: opts

Management of options on the command line
module parser

dom :: parser

XML DOM-parsing facilities
module parser_base

parser_base :: parser_base

Simple base for hand-made parsers of all kinds
module poset

poset :: poset

Preorder sets and partial order sets (i.e. hierarchies)
module protocol

core :: protocol

module pthreads

pthreads :: pthreads

Main POSIX threads support and intro the classes Thread, Mutex and Barrier
module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module serialization

serialization :: serialization

General serialization services
module serialization_core

serialization :: serialization_core

Abstract services to serialize Nit objects to different formats
module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O
module xml_entities

dom :: xml_entities

Basic blocks for DOM-XML representation


module stanford

nlp :: stanford

Natural Language Processor based on the StanfordNLP core.
module vsm

vsm :: vsm

Vector Space Model


module nlp_index

nlp :: nlp_index

Example showing how to use a NLPFileIndex.


module a_star-m


# Natural Language Processor based on the StanfordNLP core.
# See the Stanford CoreNLP documentation: https://stanfordnlp.github.io/CoreNLP/
module nlp

import stanford
import vsm

# A StringIndex using a NLPProcessor to parse and vectorize strings
#
# Strings are processed by `nlp_processor`. The lemma of each token accepted
# by `accept_token` is counted in the resulting vector.
class NLPIndex
	super StringIndex

	# NLP Processor used to tokenize, lemmatize and POS tag documents
	var nlp_processor: NLPProcessor

	# Vectorize `string` by counting the lemmas of its accepted tokens.
	#
	# Returns an empty vector, without calling `nlp_processor`, when
	# `string.trim` is empty.
	# Otherwise each accepted lemma is mapped to its number of occurrences
	# (as a Float) over all the sentences of the processed document.
	redef fun parse_string(string) do
		var vector = new Vector
		if string.trim.is_empty then return vector

		var doc = nlp_processor.process(string)
		for sentence in doc.sentences do
			for token in sentence.tokens do
				if not accept_token(token) then continue
				var lemma = token.lemma
				if not vector.has_key(lemma) then
					# First occurrence: create the entry.
					vector[lemma] = 1.0
				else
					# Lemma already seen: increment its count.
					vector[lemma] += 1.0
				end
			end
		end
		return vector
	end

	# Is `token` accepted by this index?
	#
	# A token is rejected when:
	# * `whitelist_pos` is not empty and does not contain the token POS tag,
	# * `blacklist_pos` contains the token POS tag,
	# * `stoplist` contains the token lemma.
	#
	# See `whitelist_pos`, `blacklist_pos` and `stoplist`.
	fun accept_token(token: NLPToken): Bool do
		var pos = token.pos
		if whitelist_pos.not_empty and not whitelist_pos.has(pos) then return false
		if blacklist_pos.has(pos) then return false
		if stoplist.has(token.lemma) then return false
		return true
	end

	# Part-Of-Speech whitelist
	#
	# If not empty, the index accepts only the POS tags contained in this list.
	var whitelist_pos = new Array[String] is writable

	# Part-Of-Speech blacklist
	#
	# Reject POS tags contained in this list.
	var blacklist_pos = new Array[String] is writable

	# List of lemmas that must not be indexed
	var stoplist = new Array[String] is writable
end

# A FileIndex based on a NLPProcessor
#
# Combines `NLPIndex` and `FileIndex` through multiple inheritance; it declares
# no member of its own.
class NLPFileIndex
	super NLPIndex
	super FileIndex
end