Natural Language Processor based on the StanfordNLP core.

See http://nlp.stanford.edu/software/corenlp.shtml.

Introduced classes

class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex using a NLPProcessor
class NLPIndex

nlp :: NLPIndex

A StringIndex using a NLPProcessor to parse and vectorize strings

All class definitions

class NLPFileIndex

nlp :: NLPFileIndex

A FileIndex using a NLPProcessor
class NLPIndex

nlp :: NLPIndex

A StringIndex using a NLPProcessor to parse and vectorize strings
package_diagram nlp::nlp nlp nlp::stanford stanford nlp::nlp->nlp::stanford vsm vsm nlp::nlp->vsm opts opts nlp::stanford->opts dom dom nlp::stanford->dom curl curl nlp::stanford->curl pthreads pthreads nlp::stanford->pthreads counter counter vsm->counter config config vsm->config ...opts ... ...opts->opts ...dom ... ...dom->dom ...curl ... ...curl->curl ...pthreads ... ...pthreads->pthreads ...counter ... ...counter->counter ...config ... ...config->config nlp::nlp_index nlp_index nlp::nlp_index->nlp::nlp nlp::nlp_server nlp_server nlp::nlp_server->nlp::nlp a_star-m a_star-m a_star-m->nlp::nlp_index a_star-m->nlp::nlp_server a_star-m... ... a_star-m...->a_star-m

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module caching

serialization :: caching

Services for caching serialization engines
module circular_array

core :: circular_array

Efficient data structure to access both end of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module define several collection classes.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module counter

counter :: counter

Simple numerical statistical analysis and presentation
module curl

curl :: curl

Data transfer powered by the native curl library
module dom

dom :: dom

Easy XML DOM parser
module engine_tools

serialization :: engine_tools

Advanced services for serialization engines
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module inspect

serialization :: inspect

Refine Serializable::inspect to show more useful information
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handle double linked lists
module math

core :: math

Mathematical operations
module meta

meta :: meta

Simple user-defined meta-level to manipulate types of instances as object.
module native

core :: native

Native structures for text and bytes
module native_curl

curl :: native_curl

Binding of C libCurl which allow us to interact with network.
module numeric

core :: numeric

Advanced services for Numeric types
module opts

opts :: opts

Management of options on the command line
module parser

dom :: parser

XML DOM-parsing facilities
module parser_base

parser_base :: parser_base

Simple base for hand-made parsers of all kinds
module poset

poset :: poset

Pre order sets and partial order set (ie hierarchies)
module protocol

core :: protocol

module pthreads

pthreads :: pthreads

Main POSIX threads support and intro the classes Thread, Mutex and Barrier
module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module serialization

serialization :: serialization

General serialization services
module serialization_core

serialization :: serialization_core

Abstract services to serialize Nit objects to different formats
module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

Union-find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O
module xml_entities

dom :: xml_entities

Basic blocks for DOM-XML representation

Parents

module stanford

nlp :: stanford

Natural Language Processor based on the StanfordNLP core.
module vsm

vsm :: vsm

Vector Space Model

Children

module nlp_index

nlp :: nlp_index

Example showing how to use a NLPFileIndex.

Descendants

module a_star-m

a_star-m

# Natural Language Processor based on the StanfordNLP core.
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
module nlp

import stanford
import vsm

# A StringIndex that delegates parsing and vectorization to a NLPProcessor
class NLPIndex
	super StringIndex

	# NLP Processor used to tokenize, lemmatize and POS tag documents
	var nlp_processor: NLPProcessor

	redef fun parse_string(string) do
		var vector = new Vector
		# Nothing to vectorize for blank input
		if string.trim.is_empty then return vector

		# Run the processor, then count one occurrence per accepted lemma
		var document = nlp_processor.process(string)
		for sentence in document.sentences do
			for token in sentence.tokens do
				if not accept_token(token) then continue
				var lemma = token.lemma
				if vector.has_key(lemma) then
					vector[lemma] += 1.0
				else
					vector[lemma] = 1.0
				end
			end
		end
		return vector
	end

	# Is `token` accepted by this index?
	#
	# A token is rejected when its POS tag is blacklisted, its lemma is in
	# the stoplist, or a non-empty whitelist does not contain its POS tag.
	#
	# See `whitelist_pos` and `blacklist_pos`.
	fun accept_token(token: NLPToken): Bool do
		var pos = token.pos
		if blacklist_pos.has(pos) then return false
		if stoplist.has(token.lemma) then return false
		if whitelist_pos.is_empty then return true
		return whitelist_pos.has(pos)
	end

	# Part-Of-Speech whitelist
	#
	# If not empty, the index accept only the POS tags contained in this list.
	var whitelist_pos = new Array[String] is writable

	# Part-Of-Speech blacklist
	#
	# Reject POS tags contained in this list.
	var blacklist_pos = new Array[String] is writable

	# List of lemmas that must not be indexed
	var stoplist = new Array[String] is writable
end

# A FileIndex using a NLPProcessor to parse and vectorize file contents
#
# Combines `NLPIndex` (NLP-based parsing/vectorization) with the file-loading
# services of `FileIndex`.
class NLPFileIndex
	super NLPIndex
	super FileIndex
end
lib/nlp/nlp.nit:15,1--77,3