Natural Language Processor based on the StanfordNLP core.

See http://nlp.stanford.edu/software/corenlp.shtml.

Introduced classes

class NLPClient

nlp :: NLPClient

An NLPProcessor using an NLPServer as backend
class NLPDocument

nlp :: NLPDocument

A Document represents a text input given to the NLP processor.
class NLPJavaProcessor

nlp :: NLPJavaProcessor

Wrapper around StanfordNLP jar.
interface NLPProcessor

nlp :: NLPProcessor

Natural Language Processor
class NLPSentence

nlp :: NLPSentence

Represents one sentence in a Document.
class NLPServer

nlp :: NLPServer

Stanford web server
class NLPToken

nlp :: NLPToken

Represents one word (or punctuation mark) in an NLPSentence.

All class definitions

class NLPClient

nlp $ NLPClient

An NLPProcessor using an NLPServer as backend
class NLPDocument

nlp $ NLPDocument

A Document represents a text input given to the NLP processor.
class NLPJavaProcessor

nlp $ NLPJavaProcessor

Wrapper around StanfordNLP jar.
interface NLPProcessor

nlp $ NLPProcessor

Natural Language Processor
class NLPSentence

nlp $ NLPSentence

Represents one sentence in a Document.
class NLPServer

nlp $ NLPServer

Stanford web server
class NLPToken

nlp $ NLPToken

Represents one word (or punctuation mark) in an NLPSentence.
package_diagram: nlp::stanford imports opts, dom, curl and pthreads (which in turn depend on core, parser_base and json); nlp::stanford is imported by nlp::nlp, which is itself used by nlp::nlp_index and nlp::nlp_server.

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module caching

serialization :: caching

Services for caching serialization engines
module circular_array

core :: circular_array

Efficient data structure to access both ends of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module defines several collection classes.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module engine_tools

serialization :: engine_tools

Advanced services for serialization engines
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module inspect

serialization :: inspect

Refine Serializable::inspect to show more useful information
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handles doubly linked lists
module math

core :: math

Mathematical operations
module meta

meta :: meta

Simple user-defined meta-level to manipulate types of instances as objects.
module native

core :: native

Native structures for text and bytes
module native_curl

curl :: native_curl

Binding of C libCurl which allows us to interact with the network.
module numeric

core :: numeric

Advanced services for Numeric types
module parser

dom :: parser

XML DOM-parsing facilities
module parser_base

parser_base :: parser_base

Simple base for hand-made parsers of all kinds
module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module serialization

serialization :: serialization

General serialization services
module serialization_core

serialization :: serialization_core

Abstract services to serialize Nit objects to different formats
module sorter

core :: sorter

This module contains classes used to compare things and sort arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

Union-find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O
module xml_entities

dom :: xml_entities

Basic blocks for DOM-XML representation

Parents

module curl

curl :: curl

Data transfer powered by the native curl library
module dom

dom :: dom

Easy XML DOM parser
module opts

opts :: opts

Management of options on the command line
module pthreads

pthreads :: pthreads

Main POSIX threads support; introduces the classes Thread, Mutex and Barrier

Children

module nlp

nlp :: nlp

Natural Language Processor based on the StanfordNLP core.

Descendants

module a_star-m

a_star-m

module nlp_index

nlp :: nlp_index

Example showing how to use an NLPFileIndex.
# Natural Language Processor based on the StanfordNLP core.
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
module stanford

import opts
import dom
import curl
import pthreads

# Natural Language Processor
#
# NLPProcessor provides natural language processing for input text and files.
# Analyzed documents can be manipulated through the resulting NLPDocument.
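#
# Typical use, sketched below (`stanford-corenlp/*` is a hypothetical
# classpath; any concrete `NLPProcessor` works the same way):
#
# ~~~nitish
# var proc: NLPProcessor = new NLPJavaProcessor("stanford-corenlp/*")
# var doc = proc.process("Stanford University is in California.")
# for sentence in doc.sentences do
#     for token in sentence.tokens do print "{token.word}/{token.pos}"
# end
# ~~~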
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from the content of a file
	fun process_file(path: String): NLPDocument do
		var content = path.to_path.read_all
		return process(content)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map associating each file path with its NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var res = new HashMap[String, NLPDocument]
		for file in paths do
			res[file] = process_file(file)
		end
		return res
	end
end

# Wrapper around StanfordNLP jar.
#
# FIXME this should use the Java FFI.
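#
# A batch-mode sketch, assuming the StanfordNLP jars were downloaded into a
# hypothetical `stanford-corenlp/` directory:
#
# ~~~nitish
# var proc = new NLPJavaProcessor("stanford-corenlp/*")
# var docs = proc.process_files(["corpus/a.txt", "corpus/b.txt"])
# for path, doc in docs do print "{path}: {doc.sentences.length} sentences"
# ~~~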
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return the resulting NLPDocument.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return the resulting NLPDocument.
	redef fun process_file(input) do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map associating each file path with its NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end

# A `Document` represents a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# `NLPSentence`s contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				print "Warning: malformed xml, `sentences` is supposed to contain `sencence` tags"
			end
		end
	end

	# Init `self` from an XML file.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end

# Represents one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# `NLPToken`s contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		init(index)
	end
end

# Represents one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	#  <token id="2">
	#	<word>University</word>
	#	<lemma>University</lemma>
	#	<CharacterOffsetBegin>9</CharacterOffsetBegin>
	#	<CharacterOffsetEnd>19</CharacterOffsetEnd>
	#	<POS>NNP</POS>
	#  </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		var word = read_data(xml, "word")
		var lemma = read_data(xml, "lemma")
		var begin_offset = read_data(xml, "CharacterOffsetBegin").to_i
		var end_offset = read_data(xml, "CharacterOffsetEnd").to_i
		var pos = read_data(xml, "POS")
		init(index, word, lemma, begin_offset, end_offset, pos)
	end

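	# Return the text content of the first `tag_name` child element of `xml`,
	# or an empty string if the element is missing or has no data.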
	private fun read_data(xml: XMLStartTag, tag_name: String): String do
		var res = ""
		if xml[tag_name].is_empty then return res
		var first = xml[tag_name].first
		if not first isa XMLStartTag then return res
		var data = first.data
		if data == null then return res
		return data
	end
end

# Stanford web server
#
# Runs the server on `port`.
#
# For more details about the Stanford NLP server see
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
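#
# A launch sketch (the classpath is a hypothetical example); `start` and
# `join` come from `Thread`:
#
# ~~~nitish
# var server = new NLPServer("stanford-corenlp/*", 9000)
# server.start
# # The server now accepts requests, e.g. from an NLPClient.
# server.join
# ~~~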
class NLPServer
	super Thread

	# Stanford jar classpath
	#
	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Port the Java server will listen on
	var port: Int

	redef fun main do
		sys.system "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
		return null
	end
end

# An NLPProcessor using an NLPServer as backend
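#
# A sketch assuming an NLPServer already listens on localhost:9000:
#
# ~~~nitish
# var client = new NLPClient("http://localhost:9000")
# client.language = "en"
# var doc = client.process("UQAM University is in Montreal.")
# print doc.sentences.length
# ~~~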
class NLPClient
	super NLPProcessor

	# Base URI of the NLP server API
	#
	# For example: "http://localhost:9000" or "https://myserver.com"
	var api_uri: String

	# Annotators to use
	#
	# The specified annotators must exist on the server.
	#
	# Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
	var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable

	# Language to process
	#
	# The language must be available on the server.
	#
	# Default is `en`.
	var language = "en" is writable

	# Output format to request.
	#
	# Only `xml` is implemented at the moment.
	private var format = "xml"

	# API URI used to build curl POST requests
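	#
	# The URL-encoded `properties` parameter decodes to JSON of the form
	# `{"annotators": "tokenize,ssplit,pos,lemma", "outputFormat": "xml"}`.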
	fun post_uri: String do
		# Build the annotator list from `annotators` so changes to the
		# writable attribute actually reach the server
		var annots = annotators.join("%2C")
		return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{annots}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
	end

	redef fun process(string) do
		var request = new CurlHTTPRequest(post_uri)
		request.body = string
		var response = request.execute
		if response isa CurlResponseSuccess then
			if response.status_code != 200 then
				print "Error: {response.body_str}"
				return new NLPDocument
			end
			var xml = response.body_str.to_xml
			if xml isa XMLError then
				print xml
				return new NLPDocument
			end
			return new NLPDocument.from_xml(xml.as(XMLDocument))
		else if response isa CurlResponseFailed then
			print "Error: {response.error_msg}"
			return new NLPDocument
		end
		return new NLPDocument
	end
end
lib/nlp/stanford.nit:15,1--377,3