# Related modules (see the Nit library documentation):
# - `Serializable::inspect`: redefined to show more useful information
# - `curl :: native_curl`: binding of C libCurl which allows us to interact with the network
# - `serialization :: serialization_core`: abstract services to serialize Nit objects to different formats
# - `core :: union_find`: union-find algorithm using an efficient disjoint-set data structure
# - `nlp :: nlp_server`: Natural Language Processor based on the StanfordNLP core
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
# Natural Language Processor based on the StanfordNLP core.
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
module stanford
import opts
import dom
import curl
import pthreads
# Natural Language Processor
#
# NLPProcessor provides natural language processing for input text and files.
# Analyzed documents can be manipulated through the resulting NLPDocument.
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from a file content
	fun process_file(path: String): NLPDocument do
		# Load the whole file then delegate to `process`.
		return process(path.to_path.read_all)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for p in paths do docs[p] = process_file(p)
		return docs
	end
end
# Wrapper around StanfordNLP jar.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	#
	# The string is written to a temporary file which is processed then deleted.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	#
	# The XML output produced by StanfordNLP is deleted once parsed.
	redef fun process_file(input) do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"

		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
			# FIX: delete each generated output file, otherwise `tmp_dir`
			# is non-empty and the `rmdir` below silently fails, leaking
			# the directory and its XML files.
			out_file.file_delete
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end
# A `Document` represent a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# FIX: the diagnostic previously said `sencence` (typo).
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The sentence index comes from the `id` attribute.
		var sentence_id = xml.attributes.first.as(XMLStringAttr).value.to_i
		# Collect every `token` child; anything else is a malformed document.
		for child in xml["tokens"].first["token"] do
			if child isa XMLStartTag then
				tokens.add new NLPToken.from_xml(child)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		init(sentence_id)
	end
end
# Represent one word (or puncutation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The token index comes from the `id` attribute; the other
		# attributes are read from the child data tags.
		var token_id = xml.attributes.first.as(XMLStringAttr).value.to_i
		var word_str = read_data(xml, "word")
		var lemma_str = read_data(xml, "lemma")
		var offset_begin = read_data(xml, "CharacterOffsetBegin").to_i
		var offset_end = read_data(xml, "CharacterOffsetEnd").to_i
		var pos_tag = read_data(xml, "POS")
		init(token_id, word_str, lemma_str, offset_begin, offset_end, pos_tag)
	end

	# Extract the text content of the child tag `tag_name`, or "" if absent.
	private fun read_data(xml: XMLStartTag, tag_name: String): String do
		var children = xml[tag_name]
		if children.is_empty then return ""
		var child = children.first
		if not child isa XMLStartTag then return ""
		var data = child.data
		if data == null then return ""
		return data
	end
end
# Stanford web server
#
# Runs the server on `port`.
#
# For more details about the stanford NLP server see
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
class NLPServer
	super Thread

	# Stanford jar classpath
	#
	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Port the Java server will listen on
	var port: Int

	redef fun main do
		# Build the launch command then block on the Java server process.
		var cmd = "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
		sys.system cmd
		return null
	end
end
# A NLPProcessor using a NLPServer as backend
class NLPClient
	super NLPProcessor

	# Base uri of the NLP server API
	#
	# For examples "http://localhost:9000" or "https://myserver.com"
	var api_uri: String

	# Annotators to use
	#
	# The specified annotators must exist on the server.
	#
	# Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
	var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable

	# Language to process
	#
	# The language must be available on the server.
	#
	# Default is `en`.
	var language = "en" is writable

	# Output format to ask.
	#
	# Only `xml` is implemented at the moment.
	private var format = "xml"

	# API uri used to build curl POST requests
	fun post_uri: String do
		return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22tokenize%2Cssplit%2Cpos%2clemma%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
	end

	# POST `string` to the NLP server and parse the XML response.
	#
	# Returns an empty NLPDocument on HTTP, curl or XML parse errors.
	redef fun process(string) do
		var request = new CurlHTTPRequest(post_uri)
		request.body = string
		var response = request.execute
		if response isa CurlResponseSuccess then
			if response.status_code != 200 then
				print "Error: {response.body_str}"
				return new NLPDocument
			end
			# FIX: parse the body only once and bail out on a parse error.
			# The previous code re-parsed the body and cast it to
			# `XMLDocument` even when the first parse yielded an
			# `XMLError`, aborting at runtime on malformed responses.
			var xml = response.body_str.to_xml
			if xml isa XMLError then
				print xml
				return new NLPDocument
			end
			return new NLPDocument.from_xml(xml.as(XMLDocument))
		else if response isa CurlResponseFailed then
			print "Error: {response.error_msg}"
			return new NLPDocument
		end
		return new NLPDocument
	end
end
# Source: lib/nlp/stanford.nit