import opts
import dom
+import curl
+import pthreads
-# Wrapper around StanfordNLP jar.
+# Natural Language Processor
#
-# NLPProcessor provides natural language processing of input text files and
-# an API to handle analysis results.
+# NLPProcessor provides natural language processing for input text and files.
+# Analyzed documents can be manipulated through the resulting NLPDocument.
+interface NLPProcessor
+
+    # Analyzes `string` and returns the resulting NLPDocument.
+    #
+    # Concrete processors must provide this method.
+    fun process(string: String): NLPDocument is abstract
+
+    # Analyzes the content of the file located at `path`.
+    #
+    # The file is read in full, then its content is handed to `process`.
+    fun process_file(path: String): NLPDocument do
+        return process(path.to_path.read_all)
+    end
+
+    # Analyzes every file listed in `paths` (batch mode).
+    #
+    # Returns a map associating each given path with the NLPDocument
+    # produced by `process_file` for that path.
+    fun process_files(paths: Array[String]): Map[String, NLPDocument] do
+        var documents = new HashMap[String, NLPDocument]
+        for path in paths do documents[path] = process_file(path)
+        return documents
+    end
+end
+
+# Wrapper around StanfordNLP jar.
#
# FIXME this should use the Java FFI.
-class NLPProcessor
+class NLPJavaProcessor
+ super NLPProcessor
# Classpath to give to Java when loading the StanfordNLP jars.
var java_cp: String
+ # Temp dir used to store batch results
+ var tmp_dir = ".nlp"
+
# Process a string and return a new NLPDocument from this.
- fun process(string: String): NLPDocument do
+ redef fun process(string) do
var tmp_file = ".nlp.in"
var file = new FileWriter.open(tmp_file)
file.write string
end
# Process the `input` file and return a new NLPDocument from this.
- fun process_file(input: String): NLPDocument do
+ redef fun process_file(input) do
# TODO opt annotators
var tmp_file = "{input.basename}.xml"
sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
# Batch mode.
#
# Returns a map of file path associated with their NLPDocument.
- fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+ redef fun process_files(inputs) do
# Prepare the input file list
var input_file = "inputs.list"
var fw = new FileWriter.open(input_file)
fw.close
# Run Stanford NLP jar
- sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+ sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
# Parse output
var map = new HashMap[String, NLPDocument]
for input in inputs do
- var out_file = output_dir / "{input.basename}.xml"
+ var out_file = tmp_dir / "{input.basename}.xml"
map[input] = new NLPDocument.from_xml_file(out_file)
end
input_file.file_delete
+ tmp_dir.rmdir
return map
end
end
# ~~~
init from_xml(xml: XMLStartTag) do
var index = xml.attributes.first.as(XMLStringAttr).value.to_i
- var word = xml["word"].first.as(XMLStartTag).data
- var lemma = xml["lemma"].first.as(XMLStartTag).data
- var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
- var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
- var pos = xml["POS"].first.as(XMLStartTag).data
+ var word = read_data(xml, "word")
+ var lemma = read_data(xml, "lemma")
+ var begin_offset = read_data(xml, "CharacterOffsetBegin").to_i
+ var end_offset = read_data(xml, "CharacterOffsetEnd").to_i
+ var pos = read_data(xml, "POS")
init(index, word, lemma, begin_offset, end_offset, pos)
end
+
+ private fun read_data(xml: XMLStartTag, tag_name: String): String do
+     # Text content of the first child of `xml` named `tag_name`.
+     #
+     # Falls back to the empty string when there is no such child, when the
+     # first match is not an XMLStartTag, or when it carries no data.
+     var matches = xml[tag_name]
+     if matches.is_empty then return ""
+     var node = matches.first
+     if node isa XMLStartTag then
+         var text = node.data
+         if text != null then return text
+     end
+     return ""
+ end
+end
+
+# Stanford CoreNLP web server
+#
+# A `Thread` whose `main` launches the Java server, listening on `port`.
+#
+# More details about the Stanford NLP server can be found at
+# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
+class NLPServer
+    super Thread
+
+    # Stanford jar classpath
+    #
+    # Passed to Java through `-cp` so the StanfordNLP jars can be loaded.
+    var java_cp: String
+
+    # Port given to the Java server through `-port`
+    var port: Int
+
+    # Runs the Java server through a system command, then returns `null`.
+    redef fun main do
+        var command = "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
+        sys.system command
+        return null
+    end
+end
+
+# An NLPProcessor using an NLPServer as backend
+class NLPClient
+    super NLPProcessor
+
+    # Base uri of the NLP server API
+    #
+    # For example "http://localhost:9000" or "https://myserver.com"
+    var api_uri: String
+
+    # Annotators to use
+    #
+    # The specified annotators must exist on the server.
+    # They are sent to the server through `post_uri`.
+    #
+    # Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
+    var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable
+
+    # Language to process
+    #
+    # The language must be available on the server.
+    #
+    # Default is `en`.
+    var language = "en" is writable
+
+    # Output format to ask.
+    #
+    # Only `xml` is implemented at the moment.
+    private var format = "xml"
+
+    # API uri used to build curl POST requests
+    #
+    # The `properties` query parameter is a percent-encoded JSON object
+    # holding the comma-separated `annotators` and the output `format`.
+    # The `pipelineLanguage` parameter carries `language`.
+    fun post_uri: String do
+        # `%2C` is the percent-encoded comma separating annotator names.
+        # Names are inserted as is, so they are expected to be URL-safe.
+        var annotator_list = annotators.join("%2C")
+        return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{annotator_list}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
+    end
+
+    # Sends `string` to the server and builds a NLPDocument from the answer.
+    #
+    # An error is printed and an empty NLPDocument is returned when the
+    # request fails, when the status code is not 200, or when the response
+    # body cannot be parsed as XML.
+    redef fun process(string) do
+        var request = new CurlHTTPRequest(post_uri)
+        request.body = string
+        var response = request.execute
+        if response isa CurlResponseSuccess then
+            if response.status_code != 200 then
+                print "Error: {response.body_str}"
+                return new NLPDocument
+            end
+            # Parse the body once and reuse the result.
+            var xml = response.body_str.to_xml
+            if xml isa XMLError then
+                # Stop here: casting a parse error to XMLDocument would abort.
+                print xml
+                return new NLPDocument
+            end
+            return new NLPDocument.from_xml(xml.as(XMLDocument))
+        else if response isa CurlResponseFailed then
+            print "Error: {response.error_msg}"
+            return new NLPDocument
+        end
+        return new NLPDocument
+    end
end