From b12b9f5c82c5d7989a42e687f8ce2f1268e383d3 Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa <alexandre@moz-code.org>
Date: Fri, 16 Oct 2015 10:08:35 -0400
Subject: [PATCH] lib/nlp: introduce StanfordNLP wrapper

Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>
---
 lib/nlp/stanford.nit |  258 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 lib/nlp/stanford.nit

diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
new file mode 100644
index 0000000..734a228
--- /dev/null
+++ b/lib/nlp/stanford.nit
@@ -0,0 +1,258 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module stanford
+
+import opts
+import dom
+
+# Wrapper around the StanfordNLP jar.
+#
+# NLPProcessor provides natural language processing of input text files and
+# an API to handle analysis results.
+#
+# FIXME this should use the Java FFI.
+class NLPProcessor
+
+	# Classpath to give to Java when loading the StanfordNLP jars.
+	var java_cp: String
+
+	# Process a string and return a new NLPDocument from this.
+	fun process(string: String): NLPDocument do
+		var tmp_file = ".nlp.in"
+		var file = new FileWriter.open(tmp_file)
+		file.write string
+		file.close
+		var doc = process_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Process the `input` file and return a new NLPDocument from this.
+	fun process_file(input: String): NLPDocument do
+		# TODO opt annotators
+		var tmp_file = "{input.basename}.xml"
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
+		var doc = new NLPDocument.from_xml_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Batch mode.
+	#
+	# Returns a map of file paths associated with their NLPDocument.
+	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+		# Prepare the input file list
+		var input_file = "inputs.list"
+		var fw = new FileWriter.open(input_file)
+		for input in inputs do fw.write "{input}\n"
+		fw.close
+
+		# Run Stanford NLP jar
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+		# Parse output
+		var map = new HashMap[String, NLPDocument]
+		for input in inputs do
+			var out_file = output_dir / "{input.basename}.xml"
+			map[input] = new NLPDocument.from_xml_file(out_file)
+		end
+		input_file.file_delete
+		return map
+	end
+end
+
+# A `Document` represents a text input given to the NLP processor.
+#
+# Once processed, it contains a list of sentences that contain tokens.
+class NLPDocument
+
+	# NLPSentences contained in `self`
+	var sentences = new Array[NLPSentence]
+
+	# Init `self` from an xml element.
+	#
+	# ~~~
+	# var xml = """
+	# <root>
+	#   <document>
+	#     <sentences>
+	#       <sentence id="1">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>Stanford</word>
+	#             <lemma>Stanford</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#       <sentence id="2">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>UQAM</word>
+	#             <lemma>UQAM</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#     </sentences>
+	#   </document>
+	# </root>
+	# """.to_xml.as(XMLDocument)
+	#
+	# var document = new NLPDocument.from_xml(xml)
+	# assert document.sentences.length == 2
+	# assert document.sentences.first.tokens.first.word == "Stanford"
+	# assert document.sentences.last.tokens.first.word == "UQAM"
+	# ~~~
+	init from_xml(xml: XMLDocument) do
+		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
+			if obj isa XMLStartTag then
+				sentences.add new NLPSentence.from_xml(obj)
+			else
+				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
+			end
+		end
+	end
+
+	# Init `self` from an XML file.
+	init from_xml_file(path: String) do
+		var file = new FileReader.open(path)
+		var xml = file.read_lines
+		file.close
+		xml.shift # remove xml doctype
+		xml.shift # remove xslt link
+		from_xml(xml.join("\n").to_xml.as(XMLDocument))
+	end
+end
+
+# Represents one sentence in a `Document`.
+class NLPSentence
+
+	# Index of this sentence in the input text.
+	var index: Int
+
+	# NLPTokens contained in `self`.
+	var tokens = new Array[NLPToken]
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <sentence id="1">
+	#   <tokens>
+	#     <token id="1">
+	#       <word>Stanford</word>
+	#       <lemma>Stanford</lemma>
+	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#     <token id="2">
+	#       <word>University</word>
+	#       <lemma>University</lemma>
+	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#   </tokens>
+	# </sentence>
+	# """.to_xml["sentence"].first.as(XMLStartTag)
+	#
+	# var sentence = new NLPSentence.from_xml(xml)
+	# assert sentence.index == 1
+	# assert sentence.tokens.length == 2
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		for obj in xml["tokens"].first["token"] do
+			if obj isa XMLStartTag then
+				tokens.add new NLPToken.from_xml(obj)
+			else
+				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
+			end
+		end
+		init(index)
+	end
+end
+
+# Represents one word (or punctuation mark) in a `NLPSentence`.
+class NLPToken
+
+	# Index of this word in the sentence.
+	var index: Int
+
+	# Original word
+	var word: String
+
+	# `word` lemma
+	var lemma: String
+
+	# Position of the first character in the input
+	var begin_offset: Int
+
+	# Position of the last character in the input
+	var end_offset: Int
+
+	# Part Of Speech tag
+	var pos: String
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <token id="2">
+	#   <word>University</word>
+	#   <lemma>University</lemma>
+	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#   <POS>NNP</POS>
+	# </token>
+	# """.to_xml["token"].first.as(XMLStartTag)
+	#
+	# var token = new NLPToken.from_xml(xml)
+	# assert token.index == 2
+	# assert token.word == "University"
+	# assert token.lemma == "University"
+	# assert token.begin_offset == 9
+	# assert token.end_offset == 19
+	# assert token.pos == "NNP"
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		var word = xml["word"].first.as(XMLStartTag).data
+		var lemma = xml["lemma"].first.as(XMLStartTag).data
+		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
+		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
+		var pos = xml["POS"].first.as(XMLStartTag).data
+		init(index, word, lemma, begin_offset, end_offset, pos)
+	end
+end
-- 
1.7.9.5