From b12b9f5c82c5d7989a42e687f8ce2f1268e383d3 Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa <alexandre@moz-code.org>
Date: Fri, 16 Oct 2015 10:08:35 -0400
Subject: [PATCH] lib/nlp: introduce StanfordNLP wrapper

Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>
---
 lib/nlp/stanford.nit |  258 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 lib/nlp/stanford.nit

diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
new file mode 100644
index 0000000..734a228
--- /dev/null
+++ b/lib/nlp/stanford.nit
@@ -0,0 +1,258 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module stanford
+
+import opts
+import dom
+
+# Wrapper around the StanfordNLP jar.
+#
+# NLPProcessor provides natural language processing of input text files and
+# an API to handle analysis results.
+#
+# FIXME this should use the Java FFI.
+class NLPProcessor
+
+	# Classpath to give to Java when loading the StanfordNLP jars.
+	var java_cp: String
+
+	# Process a string and return a new NLPDocument from this.
+	fun process(string: String): NLPDocument do
+		var tmp_file = ".nlp.in"
+		var file = new FileWriter.open(tmp_file)
+		file.write string
+		file.close
+		var doc = process_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Process the `input` file and return a new NLPDocument from this.
+	fun process_file(input: String): NLPDocument do
+		# TODO opt annotators
+		var tmp_file = "{input.basename}.xml"
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
+		var doc = new NLPDocument.from_xml_file(tmp_file)
+		tmp_file.file_delete
+		return doc
+	end
+
+	# Batch mode.
+	#
+	# Returns a map of file paths associated with their NLPDocument.
+	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+		# Prepare the input file list
+		var input_file = "inputs.list"
+		var fw = new FileWriter.open(input_file)
+		for input in inputs do fw.write "{input}\n"
+		fw.close
+
+		# Run Stanford NLP jar
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+		# Parse output
+		var map = new HashMap[String, NLPDocument]
+		for input in inputs do
+			var out_file = output_dir / "{input.basename}.xml"
+			map[input] = new NLPDocument.from_xml_file(out_file)
+		end
+		input_file.file_delete
+		return map
+	end
+end
+
+# A `Document` represents a text input given to the NLP processor.
+#
+# Once processed, it contains a list of sentences that contain tokens.
+class NLPDocument
+
+	# NLPSentences contained in `self`
+	var sentences = new Array[NLPSentence]
+
+	# Init `self` from an xml element.
+	#
+	# ~~~
+	# var xml = """
+	# <root>
+	#   <document>
+	#     <sentences>
+	#       <sentence id="1">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>Stanford</word>
+	#             <lemma>Stanford</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#       <sentence id="2">
+	#         <tokens>
+	#           <token id="1">
+	#             <word>UQAM</word>
+	#             <lemma>UQAM</lemma>
+	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#           <token id="2">
+	#             <word>University</word>
+	#             <lemma>University</lemma>
+	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
+	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
+	#             <POS>NNP</POS>
+	#           </token>
+	#         </tokens>
+	#       </sentence>
+	#     </sentences>
+	#   </document>
+	# </root>
+	# """.to_xml.as(XMLDocument)
+	#
+	# var document = new NLPDocument.from_xml(xml)
+	# assert document.sentences.length == 2
+	# assert document.sentences.first.tokens.first.word == "Stanford"
+	# assert document.sentences.last.tokens.first.word == "UQAM"
+	# ~~~
+	init from_xml(xml: XMLDocument) do
+		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
+			if obj isa XMLStartTag then
+				sentences.add new NLPSentence.from_xml(obj)
+			else
+				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
+			end
+		end
+	end
+
+	# Init `self` from an XML file.
+	init from_xml_file(path: String) do
+		var file = new FileReader.open(path)
+		var xml = file.read_lines
+		file.close
+		xml.shift # remove xml doctype
+		xml.shift # remove xslt link
+		from_xml(xml.join("\n").to_xml.as(XMLDocument))
+	end
+end
+
+# Represents one sentence in a `Document`.
+class NLPSentence
+
+	# Index of this sentence in the input text.
+	var index: Int
+
+	# NLPTokens contained in `self`.
+	var tokens = new Array[NLPToken]
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <sentence id="1">
+	#   <tokens>
+	#     <token id="1">
+	#       <word>Stanford</word>
+	#       <lemma>Stanford</lemma>
+	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#     <token id="2">
+	#       <word>University</word>
+	#       <lemma>University</lemma>
+	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#       <POS>NNP</POS>
+	#     </token>
+	#   </tokens>
+	# </sentence>
+	# """.to_xml["sentence"].first.as(XMLStartTag)
+	#
+	# var sentence = new NLPSentence.from_xml(xml)
+	# assert sentence.index == 1
+	# assert sentence.tokens.length == 2
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		for obj in xml["tokens"].first["token"] do
+			if obj isa XMLStartTag then
+				tokens.add new NLPToken.from_xml(obj)
+			else
+				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
+			end
+		end
+		init(index)
+	end
+end
+
+# Represents one word (or punctuation mark) in a `NLPSentence`.
+class NLPToken
+
+	# Index of this word in the sentence.
+	var index: Int
+
+	# Original word
+	var word: String
+
+	# `word` lemma
+	var lemma: String
+
+	# Position of the first character in the input
+	var begin_offset: Int
+
+	# Position of the last character in the input
+	var end_offset: Int
+
+	# Part Of Speech tag
+	var pos: String
+
+	# Init `self` from an XML element.
+	#
+	# ~~~
+	# var xml = """
+	# <token id="2">
+	#   <word>University</word>
+	#   <lemma>University</lemma>
+	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
+	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
+	#   <POS>NNP</POS>
+	# </token>
+	# """.to_xml["token"].first.as(XMLStartTag)
+	#
+	# var token = new NLPToken.from_xml(xml)
+	# assert token.index == 2
+	# assert token.word == "University"
+	# assert token.lemma == "University"
+	# assert token.begin_offset == 9
+	# assert token.end_offset == 19
+	# assert token.pos == "NNP"
+	# ~~~
+	init from_xml(xml: XMLStartTag) do
+		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
+		var word = xml["word"].first.as(XMLStartTag).data
+		var lemma = xml["lemma"].first.as(XMLStartTag).data
+		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
+		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
+		var pos = xml["POS"].first.as(XMLStartTag).data
+		init(index, word, lemma, begin_offset, end_offset, pos)
+	end
+end
-- 
1.7.9.5