lib/nlp: introduce StanfordNLP wrapper
authorAlexandre Terrasa <alexandre@moz-code.org>
Fri, 16 Oct 2015 14:08:35 +0000 (10:08 -0400)
committerAlexandre Terrasa <alexandre@moz-code.org>
Mon, 19 Oct 2015 22:07:05 +0000 (18:07 -0400)
Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>

lib/nlp/stanford.nit [new file with mode: 0644]

diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
new file mode 100644 (file)
index 0000000..734a228
--- /dev/null
@@ -0,0 +1,258 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Natural Language Processor based on the StanfordNLP core.
+#
+# See http://nlp.stanford.edu/software/corenlp.shtml.
+module stanford
+
+import opts
+import dom
+
+# Wrapper around StanfordNLP jar.
+#
+# NLPProcessor provides natural language processing of input text files and
+# an API to handle analysis results.
+#
+# Each processing method spawns a `java` process through `sys.system`, then
+# parses the XML report(s) that the process is expected to have written.
+#
+# FIXME this should use the Java FFI.
+class NLPProcessor
+
+       # Classpath to give to Java when loading the StanfordNLP jars.
+       var java_cp: String
+
+       # Process a string and return a new NLPDocument from this.
+       #
+       # The text is written to the scratch file `.nlp.in` (relative to the
+       # current directory), analysed with `process_file`, and the scratch file
+       # is deleted afterwards.
+       fun process(string: String): NLPDocument do
+               var tmp_file = ".nlp.in"
+               var file = new FileWriter.open(tmp_file)
+               file.write string
+               file.close
+               var doc = process_file(tmp_file)
+               tmp_file.file_delete
+               return doc
+       end
+
+       # Process the `input` file and return a new NLPDocument from this.
+       #
+       # The XML report is read from `<basename of input>.xml`, relative to the
+       # current directory, and deleted once parsed.
+       fun process_file(input: String): NLPDocument do
+               # TODO opt annotators
+               var tmp_file = "{input.basename}.xml"
+               # `input` is shell-escaped so that paths containing spaces or shell
+               # metacharacters reach Java as a single argument.
+               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input.escape_to_sh}"
+               var doc = new NLPDocument.from_xml_file(tmp_file)
+               tmp_file.file_delete
+               return doc
+       end
+
+       # Batch mode.
+       #
+       # Returns a map of file path associated with their NLPDocument.
+       #
+       # The paths from `inputs` are listed, one per line, in the scratch file
+       # `inputs.list` (relative to the current directory, deleted before
+       # returning). Reports are read from `output_dir`, one
+       # `<basename of input>.xml` per input; they are not deleted.
+       fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+               # Prepare the input file list
+               var input_file = "inputs.list"
+               var fw = new FileWriter.open(input_file)
+               for input in inputs do fw.write "{input}\n"
+               fw.close
+
+               # Run Stanford NLP jar
+               # Both paths are shell-escaped so that spaces or shell metacharacters
+               # do not split or alter the command line.
+               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file.escape_to_sh} -outputDirectory {output_dir.escape_to_sh}"
+               # Parse output
+               var map = new HashMap[String, NLPDocument]
+               for input in inputs do
+                       var out_file = output_dir / "{input.basename}.xml"
+                       map[input] = new NLPDocument.from_xml_file(out_file)
+               end
+               input_file.file_delete
+               return map
+       end
+end
+
+# An `NLPDocument` represents a text input given to the NLP processor.
+#
+# Once processed, it contains a list of sentences that contain tokens.
+class NLPDocument
+
+       # NLPSentences contained in `self`, in the order they were read.
+       var sentences = new Array[NLPSentence]
+
+       # Init `self` from an XML document.
+       #
+       # One `NLPSentence` is created for each `sentence` start tag found under
+       # `root/document/sentences`. A matching entity that is not an
+       # `XMLStartTag` is skipped and a warning is printed.
+       #
+       # ~~~
+       # var xml = """
+       # <root>
+       #   <document>
+       #     <sentences>
+       #       <sentence id="1">
+       #         <tokens>
+       #           <token id="1">
+       #             <word>Stanford</word>
+       #             <lemma>Stanford</lemma>
+       #             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>8</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #           <token id="2">
+       #             <word>University</word>
+       #             <lemma>University</lemma>
+       #             <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #         </tokens>
+       #       </sentence>
+       #       <sentence id="2">
+       #         <tokens>
+       #           <token id="1">
+       #             <word>UQAM</word>
+       #             <lemma>UQAM</lemma>
+       #             <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>4</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #           <token id="2">
+       #             <word>University</word>
+       #             <lemma>University</lemma>
+       #             <CharacterOffsetBegin>5</CharacterOffsetBegin>
+       #             <CharacterOffsetEnd>15</CharacterOffsetEnd>
+       #             <POS>NNP</POS>
+       #           </token>
+       #         </tokens>
+       #       </sentence>
+       #     </sentences>
+       #   </document>
+       # </root>""".to_xml.as(XMLDocument)
+       #
+       # var document = new NLPDocument.from_xml(xml)
+       # assert document.sentences.length == 2
+       # assert document.sentences.first.tokens.first.word == "Stanford"
+       # assert document.sentences.last.tokens.first.word == "UQAM"
+       # ~~~
+       init from_xml(xml: XMLDocument) do
+               for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
+                       if obj isa XMLStartTag then
+                               sentences.add new NLPSentence.from_xml(obj)
+                       else
+                               print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
+                       end
+               end
+       end
+
+       # Init `self` from a XML file.
+       #
+       # The first two lines of the file at `path` are discarded unconditionally
+       # (they are expected to be the XML declaration and the XSLT link); the
+       # remaining lines are joined and parsed with `from_xml`.
+       init from_xml_file(path: String) do
+               var file = new FileReader.open(path)
+               var xml = file.read_lines
+               file.close
+               xml.shift # remove xml doctype
+               xml.shift # remove xslt link
+               from_xml(xml.join("\n").to_xml.as(XMLDocument))
+       end
+end
+
+# One sentence of an `NLPDocument`, together with the tokens it is made of.
+class NLPSentence
+
+       # Index of this sentence in the input text.
+       #
+       # Read from the first attribute of the XML `sentence` tag (its `id` in
+       # the example of `from_xml`).
+       var index: Int
+
+       # NLPTokens contained in `self`, in the order they were read.
+       var tokens = new Array[NLPToken]
+
+       # Build `self` out of a `sentence` XML element.
+       #
+       # One `NLPToken` is created for each `token` start tag found under the
+       # `tokens` child. A matching entity that is not an `XMLStartTag` is
+       # skipped and a warning is printed.
+       #
+       # ~~~
+       # var xml = """
+       # <sentence id="1">
+       #   <tokens>
+       #     <token id="1">
+       #       <word>Stanford</word>
+       #       <lemma>Stanford</lemma>
+       #       <CharacterOffsetBegin>0</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>8</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #     </token>
+       #     <token id="2">
+       #       <word>University</word>
+       #       <lemma>University</lemma>
+       #       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #     </token>
+       #   </tokens>
+       # </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
+       #
+       # var sentence = new  NLPSentence.from_xml(xml)
+       # assert sentence.index == 1
+       # assert sentence.tokens.length == 2
+       # ~~~
+       init from_xml(xml: XMLStartTag) do
+               # The sentence number is carried by the first attribute of the tag.
+               var id_attr = xml.attributes.first.as(XMLStringAttr)
+               var sentence_index = id_attr.value.to_i
+
+               var token_nodes = xml["tokens"].first["token"]
+               for node in token_nodes do
+                       if not node isa XMLStartTag then
+                               print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
+                               continue
+                       end
+                       tokens.add new NLPToken.from_xml(node)
+               end
+
+               # Set `index` last, through the default constructor.
+               init(sentence_index)
+       end
+end
+
+# One word (or punctuation mark) of an `NLPSentence`.
+class NLPToken
+
+       # Index of this word in the sentence.
+       #
+       # Read from the first attribute of the XML `token` tag (its `id` in the
+       # example of `from_xml`).
+       var index: Int
+
+       # Original word, as found in the `word` tag.
+       var word: String
+
+       # Lemma of `word`, as found in the `lemma` tag.
+       var lemma: String
+
+       # Offset of the first character of the token in the input
+       # (`CharacterOffsetBegin` tag).
+       var begin_offset: Int
+
+       # Offset at which the token ends in the input (`CharacterOffsetEnd` tag).
+       #
+       # NOTE(review): in the example of `from_xml`, `University` begins at 9 and
+       # ends at 19, so this looks like the offset just after the last
+       # character rather than the offset of the last character; to confirm.
+       var end_offset: Int
+
+       # Part Of Speech tag, as found in the `POS` tag.
+       var pos: String
+
+       # Build `self` out of a `token` XML element.
+       #
+       # ~~~
+       # var xml = """
+       #  <token id="2">
+       #       <word>University</word>
+       #       <lemma>University</lemma>
+       #       <CharacterOffsetBegin>9</CharacterOffsetBegin>
+       #       <CharacterOffsetEnd>19</CharacterOffsetEnd>
+       #       <POS>NNP</POS>
+       #  </token>""".to_xml["token"].first.as(XMLStartTag)
+       #
+       # var token = new  NLPToken.from_xml(xml)
+       # assert token.index == 2
+       # assert token.word == "University"
+       # assert token.lemma == "University"
+       # assert token.begin_offset == 9
+       # assert token.end_offset == 19
+       # assert token.pos == "NNP"
+       # ~~~
+       init from_xml(xml: XMLStartTag) do
+               # The token number is carried by the first attribute of the tag.
+               var id_attr = xml.attributes.first.as(XMLStringAttr)
+               var token_index = id_attr.value.to_i
+
+               # Each remaining field is the text of the first child with that name.
+               var word_tag = xml["word"].first.as(XMLStartTag)
+               var surface = word_tag.data
+               var lemma_tag = xml["lemma"].first.as(XMLStartTag)
+               var base_form = lemma_tag.data
+               var begin_tag = xml["CharacterOffsetBegin"].first.as(XMLStartTag)
+               var first_offset = begin_tag.data.to_i
+               var end_tag = xml["CharacterOffsetEnd"].first.as(XMLStartTag)
+               var last_offset = end_tag.data.to_i
+               var pos_tag = xml["POS"].first.as(XMLStartTag)
+               var part_of_speech = pos_tag.data
+
+               init(token_index, surface, base_form, first_offset, last_offset, part_of_speech)
+       end
+end