# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Natural Language Processor based on the StanfordNLP core.
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
module stanford

import opts
import dom

# Wrapper around the StanfordNLP jar.
#
# `NLPProcessor` provides natural language processing of input text files and
# an API to handle the analysis results.
#
# FIXME this should use the Java FFI.
class NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Process a string and return a new `NLPDocument` from it.
	fun process(string: String): NLPDocument do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new `NLPDocument` from it.
	fun process_file(input: String): NLPDocument do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		# Run the StanfordNLP pipeline; the XML result is expected as `{input.basename}.xml` in the working directory.
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode: process all the `inputs` files at once and write the XML outputs in `output_dir`.
	#
	# Returns a map associating each input file path with its `NLPDocument`.
	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run the StanfordNLP jar on the whole list
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"

		# Parse the outputs
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = output_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		return map
	end
end
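
# Typical use, as a sketch only (not part of the original module): the
# classpath below is a placeholder that depends on where the StanfordNLP jars
# are installed, and the input text is arbitrary.
#
# ~~~nitish
# var proc = new NLPProcessor("/path/to/stanford-corenlp/*")
# var doc = proc.process("Stanford University. UQAM University.")
# for sentence in doc.sentences do
#	for token in sentence.tokens do
#		print "{token.word} ({token.pos}, lemma: {token.lemma})"
#	end
# end
# ~~~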

# A `Document` represents a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from an XML file.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove the xml doctype
		xml.shift # remove the xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end

# Represents one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		init(index)
	end
end

# Represents one word (or punctuation mark) in an `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character of the word in the input.
	var begin_offset: Int

	# Position just after the last character of the word in the input (exclusive offset).
	var end_offset: Int

	# Part-Of-Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		var word = xml["word"].first.as(XMLStartTag).data
		var lemma = xml["lemma"].first.as(XMLStartTag).data
		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
		var pos = xml["POS"].first.as(XMLStartTag).data
		init(index, word, lemma, begin_offset, end_offset, pos)
	end
end
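
# Batch-mode sketch (illustrative only, not from the original module): the
# classpath, the input file names and the output directory are placeholders.
#
# ~~~nitish
# var proc = new NLPProcessor("/path/to/stanford-corenlp/*")
# var inputs = ["corpus/a.txt", "corpus/b.txt"]
# var docs = proc.process_files(inputs, "nlp_out")
# for path, doc in docs do
#	print "{path}: {doc.sentences.length} sentences"
# end
# ~~~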