lib/nlp: introduce StanfordNLP wrapper
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22
# Wrapper around StanfordNLP jar.
#
# NLPProcessor provides natural language processing of input text files and
# an API to handle analysis results.
#
# FIXME this should use the Java FFI.
class NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Base java command used to launch the StanfordNLP pipeline.
	#
	# Runs the `tokenize,ssplit,pos,lemma` annotators and produces XML output.
	# Callers append the input/output options for their use case.
	private fun nlp_command: String do
		return "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml"
	end

	# Process a string and return a new NLPDocument from this.
	#
	# The string is written to a temporary file in the working directory,
	# processed with `process_file`, then the temporary file is deleted.
	fun process(string: String): NLPDocument do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	#
	# The StanfordNLP jar writes its XML result next to the working directory
	# as `{input.basename}.xml`; that file is parsed then deleted.
	fun process_file(input: String): NLPDocument do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		# Quote the path so file names containing spaces do not break the shell command.
		sys.system "{nlp_command} -file \"{input}\""
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# All `inputs` are submitted to StanfordNLP in a single run through a
	# generated file list; the XML results are read back from `output_dir`.
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
		# Prepare the input file list, one path per line.
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar on the whole list at once.
		# Quote the paths so spaces do not break the shell command.
		sys.system "{nlp_command} -filelist \"{input_file}\" -outputDirectory \"{output_dir}\""
		# Parse output: one `{basename}.xml` per input file in `output_dir`.
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = output_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		return map
	end
end
77
# A `Document` represent a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# Only `XMLStartTag` children of the `sentences` node are parsed;
	# anything else triggers a warning on stdout and is skipped.
	#
	# ~~~
	# var xml = """
	# <root>
	#  <document>
	#   <sentences>
	#    <sentence id="1">
	#     <tokens>
	#      <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#      <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#     </tokens>
	#    </sentence>
	#    <sentence id="2">
	#     <tokens>
	#      <token id="1">
	#       <word>UQAM</word>
	#       <lemma>UQAM</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#      <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#     </tokens>
	#    </sentence>
	#   </sentences>
	#  </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# FIX: the original warning misspelled `sentence` as `sencence`.
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	#
	# NOTE(review): assumes the file starts with exactly two non-XML-body
	# header lines (the doctype and the xslt link emitted by StanfordNLP) —
	# confirm against the CoreNLP version in use.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
158
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	#
	# Taken from the `id` attribute of the `sentence` XML tag (1-based in
	# StanfordNLP output, as shown in the example below).
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# Reads `index` from the first attribute of `xml`, then one `NLPToken`
	# per `token` child of the `tokens` node. Non-tag children produce a
	# warning on stdout and are skipped.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#    <token id="1">
	#     <word>Stanford</word>
	#     <lemma>Stanford</lemma>
	#     <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#     <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#     <POS>NNP</POS>
	#    </token>
	#    <token id="2">
	#     <word>University</word>
	#     <lemma>University</lemma>
	#     <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#     <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#     <POS>NNP</POS>
	#    </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# assumes the first attribute of `sentence` is `id` — TODO confirm
		# that StanfordNLP never emits another attribute first.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		# Delegate to the core initializer last: `tokens` is filled above,
		# then `index` is set here.
		init(index)
	end
end
207
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# The token index comes from the `id` attribute; every other field is
	# read from the text data of the matching child tag.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#  <word>University</word>
	#  <lemma>University</lemma>
	#  <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#  <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#  <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The `id` attribute of the `token` tag.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		# Text content of each child tag of interest.
		var word_str = xml["word"].first.as(XMLStartTag).data
		var lemma_str = xml["lemma"].first.as(XMLStartTag).data
		var begin_pos = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
		var end_pos = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
		var pos_tag = xml["POS"].first.as(XMLStartTag).data
		init(id, word_str, lemma_str, begin_pos, end_pos, pos_tag)
	end
end