29058b3b1937ec744c2a904af5f390c529087b60
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22
# Natural Language Processor
#
# A `NLPProcessor` analyzes plain text or files and exposes the result
# as a manipulable `NLPDocument`.
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from a file content
	fun process_file(path: String): NLPDocument do
		return process(path.to_path.read_all)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for file_path in paths do docs[file_path] = process_file(file_path)
		return docs
	end
end
49
# Wrapper around StanfordNLP jar.
#
# Shells out to the CoreNLP pipeline (`tokenize,ssplit,pos,lemma`) through
# `sys.system`, then parses the XML it produces.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	#
	# The string is dumped into a temporary file that is deleted once processed.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	#
	# CoreNLP writes its result next to the current directory as
	# `{input.basename}.xml`; that file is parsed then deleted.
	redef fun process_file(input) do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		# `input` is quoted so paths containing spaces do not break the command.
		# NOTE(review): the path is still interpolated into a shell line — a path
		# containing quotes or shell metacharacters would need real escaping.
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file \"{input}\""
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist \"{input_file}\" -outputDirectory \"{tmp_dir}\""
		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
			# Delete each result file now so `tmp_dir` is empty when removed
			# below: `rmdir` fails on a non-empty directory, which previously
			# left the whole temp directory (and its files) behind.
			out_file.file_delete
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end
106
# A `Document` represent a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# FIX: the warning used to say "sencence" (typo for "sentence").
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	#
	# Reads the whole file then delegates to `from_xml`.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		# NOTE(review): blindly drops the first two lines, assumed to be the
		# XML doctype and the xslt stylesheet link emitted by CoreNLP — this
		# breaks on files that lack either line; confirm against real output.
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
187
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# The `id` attribute of the `<sentence>` tag becomes `index`, and each
	# `<token>` child of `<tokens>` becomes a `NLPToken`.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# Assumes the first attribute is `id` — TODO confirm CoreNLP never
		# emits another attribute before it.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		# Delegate to the default initializer to set the `index` attribute.
		init(index)
	end
end
236
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# Reads the `id` attribute plus the `word`, `lemma`,
	# `CharacterOffsetBegin`, `CharacterOffsetEnd` and `POS` child tags
	# produced by CoreNLP for a `<token>`.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# Assumes the first attribute is `id` — TODO confirm CoreNLP never
		# emits another attribute before it.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		var word = xml["word"].first.as(XMLStartTag).data
		var lemma = xml["lemma"].first.as(XMLStartTag).data
		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
		var pos = xml["POS"].first.as(XMLStartTag).data
		# Delegate to the default initializer to set all attributes at once.
		init(index, word, lemma, begin_offset, end_offset, pos)
	end
end