Merge: doc: fixed some typos and other misc. corrections
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22 import curl
23 import pthreads
24
# Natural Language Processor
#
# A NLPProcessor turns raw text — given as strings or as files — into
# analyzable `NLPDocument` instances.
interface NLPProcessor

	# Analyze `string` and wrap the result in a new NLPDocument.
	fun process(string: String): NLPDocument is abstract

	# Analyze the whole content of the file at `path`.
	fun process_file(path: String): NLPDocument do
		return process(path.to_path.read_all)
	end

	# Analyze a batch of files (batch mode).
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for p in paths do docs[p] = process_file(p)
		return docs
	end
end
51
# Wrapper around StanfordNLP jar.
#
# Spawns `java` in a subprocess to run the CoreNLP pipeline.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	#
	# The string is dumped to a temporary file so it can be fed to the jar.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	redef fun process_file(input) do
		# TODO opt annotators
		# CoreNLP names its XML output after the input file.
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
			# Delete each intermediate XML once parsed, otherwise `tmp_dir`
			# is not empty and the `rmdir` below fails, leaking temp files.
			out_file.file_delete
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end
108
# A `Document` represents a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# Fixed typo in the warning message ("sencence" -> "sentence").
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
189
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The sentence index comes from the `id` attribute.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		for node in xml["tokens"].first["token"] do
			if not node isa XMLStartTag then
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			else
				tokens.add new NLPToken.from_xml(node)
			end
		end
		init(id)
	end
end
238
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The token index comes from the `id` attribute; the rest of the
		# fields are child elements holding text data.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		var w = read_data(xml, "word")
		var l = read_data(xml, "lemma")
		var from = read_data(xml, "CharacterOffsetBegin").to_i
		var to = read_data(xml, "CharacterOffsetEnd").to_i
		var tag = read_data(xml, "POS")
		init(id, w, l, from, to, tag)
	end

	# Text content of the first `tag_name` child of `xml`, or "" if absent.
	private fun read_data(xml: XMLStartTag, tag_name: String): String do
		var matches = xml[tag_name]
		if matches.is_empty then return ""
		var node = matches.first
		if node isa XMLStartTag then
			var data = node.data
			if data != null then return data
		end
		return ""
	end
end
300
# Stanford web server
#
# Runs the server on `port`.
#
# For more details about the stanford NLP server see
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
class NLPServer
	super Thread

	# Stanford jar classpath
	#
	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Port the Java server will listen on
	var port: Int

	# Thread entry point: blocks for as long as the Java server runs.
	redef fun main do
		var cmd = "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
		sys.system cmd
		return null
	end
end
323
# A NLPProcessor using a NLPServer as backend
class NLPClient
	super NLPProcessor

	# Base uri of the NLP server API
	#
	# For examples "http://localhost:9000" or "https://myserver.com"
	var api_uri: String

	# Annotators to use
	#
	# The specified annotators must exist on the server.
	#
	# Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
	var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable

	# Language to process
	#
	# The language must be available on the server.
	#
	# Default is `en`.
	var language = "en" is writable

	# Output format to ask.
	#
	# Only `xml` is implemented at the moment.
	private var format = "xml"

	# API uri used to build curl POST requests
	#
	# The `properties` query parameter is a URL-encoded JSON object:
	# `{"annotators": "...", "outputFormat": "..."}`.
	fun post_uri: String do
		# Use the configured `annotators` instead of a hard-coded list;
		# join with "%2C", the URL-encoded comma.
		var anns = annotators.join("%2C")
		return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{anns}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
	end

	redef fun process(string) do
		var request = new CurlHTTPRequest(post_uri)
		request.body = string
		var response = request.execute
		if response isa CurlResponseSuccess then
			if response.status_code != 200 then
				print "Error: {response.body_str}"
				return new NLPDocument
			end
			# Parse the body once and bail out on XML errors instead of
			# re-parsing and aborting on an invalid `XMLDocument` cast.
			var xml = response.body_str.to_xml
			if xml isa XMLError then
				print xml
				return new NLPDocument
			end
			return new NLPDocument.from_xml(xml.as(XMLDocument))
		else if response isa CurlResponseFailed then
			print "Error: {response.error_msg}"
			return new NLPDocument
		end
		return new NLPDocument
	end
end
377 end