manual: CI check with nitunit
diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
index 734a228..a637d53 100644
--- a/lib/nlp/stanford.nit
+++ b/lib/nlp/stanford.nit
@@ -19,20 +19,50 @@ module stanford
 
 import opts
 import dom
+import curl
+import pthreads
 
-# Wrapper around StanfordNLP jar.
+# Natural Language Processor
 #
-# NLPProcessor provides natural language processing of input text files and
-# an API to handle analysis results.
+# NLPProcessor provides natural language processing for input text and files.
+# Analyzed documents can be manipulated through the resulting NLPDocument.
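+#
+# A minimal usage sketch (assuming a concrete implementation, here an `NLPClient`
+# with an illustrative URI and file path):
+#
+# ~~~nitish
+# var nlp: NLPProcessor = new NLPClient("http://localhost:9000")
+# var doc = nlp.process_file("corpus/example.txt")
+# ~~~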
+interface NLPProcessor
+
+       # Creates a new NLPDocument from a string
+       fun process(string: String): NLPDocument is abstract
+
+       # Creates a new NLPDocument from the content of a file
+       fun process_file(path: String): NLPDocument do
+               var content = path.to_path.read_all
+               return process(content)
+       end
+
+       # Creates a new NLPDocument from a list of files (batch mode)
+       #
+       # Returns a map associating each file path with its NLPDocument.
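+       #
+       # A minimal sketch (assuming `nlp` is a concrete `NLPProcessor`; the
+       # paths are illustrative):
+       #
+       # ~~~nitish
+       # var docs = nlp.process_files(["corpus/a.txt", "corpus/b.txt"])
+       # var doc = docs["corpus/a.txt"]
+       # ~~~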
+       fun process_files(paths: Array[String]): Map[String, NLPDocument] do
+               var res = new HashMap[String, NLPDocument]
+               for file in paths do
+                       res[file] = process_file(file)
+               end
+               return res
+       end
+end
+
+# Wrapper around StanfordNLP jar.
 #
 # FIXME this should use the Java FFI.
-class NLPProcessor
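+#
+# A minimal usage sketch (the classpath is illustrative and must point to the
+# StanfordNLP jars):
+#
+# ~~~nitish
+# var proc = new NLPJavaProcessor("/path/to/stanford-corenlp/*")
+# var doc = proc.process("Hello world!")
+# ~~~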
+class NLPJavaProcessor
+       super NLPProcessor
 
        # Classpath to give to Java when loading the StanfordNLP jars.
        var java_cp: String
 
+       # Temp dir used to store batch results
+       var tmp_dir = ".nlp"
+
        # Process a string and return a new NLPDocument from this.
-       fun process(string: String): NLPDocument do
+       redef fun process(string) do
                var tmp_file = ".nlp.in"
                var file = new FileWriter.open(tmp_file)
                file.write string
@@ -43,7 +73,7 @@ class NLPProcessor
        end
 
        # Process the `input` file and return a new NLPDocument from this.
-       fun process_file(input: String): NLPDocument do
+       redef fun process_file(input) do
                # TODO opt annotators
                var tmp_file = "{input.basename}.xml"
                sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
@@ -55,7 +85,7 @@ class NLPProcessor
        # Batch mode.
        #
        # Returns a map associating each file path with its NLPDocument.
-       fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+       redef fun process_files(inputs) do
                # Prepare the input file list
                var input_file = "inputs.list"
                var fw = new FileWriter.open(input_file)
@@ -63,14 +93,15 @@ class NLPProcessor
                fw.close
 
                # Run Stanford NLP jar
-               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+               sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
                # Parse output
                var map = new HashMap[String, NLPDocument]
                for input in inputs do
-                       var out_file = output_dir / "{input.basename}.xml"
+                       var out_file = tmp_dir / "{input.basename}.xml"
                        map[input] = new NLPDocument.from_xml_file(out_file)
                end
                input_file.file_delete
+               tmp_dir.rmdir
                return map
        end
 end
@@ -248,11 +279,99 @@ class NLPToken
        # ~~~
        init from_xml(xml: XMLStartTag) do
                var index = xml.attributes.first.as(XMLStringAttr).value.to_i
-               var word = xml["word"].first.as(XMLStartTag).data
-               var lemma = xml["lemma"].first.as(XMLStartTag).data
-               var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
-               var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
-               var pos = xml["POS"].first.as(XMLStartTag).data
+               var word = read_data(xml, "word")
+               var lemma = read_data(xml, "lemma")
+               var begin_offset = read_data(xml, "CharacterOffsetBegin").to_i
+               var end_offset = read_data(xml, "CharacterOffsetEnd").to_i
+               var pos = read_data(xml, "POS")
                init(index, word, lemma, begin_offset, end_offset, pos)
        end
+
+       # Read the text data of the first child of `xml` matching `tag_name`.
+       #
+       # Returns an empty string if the tag is missing, empty or not a start tag.
+       private fun read_data(xml: XMLStartTag, tag_name: String): String do
+               var res = ""
+               if xml[tag_name].is_empty then return res
+               var first = xml[tag_name].first
+               if not first isa XMLStartTag then return res
+               var data = first.data
+               if data == null then return res
+               return data
+       end
+end
+
+# Stanford web server
+#
+# Runs the server on `port`.
+#
+# For more details about the Stanford NLP server, see
+# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
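+#
+# A minimal sketch (the classpath is illustrative; `start` launches the server
+# in its own thread):
+#
+# ~~~nitish
+# var srv = new NLPServer("/path/to/stanford-corenlp/*", 9000)
+# srv.start
+# ~~~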
+class NLPServer
+       super Thread
+
+       # Stanford jar classpath
+       #
+       # Classpath to give to Java when loading the StanfordNLP jars.
+       var java_cp: String
+
+       # Port the Java server will listen on
+       var port: Int
+
+       redef fun main do
+               sys.system "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
+               return null
+       end
+end
+
+# An NLPProcessor using an NLPServer as backend
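+#
+# A minimal usage sketch (assuming an `NLPServer` is listening on the given URI):
+#
+# ~~~nitish
+# var nlp = new NLPClient("http://localhost:9000")
+# var doc = nlp.process("The cat sat on the mat.")
+# ~~~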
+class NLPClient
+       super NLPProcessor
+
+       # Base URI of the NLP server API
+       #
+       # For example: "http://localhost:9000" or "https://myserver.com"
+       var api_uri: String
+
+       # Annotators to use
+       #
+       # The specified annotators must exist on the server.
+       #
+       # Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
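+       #
+       # For instance, to only tokenize and split sentences (assuming `nlp` is
+       # an `NLPClient`):
+       #
+       # ~~~nitish
+       # nlp.annotators = ["tokenize", "ssplit"]
+       # ~~~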
+       var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable
+
+       # Language to process
+       #
+       # The language must be available on the server.
+       #
+       # Default is `en`.
+       var language = "en" is writable
+
+       # Output format to request.
+       #
+       # Only `xml` is implemented at the moment.
+       private var format = "xml"
+
+       # API URI used to build curl POST requests
+       #
+       # The `properties` parameter is the URL-encoded JSON object
+       # `{"annotators": "<annotators>", "outputFormat": "<format>"}`.
+       fun post_uri: String do
+               var anns = annotators.join("%2C")
+               return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{anns}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
+       end
+
+       redef fun process(string) do
+               var request = new CurlHTTPRequest(post_uri)
+               request.body = string
+               var response = request.execute
+               if response isa CurlResponseSuccess then
+                       if response.status_code != 200 then
+                               print "Error: {response.body_str}"
+                               return new NLPDocument
+                       end
+                       var xml = response.body_str.to_xml
+                       if xml isa XMLError then
+                               print "Error: {xml}"
+                               return new NLPDocument
+                       end
+                       return new NLPDocument.from_xml(xml.as(XMLDocument))
+               else if response isa CurlResponseFailed then
+                       print "Error: {response.error_msg}"
+                       return new NLPDocument
+               end
+               return new NLPDocument
+       end
 end