From 3a1331646498f83f5861a7ff6b48bd6df31c9de1 Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa
Date: Wed, 20 Sep 2017 18:06:37 -0400
Subject: [PATCH] lib/nlp: extract NLPProcessor from the Java wrapper

Signed-off-by: Alexandre Terrasa
---
 lib/nlp/nitnlp.nit   |    2 +-
 lib/nlp/stanford.nit |   47 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/lib/nlp/nitnlp.nit b/lib/nlp/nitnlp.nit
index 72946bc..bbf7d53 100644
--- a/lib/nlp/nitnlp.nit
+++ b/lib/nlp/nitnlp.nit
@@ -33,7 +33,7 @@ var arguments = options.rest
 # Processor initialization
 var java_cp = opt_java_cp.value
 if java_cp == null then java_cp = "*"
-var proc = new NLPProcessor(java_cp)
+var proc = new NLPJavaProcessor(java_cp)
 
 if arguments.length != 2 then
 	print "Usage: nitnlp text1 text2\n"
diff --git a/lib/nlp/stanford.nit b/lib/nlp/stanford.nit
index 734a228..29058b3 100644
--- a/lib/nlp/stanford.nit
+++ b/lib/nlp/stanford.nit
@@ -20,19 +20,47 @@ module stanford
 import opts
 import dom
 
-# Wrapper around StanfordNLP jar.
+# Natural Language Processor
 #
-# NLPProcessor provides natural language processing of input text files and
-# an API to handle analysis results.
+# NLPProcessor provides natural language processing for input text and files.
+# Analyzed documents can be manipulated through the resulting NLPDocument.
+interface NLPProcessor
+
+	# Creates a new NLPDocument from a string
+	fun process(string: String): NLPDocument is abstract
+
+	# Creates a new NLPDocument from a file content
+	fun process_file(path: String): NLPDocument do
+		var content = path.to_path.read_all
+		return process(content)
+	end
+
+	# Creates a new NLPDocument from a list of files (batch mode)
+	#
+	# Returns a map of file path associated with their NLPDocument.
+	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
+		var res = new HashMap[String, NLPDocument]
+		for file in paths do
+			res[file] = process_file(file)
+		end
+		return res
+	end
+end
+
+# Wrapper around StanfordNLP jar.
 #
 # FIXME this should use the Java FFI.
-class NLPProcessor
+class NLPJavaProcessor
+	super NLPProcessor
 
 	# Classpath to give to Java when loading the StanfordNLP jars.
 	var java_cp: String
 
+	# Temp dir used to store batch results
+	var tmp_dir = ".nlp"
+
 	# Process a string and return a new NLPDocument from this.
-	fun process(string: String): NLPDocument do
+	redef fun process(string) do
 		var tmp_file = ".nlp.in"
 		var file = new FileWriter.open(tmp_file)
 		file.write string
@@ -43,7 +71,7 @@ class NLPProcessor
 	end
 
 	# Process the `input` file and return a new NLPDocument from this.
-	fun process_file(input: String): NLPDocument do
+	redef fun process_file(input) do
 		# TODO opt annotators
 		var tmp_file = "{input.basename}.xml"
 		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
@@ -55,7 +83,7 @@ class NLPProcessor
 	# Batch mode.
 	#
 	# Returns a map of file path associated with their NLPDocument.
-	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
+	redef fun process_files(inputs) do
 		# Prepare the input file list
 		var input_file = "inputs.list"
 		var fw = new FileWriter.open(input_file)
@@ -63,14 +91,15 @@ class NLPProcessor
 		fw.close
 
 		# Run Stanford NLP jar
-		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
+		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
 		# Parse output
 		var map = new HashMap[String, NLPDocument]
 		for input in inputs do
-			var out_file = output_dir / "{input.basename}.xml"
+			var out_file = tmp_dir / "{input.basename}.xml"
 			map[input] = new NLPDocument.from_xml_file(out_file)
 		end
 		input_file.file_delete
+		tmp_dir.rmdir
 		return map
 	end
 end
-- 
1.7.9.5
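Usage sketch (not part of the patch): after this change, client code can be typed against the
NLPProcessor interface while still instantiating the Java-backed wrapper. Only
`new NLPJavaProcessor(java_cp)`, `process` and `process_file` come from the patch itself; the
import name, classpath and file paths below are assumptions for illustration.

	import stanford	# assumed entry point; the patch declares `module stanford`

	# Hypothetical classpath pointing at the StanfordNLP jars.
	var cp = "/path/to/stanford-corenlp/*"

	# Depend on the abstraction, instantiate the Java wrapper.
	var proc: NLPProcessor = new NLPJavaProcessor(cp)

	# Both entry points return an NLPDocument for further analysis.
	var doc = proc.process("The quick brown fox jumps over the lazy dog.")
	var doc_from_file = proc.process_file("corpus/article.txt")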