1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Natural Language Processor based on the StanfordNLP core.
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
# Natural Language Processor
#
# NLPProcessor provides natural language processing for input text and files.
# Analyzed documents can be manipulated through the resulting NLPDocument.
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from a file content
	fun process_file(path: String): NLPDocument do
		var content = path.to_path.read_all
		return process(content)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var res = new HashMap[String, NLPDocument]
		# Reconstructed loop: the visible chunk used `file` without declaring it
		# and never returned `res`.
		for file in paths do
			res[file] = process_file(file)
		end
		return res
	end
end
# Wrapper around StanfordNLP jar.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	#
	# NOTE(review): declaration reconstructed — the attribute line is missing
	# from this chunk, but `java_cp` is interpolated in both commands below.
	var java_cp: String

	# Temp dir used to store batch results
	#
	# NOTE(review): declaration reconstructed; the default value ".nlp" is
	# assumed — confirm against the original. `tmp_dir` is used by `process_files`.
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	redef fun process(string) do
		# Write the input to a temporary file so the jar can read it.
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	redef fun process_file(input) do
		# CoreNLP writes its XML output as `<basename>.xml` in the working dir.
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar in batch mode (-filelist), outputs go to `tmp_dir`.
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"

		# Parse one XML output file per input.
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		return map
	end
end
109 # A `Document` represent a text input given to the NLP processor.
111 # Once processed, it contains a list of sentences that contain tokens.
# NLPSentences contained in `self`
#
# Filled by `from_xml`, which appends one `NLPSentence` per `<sentence>` XML element.
var sentences
= new Array[NLPSentence]
117 # Init `self` from an xml element.
127 # <word>Stanford</word>
128 # <lemma>Stanford</lemma>
129 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
130 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
134 # <word>University</word>
135 # <lemma>University</lemma>
136 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
137 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
146 # <lemma>UQAM</lemma>
147 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
148 # <CharacterOffsetEnd>4</CharacterOffsetEnd>
152 # <word>University</word>
153 # <lemma>University</lemma>
154 # <CharacterOffsetBegin>5</CharacterOffsetBegin>
155 # <CharacterOffsetEnd>15</CharacterOffsetEnd>
162 # </root>""".to_xml.as(XMLDocument)
164 # var document = new NLPDocument.from_xml(xml)
165 # assert document.sentences.length == 2
166 # assert document.sentences.first.tokens.first.word == "Stanford"
167 # assert document.sentences.last.tokens.first.word == "UQAM"
# Init `self` from an xml element.
#
# Walks `<root><document><sentences>` and builds one `NLPSentence`
# per `<sentence>` start tag; other node kinds trigger a warning.
init from_xml(xml: XMLDocument) do
	for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
		if obj isa XMLStartTag then
			sentences.add new NLPSentence.from_xml(obj)
		else
			# Fixed typo in the warning message ("sencence" -> "sentence").
			print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
		end
	end
end
# Init `self` from a XML file.
#
# Reads the whole file, strips the two header lines (doctype and xslt link),
# then delegates to `from_xml`.
init from_xml_file(path: String) do
	var file = new FileReader.open(path)
	var xml = file.read_lines
	# Close the reader once the lines are in memory (the visible chunk leaked it).
	file.close
	xml.shift # remove xml doctype
	xml.shift # remove xslt link
	from_xml(xml.join("\n").to_xml.as(XMLDocument))
end
190 # Represent one sentence in a `Document`.
193 # Index of this sentence in the input text.
# NLPTokens contained in `self`.
#
# Filled by `from_xml`, which appends one `NLPToken` per `<token>` XML element.
var tokens
= new Array[NLPToken]
199 # Init `self` from an XML element.
206 # <word>Stanford</word>
207 # <lemma>Stanford</lemma>
208 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
209 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
213 # <word>University</word>
214 # <lemma>University</lemma>
215 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
216 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
220 # </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
222 # var sentence = new NLPSentence.from_xml(xml)
223 # assert sentence.index == 1
224 # assert sentence.tokens.length == 2
# Init `self` from an XML element.
#
# The sentence index is carried by the first XML attribute; tokens are
# parsed from the `<tokens>` child, one `NLPToken` per `<token>` start tag.
init from_xml(xml: XMLStartTag) do
	var index = xml.attributes.first.as(XMLStringAttr).value.to_i
	for obj in xml["tokens"].first["token"] do
		if obj isa XMLStartTag then
			tokens.add new NLPToken.from_xml(obj)
		else
			print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
		end
	end
	# Reconstructed: the visible chunk computed `index` but never used it;
	# forwarding to the primary init mirrors `NLPToken.from_xml` below.
	init(index)
end
239 # Represent one word (or puncutation mark) in a `NLPSentence`.
242 # Index of this word in the sentence.
# Position of the first character in the input
#
# Parsed from the `CharacterOffsetBegin` XML tag (see `from_xml`).
var begin_offset
: Int
254 # Position of the last character in the input
260 # Init `self` from an XML element.
265 # <word>University</word>
266 # <lemma>University</lemma>
267 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
268 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
270 # </token>""".to_xml["token"].first.as(XMLStartTag)
272 # var token = new NLPToken.from_xml(xml)
273 # assert token.index == 2
274 # assert token.word == "University"
275 # assert token.lemma == "University"
276 # assert token.begin_offset == 9
277 # assert token.end_offset == 19
278 # assert token.pos == "NNP"
# Init `self` from an XML element.
#
# The token index is carried by the first XML attribute; the remaining
# fields are extracted from the child tags via `read_data`.
init from_xml(xml: XMLStartTag) do
	var index = xml.attributes.first.as(XMLStringAttr).value.to_i
	var word = read_data(xml, "word")
	var lemma = read_data(xml, "lemma")
	var begin_offset = read_data(xml, "CharacterOffsetBegin").to_i
	var end_offset = read_data(xml, "CharacterOffsetEnd").to_i
	var pos = read_data(xml, "POS")
	init(index, word, lemma, begin_offset, end_offset, pos)
end
# Read the text data contained in the first `tag_name` child of `xml`.
#
# Returns an empty string when the tag is missing, is not a start tag,
# or carries no text data.
private fun read_data(xml: XMLStartTag, tag_name: String): String do
	# Reconstructed: the visible chunk returned `res` without declaring it.
	var res = ""
	if xml[tag_name].is_empty then return res
	var first = xml[tag_name].first
	if not first isa XMLStartTag then return res
	var data = first.data
	if data == null then return res
	return data
end
301 # Stanford web server
303 # Runs the server on `port`.
305 # For more details about the stanford NLP server see
306 # https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
310 # Stanford jar classpath
312 # Classpath to give to Java when loading the StanfordNLP jars.
315 # Port the Java server will listen on
# Launch the CoreNLP server process; `sys.system` blocks until it exits.
# NOTE(review): the enclosing method/class declaration is not visible in this
# chunk; `java_cp` and `port` are presumably attributes of the server class —
# confirm against the full file.
sys
.system
"java -mx4g -cp \"{java_cp}\
" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
324 # A NLPProcessor using a NLPServer as backend
# Base uri of the NLP server API
#
# For example "http://localhost:9000" or "https://myserver.com"
#
# NOTE(review): the `api_uri` attribute declaration itself is not visible in
# this chunk — confirm against the full file.

# Annotators to use
#
# The specified annotators must exist on the server.
#
# Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
var annotators
: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable

# Language to process
#
# The language must be available on the server.
var language
= "en" is writable

# Output format to ask.
#
# Only `xml` is implemented at the moment.
private var format
= "xml"
# API uri used to build curl POST requests
#
# The properties JSON is URL-encoded:
# {"annotators": "<annotators>", "outputFormat": "<format>"}
fun post_uri: String do
	# Use the writable `annotators` attribute instead of a hard-coded list so
	# redefinitions actually reach the server; with the default annotators the
	# produced uri is equivalent to the previous hard-coded one ("%2C" and
	# "%2c" are the same percent-encoded comma).
	var annotators_enc = annotators.join("%2C")
	return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{annotators_enc}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
end
# Send `string` to the CoreNLP server and parse the XML response.
#
# Returns an empty `NLPDocument` on any transport, HTTP, or XML error
# (the error is printed, not raised).
redef fun process(string) do
	var request = new CurlHTTPRequest(post_uri)
	request.body = string
	var response = request.execute
	if response isa CurlResponseSuccess then
		if response.status_code != 200 then
			print "Error: {response.body_str}"
			return new NLPDocument
		end
		var xml = response.body_str.to_xml
		if xml isa XMLError then
			# Reconstructed branch body: the visible chunk fell through here.
			print "Error: {response.body_str}"
			return new NLPDocument
		end
		# Reuse the already-parsed `xml` instead of parsing the body a second time.
		return new NLPDocument.from_xml(xml.as(XMLDocument))
	else if response isa CurlResponseFailed then
		print "Error: {response.error_msg}"
		return new NLPDocument
	end
	return new NLPDocument
end