Merge: doc: fixed some typos and other misc. corrections
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22 import curl
23 import pthreads
24
# Natural Language Processor
#
# A NLPProcessor turns raw text — given as strings or as files — into
# analyzable `NLPDocument` instances.
interface NLPProcessor

	# Analyze `string` and wrap the result in a new NLPDocument.
	fun process(string: String): NLPDocument is abstract

	# Analyze the whole content of the file at `path`.
	fun process_file(path: String): NLPDocument do
		return process(path.to_path.read_all)
	end

	# Analyze a batch of files (batch mode).
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for p in paths do docs[p] = process_file(p)
		return docs
	end
end
51
# Wrapper around StanfordNLP jar.
#
# Spawns `java` in a subprocess to run the CoreNLP pipeline.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	#
	# The string is dumped to a temporary file so it can be fed to the jar.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	redef fun process_file(input) do
		# TODO opt annotators
		# CoreNLP names its XML output after the input file.
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"
		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
			# Delete each intermediate XML once parsed, otherwise `tmp_dir`
			# is not empty and the `rmdir` below fails, leaking temp files.
			out_file.file_delete
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end
108
# A `Document` represents a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# Fixed typo in the warning message ("sencence" -> "sentence").
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
189
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The sentence index comes from the `id` attribute.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		for node in xml["tokens"].first["token"] do
			if not node isa XMLStartTag then
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			else
				tokens.add new NLPToken.from_xml(node)
			end
		end
		init(id)
	end
end
238
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The token index comes from the `id` attribute; the rest of the
		# fields are child elements holding text data.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		var w = read_data(xml, "word")
		var l = read_data(xml, "lemma")
		var from = read_data(xml, "CharacterOffsetBegin").to_i
		var to = read_data(xml, "CharacterOffsetEnd").to_i
		var tag = read_data(xml, "POS")
		init(id, w, l, from, to, tag)
	end

	# Text content of the first `tag_name` child of `xml`, or "" if absent.
	private fun read_data(xml: XMLStartTag, tag_name: String): String do
		var matches = xml[tag_name]
		if matches.is_empty then return ""
		var node = matches.first
		if node isa XMLStartTag then
			var data = node.data
			if data != null then return data
		end
		return ""
	end
end
300
# Stanford web server
#
# Runs the server on `port`.
#
# For more details about the stanford NLP server see
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
class NLPServer
	super Thread

	# Stanford jar classpath
	#
	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Port the Java server will listen on
	var port: Int

	# Thread entry point: blocks for as long as the Java server runs.
	redef fun main do
		var cmd = "java -mx4g -cp \"{java_cp}\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {port.to_s} -timeout 15000"
		sys.system cmd
		return null
	end
end
323
# A NLPProcessor using a NLPServer as backend
class NLPClient
	super NLPProcessor

	# Base uri of the NLP server API
	#
	# For examples "http://localhost:9000" or "https://myserver.com"
	var api_uri: String

	# Annotators to use
	#
	# The specified annotators must exist on the server.
	#
	# Defaults are: `tokenize`, `ssplit`, `pos` and `lemma`.
	var annotators: Array[String] = ["tokenize", "ssplit", "pos", "lemma"] is writable

	# Language to process
	#
	# The language must be available on the server.
	#
	# Default is `en`.
	var language = "en" is writable

	# Output format to ask.
	#
	# Only `xml` is implemented at the moment.
	private var format = "xml"

	# API uri used to build curl POST requests
	#
	# The `properties` query parameter is a URL-encoded JSON object:
	# `{"annotators": "...", "outputFormat": "..."}`.
	fun post_uri: String do
		# Use the configured `annotators` instead of a hard-coded list;
		# join with "%2C", the URL-encoded comma.
		var anns = annotators.join("%2C")
		return "{api_uri}/?properties=%7B%22annotators%22%3A%20%22{anns}%22%2C%22outputFormat%22%3A%22{format}%22%7D&pipelineLanguage={language}"
	end

	redef fun process(string) do
		var request = new CurlHTTPRequest(post_uri)
		request.body = string
		var response = request.execute
		if response isa CurlResponseSuccess then
			if response.status_code != 200 then
				print "Error: {response.body_str}"
				return new NLPDocument
			end
			# Parse the body once and bail out on XML errors instead of
			# re-parsing and aborting on an invalid `XMLDocument` cast.
			var xml = response.body_str.to_xml
			if xml isa XMLError then
				print xml
				return new NLPDocument
			end
			return new NLPDocument.from_xml(xml.as(XMLDocument))
		else if response isa CurlResponseFailed then
			print "Error: {response.error_msg}"
			return new NLPDocument
		end
		return new NLPDocument
	end
end
377 end