29058b3b1937ec744c2a904af5f390c529087b60
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22
# Natural Language Processor
#
# A `NLPProcessor` analyzes plain text or files and exposes the result
# as a manipulable `NLPDocument`.
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from a file content
	fun process_file(path: String): NLPDocument do
		return process(path.to_path.read_all)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for file_path in paths do docs[file_path] = process_file(file_path)
		return docs
	end
end
49
# Wrapper around StanfordNLP jar.
#
# Shells out to the CoreNLP pipeline (`tokenize,ssplit,pos,lemma`) through
# `sys.system`, then parses the XML it produces.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Temp dir used to store batch results
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	#
	# The string is dumped into a temporary file that is deleted once processed.
	redef fun process(string) do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	#
	# CoreNLP writes its result next to the current directory as
	# `{input.basename}.xml`; that file is parsed then deleted.
	redef fun process_file(input) do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		# `input` is quoted so paths containing spaces do not break the command.
		# NOTE(review): the path is still interpolated into a shell line — a path
		# containing quotes or shell metacharacters would need real escaping.
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file \"{input}\""
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist \"{input_file}\" -outputDirectory \"{tmp_dir}\""
		# Parse output
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
			# Delete each result file now so `tmp_dir` is empty when removed
			# below: `rmdir` fails on a non-empty directory, which previously
			# left the whole temp directory (and its files) behind.
			out_file.file_delete
		end
		input_file.file_delete
		tmp_dir.rmdir
		return map
	end
end
106
# A `Document` represent a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# ~~~
	# var xml = """
	# <root>
	#   <document>
	#     <sentences>
	#       <sentence id="1">
	#         <tokens>
	#           <token id="1">
	#             <word>Stanford</word>
	#             <lemma>Stanford</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#       <sentence id="2">
	#         <tokens>
	#           <token id="1">
	#             <word>UQAM</word>
	#             <lemma>UQAM</lemma>
	#             <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#           <token id="2">
	#             <word>University</word>
	#             <lemma>University</lemma>
	#             <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#             <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#             <POS>NNP</POS>
	#           </token>
	#         </tokens>
	#       </sentence>
	#     </sentences>
	#   </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# FIX: the warning used to say "sencence" (typo for "sentence").
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	#
	# Reads the whole file then delegates to `from_xml`.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		# NOTE(review): blindly drops the first two lines, assumed to be the
		# XML doctype and the xslt stylesheet link emitted by CoreNLP — this
		# breaks on files that lack either line; confirm against real output.
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
187
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# The `id` attribute of the `<sentence>` tag becomes `index`, and each
	# `<token>` child of `<tokens>` becomes a `NLPToken`.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#     <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#     <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#     </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# Assumes the first attribute is `id` — TODO confirm CoreNLP never
		# emits another attribute before it.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		# Delegate to the default initializer to set the `index` attribute.
		init(index)
	end
end
236
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# Reads the `id` attribute plus the `word`, `lemma`,
	# `CharacterOffsetBegin`, `CharacterOffsetEnd` and `POS` child tags
	# produced by CoreNLP for a `<token>`.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#   <word>University</word>
	#   <lemma>University</lemma>
	#   <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#   <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#   <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# Assumes the first attribute is `id` — TODO confirm CoreNLP never
		# emits another attribute before it.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		var word = xml["word"].first.as(XMLStartTag).data
		var lemma = xml["lemma"].first.as(XMLStartTag).data
		var begin_offset = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
		var end_offset = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
		var pos = xml["POS"].first.as(XMLStartTag).data
		# Delegate to the default initializer to set all attributes at once.
		init(index, word, lemma, begin_offset, end_offset, pos)
	end
end