lib/nlp: introduce StanfordNLP wrapper
[nit.git] / lib / nlp / stanford.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module stanford
19
20 import opts
21 import dom
22
# Wrapper around StanfordNLP jar.
#
# NLPProcessor provides natural language processing of input text files and
# an API to handle analysis results.
#
# FIXME this should use the Java FFI.
class NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	var java_cp: String

	# Base java command used to launch the StanfordNLP pipeline.
	#
	# Runs the `tokenize,ssplit,pos,lemma` annotators and produces XML output.
	# Callers append the input/output options for their use case.
	private fun nlp_command: String do
		return "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml"
	end

	# Process a string and return a new NLPDocument from this.
	#
	# The string is written to a temporary file in the working directory,
	# processed with `process_file`, then the temporary file is deleted.
	fun process(string: String): NLPDocument do
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	#
	# The StanfordNLP jar writes its XML result next to the working directory
	# as `{input.basename}.xml`; that file is parsed then deleted.
	fun process_file(input: String): NLPDocument do
		# TODO opt annotators
		var tmp_file = "{input.basename}.xml"
		# Quote the path so file names containing spaces do not break the shell command.
		sys.system "{nlp_command} -file \"{input}\""
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch mode.
	#
	# All `inputs` are submitted to StanfordNLP in a single run through a
	# generated file list; the XML results are read back from `output_dir`.
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(inputs: Collection[String], output_dir: String): Map[String, NLPDocument] do
		# Prepare the input file list, one path per line.
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar on the whole list at once.
		# Quote the paths so spaces do not break the shell command.
		sys.system "{nlp_command} -filelist \"{input_file}\" -outputDirectory \"{output_dir}\""
		# Parse output: one `{basename}.xml` per input file in `output_dir`.
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = output_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		return map
	end
end
77
# A `Document` represent a text input given to the NLP processor.
#
# Once processed, it contains a list of sentences that contain tokens.
class NLPDocument

	# NLPSentences contained in `self`
	var sentences = new Array[NLPSentence]

	# Init `self` from an xml element.
	#
	# Only `XMLStartTag` children of the `sentences` node are parsed;
	# anything else triggers a warning on stdout and is skipped.
	#
	# ~~~
	# var xml = """
	# <root>
	#  <document>
	#   <sentences>
	#    <sentence id="1">
	#     <tokens>
	#      <token id="1">
	#       <word>Stanford</word>
	#       <lemma>Stanford</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#      <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#     </tokens>
	#    </sentence>
	#    <sentence id="2">
	#     <tokens>
	#      <token id="1">
	#       <word>UQAM</word>
	#       <lemma>UQAM</lemma>
	#       <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>4</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#      <token id="2">
	#       <word>University</word>
	#       <lemma>University</lemma>
	#       <CharacterOffsetBegin>5</CharacterOffsetBegin>
	#       <CharacterOffsetEnd>15</CharacterOffsetEnd>
	#       <POS>NNP</POS>
	#      </token>
	#     </tokens>
	#    </sentence>
	#   </sentences>
	#  </document>
	# </root>""".to_xml.as(XMLDocument)
	#
	# var document = new NLPDocument.from_xml(xml)
	# assert document.sentences.length == 2
	# assert document.sentences.first.tokens.first.word == "Stanford"
	# assert document.sentences.last.tokens.first.word == "UQAM"
	# ~~~
	init from_xml(xml: XMLDocument) do
		for obj in xml["root"].first["document"].first["sentences"].first["sentence"] do
			if obj isa XMLStartTag then
				sentences.add new NLPSentence.from_xml(obj)
			else
				# FIX: the original warning misspelled `sentence` as `sencence`.
				print "Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
			end
		end
	end

	# Init `self` from a XML file.
	#
	# NOTE(review): assumes the file starts with exactly two non-XML-body
	# header lines (the doctype and the xslt link emitted by StanfordNLP) —
	# confirm against the CoreNLP version in use.
	init from_xml_file(path: String) do
		var file = new FileReader.open(path)
		var xml = file.read_lines
		file.close
		xml.shift # remove xml doctype
		xml.shift # remove xslt link
		from_xml(xml.join("\n").to_xml.as(XMLDocument))
	end
end
158
# Represent one sentence in a `Document`.
class NLPSentence

	# Index of this sentence in the input text.
	#
	# Taken from the `id` attribute of the `sentence` XML tag (1-based in
	# StanfordNLP output, as shown in the example below).
	var index: Int

	# NLPTokens contained in `self`.
	var tokens = new Array[NLPToken]

	# Init `self` from an XML element.
	#
	# Reads `index` from the first attribute of `xml`, then one `NLPToken`
	# per `token` child of the `tokens` node. Non-tag children produce a
	# warning on stdout and are skipped.
	#
	# ~~~
	# var xml = """
	# <sentence id="1">
	#   <tokens>
	#    <token id="1">
	#     <word>Stanford</word>
	#     <lemma>Stanford</lemma>
	#     <CharacterOffsetBegin>0</CharacterOffsetBegin>
	#     <CharacterOffsetEnd>8</CharacterOffsetEnd>
	#     <POS>NNP</POS>
	#    </token>
	#    <token id="2">
	#     <word>University</word>
	#     <lemma>University</lemma>
	#     <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#     <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#     <POS>NNP</POS>
	#    </token>
	#   </tokens>
	# </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
	#
	# var sentence = new NLPSentence.from_xml(xml)
	# assert sentence.index == 1
	# assert sentence.tokens.length == 2
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# assumes the first attribute of `sentence` is `id` — TODO confirm
		# that StanfordNLP never emits another attribute first.
		var index = xml.attributes.first.as(XMLStringAttr).value.to_i
		for obj in xml["tokens"].first["token"] do
			if obj isa XMLStartTag then
				tokens.add new NLPToken.from_xml(obj)
			else
				print "Warning: malformed xml, `tokens` is supposed to contain `token` tags"
			end
		end
		# Delegate to the core initializer last: `tokens` is filled above,
		# then `index` is set here.
		init(index)
	end
end
207
# Represent one word (or punctuation mark) in a `NLPSentence`.
class NLPToken

	# Index of this word in the sentence.
	var index: Int

	# Original word
	var word: String

	# `word` lemma
	var lemma: String

	# Position of the first character in the input
	var begin_offset: Int

	# Position of the last character in the input
	var end_offset: Int

	# Part Of Speech tag
	var pos: String

	# Init `self` from an XML element.
	#
	# The token index comes from the `id` attribute; every other field is
	# read from the text data of the matching child tag.
	#
	# ~~~
	# var xml = """
	# <token id="2">
	#  <word>University</word>
	#  <lemma>University</lemma>
	#  <CharacterOffsetBegin>9</CharacterOffsetBegin>
	#  <CharacterOffsetEnd>19</CharacterOffsetEnd>
	#  <POS>NNP</POS>
	# </token>""".to_xml["token"].first.as(XMLStartTag)
	#
	# var token = new NLPToken.from_xml(xml)
	# assert token.index == 2
	# assert token.word == "University"
	# assert token.lemma == "University"
	# assert token.begin_offset == 9
	# assert token.end_offset == 19
	# assert token.pos == "NNP"
	# ~~~
	init from_xml(xml: XMLStartTag) do
		# The `id` attribute of the `token` tag.
		var id = xml.attributes.first.as(XMLStringAttr).value.to_i
		# Text content of each child tag of interest.
		var word_str = xml["word"].first.as(XMLStartTag).data
		var lemma_str = xml["lemma"].first.as(XMLStartTag).data
		var begin_pos = xml["CharacterOffsetBegin"].first.as(XMLStartTag).data.to_i
		var end_pos = xml["CharacterOffsetEnd"].first.as(XMLStartTag).data.to_i
		var pos_tag = xml["POS"].first.as(XMLStartTag).data
		init(id, word_str, lemma_str, begin_pos, end_pos, pos_tag)
	end
end