29058b3b1937ec744c2a904af5f390c529087b60
1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Natural Language Processor based on the StanfordNLP core.
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
# Natural Language Processor
#
# NLPProcessor provides natural language processing for input text and files.
# Analyzed documents can be manipulated through the resulting NLPDocument.
interface NLPProcessor

	# Creates a new NLPDocument from a string
	fun process(string: String): NLPDocument is abstract

	# Creates a new NLPDocument from a file content
	fun process_file(path: String): NLPDocument do
		# Slurp the whole file then delegate to `process`.
		var text = path.to_path.read_all
		return process(text)
	end

	# Creates a new NLPDocument from a list of files (batch mode)
	#
	# Returns a map of file path associated with their NLPDocument.
	fun process_files(paths: Array[String]): Map[String, NLPDocument] do
		var docs = new HashMap[String, NLPDocument]
		for file in paths do
			docs[file] = process_file(file)
		end
		return docs
	end
end
# Wrapper around StanfordNLP jar.
#
# FIXME this should use the Java FFI.
class NLPJavaProcessor
	super NLPProcessor

	# Classpath to give to Java when loading the StanfordNLP jars.
	#
	# NOTE(review): default value reconstructed from context — confirm against upstream.
	var java_cp = "*"

	# Temp dir used to store batch results
	#
	# NOTE(review): default value reconstructed from context — confirm against upstream.
	var tmp_dir = ".nlp"

	# Process a string and return a new NLPDocument from this.
	redef fun process(string) do
		# Dump the string to a temporary file so the external jar can read it.
		var tmp_file = ".nlp.in"
		var file = new FileWriter.open(tmp_file)
		file.write string
		file.close
		var doc = process_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Process the `input` file and return a new NLPDocument from this.
	redef fun process_file(input) do
		# NOTE(review): `input` and `java_cp` are interpolated into a shell
		# command; paths containing shell metacharacters are unsafe — sanitize upstream.
		var tmp_file = "{input.basename}.xml"
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
		var doc = new NLPDocument.from_xml_file(tmp_file)
		tmp_file.file_delete
		return doc
	end

	# Batch process `inputs` files.
	#
	# Returns a map of file path associated with their NLPDocument.
	redef fun process_files(inputs) do
		# Prepare the input file list
		var input_file = "inputs.list"
		var fw = new FileWriter.open(input_file)
		for input in inputs do fw.write "{input}\n"
		fw.close

		# Run Stanford NLP jar once on the whole list
		sys.system "java -cp \"{java_cp}\" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {tmp_dir}"

		# Collect one document per input from the output directory
		var map = new HashMap[String, NLPDocument]
		for input in inputs do
			var out_file = tmp_dir / "{input.basename}.xml"
			map[input] = new NLPDocument.from_xml_file(out_file)
		end
		input_file.file_delete
		return map
	end
end
107 # A `Document` represent a text input given to the NLP processor.
109 # Once processed, it contains a list of sentences that contain tokens.
	# NLPSentences contained in `self`, in input order (filled by `from_xml`)
	var sentences = new Array[NLPSentence]
115 # Init `self` from an xml element.
125 # <word>Stanford</word>
126 # <lemma>Stanford</lemma>
127 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
128 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
132 # <word>University</word>
133 # <lemma>University</lemma>
134 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
135 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
144 # <lemma>UQAM</lemma>
145 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
146 # <CharacterOffsetEnd>4</CharacterOffsetEnd>
150 # <word>University</word>
151 # <lemma>University</lemma>
152 # <CharacterOffsetBegin>5</CharacterOffsetBegin>
153 # <CharacterOffsetEnd>15</CharacterOffsetEnd>
160 # </root>""".to_xml.as(XMLDocument)
162 # var document = new NLPDocument.from_xml(xml)
163 # assert document.sentences.length == 2
164 # assert document.sentences.first.tokens.first.word == "Stanford"
165 # assert document.sentences.last.tokens.first.word == "UQAM"
167 init from_xml
(xml
: XMLDocument) do
168 for obj
in xml
["root"].first
["document"].first
["sentences"].first
["sentence"] do
169 if obj
isa XMLStartTag then
170 sentences
.add
new NLPSentence.from_xml
(obj
)
172 print
"Warning: malformed xml, `sentences` is supposed to contain `sencence` tags"
177 # Init `self` from a XML file.
178 init from_xml_file
(path
: String) do
179 var file
= new FileReader.open
(path
)
180 var xml
= file
.read_lines
182 xml
.shift
# remove xml doctype
183 xml
.shift
# remove xslt link
184 from_xml
(xml
.join
("\n").to_xml
.as(XMLDocument))
188 # Represent one sentence in a `Document`.
191 # Index of this sentence in the input text.
	# NLPTokens contained in `self`, in sentence order (filled by `from_xml`)
	var tokens = new Array[NLPToken]
197 # Init `self` from an XML element.
204 # <word>Stanford</word>
205 # <lemma>Stanford</lemma>
206 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
207 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
211 # <word>University</word>
212 # <lemma>University</lemma>
213 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
214 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
218 # </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
220 # var sentence = new NLPSentence.from_xml(xml)
221 # assert sentence.index == 1
222 # assert sentence.tokens.length == 2
224 init from_xml
(xml
: XMLStartTag) do
225 var index
= xml
.attributes
.first
.as(XMLStringAttr).value
.to_i
226 for obj
in xml
["tokens"].first
["token"] do
227 if obj
isa XMLStartTag then
228 tokens
.add
new NLPToken.from_xml
(obj
)
230 print
"Warning: malformed xml, `tokens` is supposed to contain `token` tags"
# Represent one word (or punctuation mark) in a `NLPSentence`.
240 # Index of this word in the sentence.
	# Position of the first character of this token in the input text (0-based)
	var begin_offset: Int
# Position one past the last character in the input (exclusive end offset;
# e.g. "Stanford" at offset 0 has end offset 8)
258 # Init `self` from an XML element.
263 # <word>University</word>
264 # <lemma>University</lemma>
265 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
266 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
268 # </token>""".to_xml["token"].first.as(XMLStartTag)
270 # var token = new NLPToken.from_xml(xml)
271 # assert token.index == 2
272 # assert token.word == "University"
273 # assert token.lemma == "University"
274 # assert token.begin_offset == 9
275 # assert token.end_offset == 19
276 # assert token.pos == "NNP"
278 init from_xml
(xml
: XMLStartTag) do
279 var index
= xml
.attributes
.first
.as(XMLStringAttr).value
.to_i
280 var word
= xml
["word"].first
.as(XMLStartTag).data
281 var lemma
= xml
["lemma"].first
.as(XMLStartTag).data
282 var begin_offset
= xml
["CharacterOffsetBegin"].first
.as(XMLStartTag).data
.to_i
283 var end_offset
= xml
["CharacterOffsetEnd"].first
.as(XMLStartTag).data
.to_i
284 var pos
= xml
["POS"].first
.as(XMLStartTag).data
285 init(index
, word
, lemma
, begin_offset
, end_offset
, pos
)