1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Natural Language Processor based on the StanfordNLP core.
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
23 # Wrapper around StanfordNLP jar.
25 # NLPProcessor provides natural language processing of input text files and
26 # an API to handle analysis results.
28 # FIXME this should use the Java FFI.
31 # Classpath to give to Java when loading the StanfordNLP jars.
34 # Process a string and return a new NLPDocument from this.
#
# Writes the input text to a temporary file so the external StanfordNLP
# jar (invoked by `process_file`) can read it.
#
# NOTE(review): extraction gaps in this chunk — the lines that write
# `string` into the file, close the writer, delete the temp file and
# `return doc` are not visible here; confirm against the full source.
35 fun process
(string
: String): NLPDocument do
# Fixed temp-file name in the current working directory.
36 var tmp_file
= ".nlp.in"
37 var file
= new FileWriter.open
(tmp_file
)
# Delegate the actual NLP pipeline run to `process_file`.
40 var doc
= process_file
(tmp_file
)
45 # Process the `input` file and return a new NLPDocument from this.
#
# Shells out to the StanfordCoreNLP jar (see the FIXME above: this
# should eventually use the Java FFI instead of `sys.system`).
#
# NOTE(review): extraction gaps — temp-file cleanup and the `return doc`
# are not visible in this chunk.
46 fun process_file
(input
: String): NLPDocument do
# StanfordCoreNLP names its XML output "<input basename>.xml" in the CWD.
48 var tmp_file
= "{input.basename}.xml"
# NOTE(review): `input` is interpolated unquoted into a shell command —
# paths containing spaces or shell metacharacters will break (or be unsafe).
49 sys
.system
"java -cp \"{java_cp}\
" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -file {input}"
# Parse the XML produced by the jar into an NLPDocument.
50 var doc
= new NLPDocument.from_xml_file
(tmp_file
)
57 # Returns a map associating each input file path with its NLPDocument.
#
# Batch variant of `process_file`: writes all input paths to a list file,
# runs the StanfordNLP jar once with `-filelist`, then parses each XML
# result from `output_dir`.
#
# NOTE(review): extraction gaps — `fw.close`, the loop `end`s and the
# `return map` are not visible in this chunk; confirm the writer is
# closed before the jar reads `input_file`.
58 fun process_files
(inputs
: Collection[String], output_dir
: String): Map[String, NLPDocument] do
59 # Prepare the input file list
60 var input_file
= "inputs.list"
61 var fw
= new FileWriter.open
(input_file
)
# One input path per line, as expected by StanfordCoreNLP's -filelist option.
62 for input
in inputs
do fw
.write
"{input}\n"
65 # Run Stanford NLP jar
# NOTE(review): paths are interpolated unquoted into the shell command —
# spaces or metacharacters in `input_file`/`output_dir` will break this.
66 sys
.system
"java -cp \"{java_cp}\
" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat xml -filelist {input_file} -outputDirectory {output_dir}"
# Collect one NLPDocument per input from the XML files written to `output_dir`.
68 var map
= new HashMap[String, NLPDocument]
69 for input
in inputs
do
# The jar names each result "<input basename>.xml" inside `output_dir`.
70 var out_file
= output_dir
/ "{input.basename}.xml"
71 map
[input
] = new NLPDocument.from_xml_file
(out_file
)
# Remove the temporary list file once all documents are loaded.
73 input_file
.file_delete
78 # A `Document` represents a text input given to the NLP processor.
80 # Once processed, it contains a list of sentences that contain tokens.
83 # NLPSentences contained in `self`
# Populated by `from_xml` / `from_xml_file`; empty until then.
84 var sentences
= new Array[NLPSentence]
86 # Init `self` from an xml element.
96 # <word>Stanford</word>
97 # <lemma>Stanford</lemma>
98 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
99 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
103 # <word>University</word>
104 # <lemma>University</lemma>
105 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
106 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
115 # <lemma>UQAM</lemma>
116 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
117 # <CharacterOffsetEnd>4</CharacterOffsetEnd>
121 # <word>University</word>
122 # <lemma>University</lemma>
123 # <CharacterOffsetBegin>5</CharacterOffsetBegin>
124 # <CharacterOffsetEnd>15</CharacterOffsetEnd>
131 # </root>""".to_xml.as(XMLDocument)
133 # var document = new NLPDocument.from_xml(xml)
134 # assert document.sentences.length == 2
135 # assert document.sentences.first.tokens.first.word == "Stanford"
136 # assert document.sentences.last.tokens.first.word == "UQAM"
# Init `self` from an xml element: builds one NLPSentence per <sentence>
# child found under root/document/sentences in the CoreNLP XML output.
#
# NOTE(review): extraction gaps — the `else` keyword and closing `end`s
# are not visible in this chunk.
138 init from_xml
(xml
: XMLDocument) do
# Navigate the fixed CoreNLP output structure to the sentence list.
139 for obj
in xml
["root"].first
["document"].first
["sentences"].first
["sentence"] do
140 if obj
isa XMLStartTag then
# Each well-formed <sentence> start tag becomes an NLPSentence.
141 sentences
.add
new NLPSentence.from_xml
(obj
)
# Fixed typo in the warning message: "sencence" -> "sentence".
143 print
"Warning: malformed xml, `sentences` is supposed to contain `sentence` tags"
148 # Init `self` from an XML file.
#
# Reads the CoreNLP output file, strips its two header lines, then
# delegates parsing to `from_xml`.
#
# NOTE(review): `file.close` (original line 152) falls in an extraction
# gap and is not visible here — confirm the reader is closed.
149 init from_xml_file
(path
: String) do
150 var file
= new FileReader.open
(path
)
151 var xml
= file
.read_lines
153 xml
.shift
# remove xml doctype
154 xml
.shift
# remove xslt link
# Re-join the remaining lines and parse them as an XMLDocument.
155 from_xml
(xml
.join
("\n").to_xml
.as(XMLDocument))
159 # Represents one sentence in a `Document`.
162 # Index of this sentence in the input text.
165 # NLPTokens contained in `self`.
# Populated by `from_xml`; empty until then.
166 var tokens
= new Array[NLPToken]
168 # Init `self` from an XML element.
175 # <word>Stanford</word>
176 # <lemma>Stanford</lemma>
177 # <CharacterOffsetBegin>0</CharacterOffsetBegin>
178 # <CharacterOffsetEnd>8</CharacterOffsetEnd>
182 # <word>University</word>
183 # <lemma>University</lemma>
184 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
185 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
189 # </sentence>""".to_xml["sentence"].first.as(XMLStartTag)
191 # var sentence = new NLPSentence.from_xml(xml)
192 # assert sentence.index == 1
193 # assert sentence.tokens.length == 2
# Init `self` from a <sentence> XML element of the CoreNLP output.
#
# NOTE(review): extraction gaps — the `else` keyword, closing `end`s and
# the delegating `init(index)` call are not visible in this chunk.
195 init from_xml
(xml
: XMLStartTag) do
# The sentence id is taken from the first attribute of the <sentence> tag.
196 var index
= xml
.attributes
.first
.as(XMLStringAttr).value
.to_i
# One NLPToken per <token> child of the <tokens> element.
197 for obj
in xml
["tokens"].first
["token"] do
198 if obj
isa XMLStartTag then
199 tokens
.add
new NLPToken.from_xml
(obj
)
201 print
"Warning: malformed xml, `tokens` is supposed to contain `token` tags"
208 # Represents one word (or punctuation mark) in a `NLPSentence`.
211 # Index of this word in the sentence.
220 # Position of the first character in the input
221 var begin_offset
: Int
223 # Position of the last character in the input
229 # Init `self` from an XML element.
234 # <word>University</word>
235 # <lemma>University</lemma>
236 # <CharacterOffsetBegin>9</CharacterOffsetBegin>
237 # <CharacterOffsetEnd>19</CharacterOffsetEnd>
239 # </token>""".to_xml["token"].first.as(XMLStartTag)
241 # var token = new NLPToken.from_xml(xml)
242 # assert token.index == 2
243 # assert token.word == "University"
244 # assert token.lemma == "University"
245 # assert token.begin_offset == 9
246 # assert token.end_offset == 19
247 # assert token.pos == "NNP"
# Init `self` from a <token> XML element of the CoreNLP output.
#
# Extracts id, word, lemma, character offsets and POS tag from the
# token's children, then forwards them to the default initializer.
#
# NOTE(review): the closing `end` falls in an extraction gap.
249 init from_xml
(xml
: XMLStartTag) do
# The token id is the first attribute of the <token> tag.
250 var index
= xml
.attributes
.first
.as(XMLStringAttr).value
.to_i
# Surface form of the token.
251 var word
= xml
["word"].first
.as(XMLStartTag).data
# Lemmatized (base) form of the token.
252 var lemma
= xml
["lemma"].first
.as(XMLStartTag).data
# Character offsets into the original input text.
253 var begin_offset
= xml
["CharacterOffsetBegin"].first
.as(XMLStartTag).data
.to_i
254 var end_offset
= xml
["CharacterOffsetEnd"].first
.as(XMLStartTag).data
.to_i
# Part-of-speech tag assigned by the CoreNLP pipeline.
255 var pos
= xml
["POS"].first
.as(XMLStartTag).data
# Forward all extracted fields to the default initializer.
256 init(index
, word
, lemma
, begin_offset
, end_offset
, pos
)