1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Natural Language Processor based on the StanfordNLP core.
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
23 # A StringIndex using a NLPProcessor to parse and vectorize strings
# Processor applied by `parse_string` to every non-blank input string.
#
# Its `process` service must return a document exposing `sentences`,
# each of them exposing the `tokens` to index.
var nlp_processor: NLPProcessor
# Build the vector of lemma occurrences found in `string`.
#
# An empty vector is returned when `string` is blank once trimmed.
# Otherwise `string` is processed by `nlp_processor` and every token
# accepted by `accept_token` contributes its lemma to the vector.
redef fun parse_string(string) do
	var vector = new Vector
	# Nothing to analyze: skip the NLP processing entirely.
	if string.trim.is_empty then return vector
	var doc = nlp_processor.process(string)
	for sentence in doc.sentences do
		for token in sentence.tokens do
			if not accept_token(token) then continue
			var lemma = token.lemma
			# NOTE(review): from this branch down to the final `end`, the
			# statements were absent from the reviewed text and have been
			# reconstructed as a plain occurrence count -- confirm against
			# the upstream file.
			if not vector.has_key(lemma) then
				vector[lemma] = 1.0
			else
				vector[lemma] += 1.0
			end
		end
	end
	return vector
end
# Is `token` accepted by this index?
#
# A token is rejected when:
#
# * `whitelist_pos` is not empty and does not contain the token POS tag,
# * `blacklist_pos` contains the token POS tag,
# * `stoplist` contains the token lemma.
#
# Returns `true` in every other case.
fun accept_token(token: NLPToken): Bool do
	# NOTE(review): `pos` was used without any visible declaration; it is
	# assumed to be the POS tag carried by the token -- confirm.
	var pos = token.pos
	if whitelist_pos.not_empty and not whitelist_pos.has(pos) then return false
	if blacklist_pos.has(pos) then return false
	if stoplist.has(token.lemma) then return false
	# No filter matched: the token can be indexed.
	return true
end
# Part-Of-Speech whitelist
#
# While this list is empty, it filters nothing.
# Once it contains at least one tag, `accept_token` rejects every token
# whose POS tag is not listed here.
var whitelist_pos: Array[String] = new Array[String] is writable
# Part-Of-Speech blacklist
#
# `accept_token` rejects every token whose POS tag is listed here.
var blacklist_pos: Array[String] = new Array[String] is writable
# Lemmas that must never be indexed
#
# `accept_token` rejects every token whose lemma is listed here.
var stoplist: Array[String] = new Array[String] is writable
# A FileIndex based on an NLPProcessor