1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Natural Language Processor based on the StanfordNLP core.
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
23 redef class NLPDocument
25 # `NLPVector` representing `self`.
#
# Declared `is lazy`: the block below computes the value.
# It walks every token of every sentence in `sentences` and keeps a token
# only if `keep_pos_token` accepts it and its lemma is not listed in
# `lemma_black_list`.
#
# NOTE(review): the blacklist test is written inline here instead of going
# through `keep_lemma`; presumably both are meant to agree -- confirm.
26 var vector
: NLPVector is lazy
do
# Local accumulator; it has the same name as the attribute being defined.
27 var vector
= new NLPVector
28 for sentence
in sentences
do
29 for token
in sentence
.tokens
do
# Skip tokens rejected by `keep_pos_token`.
30 if not keep_pos_token
(token
) then continue
31 var lemma
= token
.lemma
# Skip lemmas present in `lemma_black_list`.
32 if lemma_black_list
.has
(lemma
) then continue
# Is `lemma` not yet a key of the accumulator?
33 if not vector
.has_key
(lemma
) then
43 # Should we keep `token` when composing the vector?
#
45 # Choice is based on the POS tag of the token.
# Answers `true` as soon as one entry of `allowed_pos_prefixes` is a prefix
# of `pos`.
46 # See `allowed_pos_prefixes`.
#
# NOTE(review): `pos` is presumably the POS tag read from `token` -- confirm
# against its declaration.
47 private fun keep_pos_token
(token
: NLPToken): Bool do
# Try each allowed prefix in turn; the first match is enough.
49 for prefix
in allowed_pos_prefixes
do
50 if pos
.has_prefix
(prefix
) then return true
55 # Should we keep `lemma` when composing the vector?
#
# NOTE(review): presumably answers `true` when `lemma` is absent from
# `lemma_black_list` -- confirm against the method body.
57 # See `lemma_black_list`.
58 private fun keep_lemma
(lemma
: String): Bool do
62 # Allowed POS tag prefixes.
#
64 # When building a vector from `self`, only tokens tagged with one of these
# prefixes are kept.
66 # Other tokens are ignored.
#
# Defaults to `"NN"`, `"VB"` and `"RB"`; declared `is writable`.
# NOTE(review): these look like the Penn Treebank noun, verb and adverb tag
# families -- confirm against the tagger's tag set.
67 var allowed_pos_prefixes
: Array[String] = ["NN", "VB", "RB"] is writable
# Lemmas ignored when building the vector.
#
# A token whose lemma appears in this list is skipped by `vector`.
# Defaults to `"module"`, `"class"` and `"method"`; declared `is writable`.
70 var lemma_black_list
: Array[String] = ["module", "class", "method"] is writable