lib/nlp: introduce NLPDocument to NLPVector translation
[nit.git] / lib / nlp / nlp.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module nlp
19
20 import stanford
21 import vsm
22
redef class NLPDocument

	# `NLPVector` representing `self`.
	#
	# Each key is the lemma of a kept token, each value is the number of kept
	# tokens found in `sentences` that share this lemma.
	#
	# A token is kept only if both `keep_pos_token` and `keep_lemma` accept it.
	#
	# The attribute is lazy: it is computed on first read, so
	# `allowed_pos_prefixes` and `lemma_black_list` should be configured before
	# the vector is accessed for the first time.
	var vector: NLPVector is lazy do
		var res = new NLPVector
		for sentence in sentences do
			for token in sentence.tokens do
				# Filter on the POS tag first, then on the lemma itself.
				if not keep_pos_token(token) then continue
				var lemma = token.lemma
				if not keep_lemma(lemma) then continue
				# Count one more occurrence of this lemma.
				if res.has_key(lemma) then
					res[lemma] += 1
				else
					res[lemma] = 1
				end
			end
		end
		return res
	end

	# Should we keep `token` when composing the vector?
	#
	# Choice is based on the POS tag of the token: `token` is kept if its tag
	# starts with at least one of the `allowed_pos_prefixes`.
	private fun keep_pos_token(token: NLPToken): Bool do
		var pos = token.pos
		for prefix in allowed_pos_prefixes do
			if pos.has_prefix(prefix) then return true
		end
		return false
	end

	# Should we keep `lemma` when composing the vector?
	#
	# Returns `false` if `lemma` is listed in `lemma_black_list`,
	# `true` otherwise.
	private fun keep_lemma(lemma: String): Bool do
		return not lemma_black_list.has(lemma)
	end

	# Allowed POS tag prefixes.
	#
	# When building a vector from `self`, only tokens tagged with one of these
	# prefixes are kept.
	# Other tokens are ignored.
	var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable

	# Ignored lemmas.
	#
	# When building a vector from `self`, tokens whose lemma is in this list
	# are ignored, whatever their POS tag.
	var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
end