Merge: gamnit: new services and a lot of bug fixes and performance improvements
[nit.git] / lib / nlp / nlp.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Natural Language Processor based on the StanfordNLP core.
16 #
17 # See http://nlp.stanford.edu/software/corenlp.shtml.
18 module nlp
19
20 import stanford
21 import vsm
22
# Index of strings that are parsed and vectorized through a `NLPProcessor`
#
# Each parsed string becomes a `Vector` that counts, per lemma, the tokens
# accepted by `accept_token`.
class NLPIndex
	super StringIndex

	# Processor in charge of tokenizing, lemmatizing and POS tagging documents
	var nlp_processor: NLPProcessor

	# Build the lemma-count vector of `string`
	#
	# A `string` that is empty once trimmed gives an empty vector and the
	# `nlp_processor` is not called.
	# Tokens rejected by `accept_token` are not counted.
	redef fun parse_string(string) do
		var counts = new Vector
		if string.trim.is_empty then return counts
		var document = nlp_processor.process(string)
		for sentence in document.sentences do
			for token in sentence.tokens do
				if accept_token(token) then
					var key = token.lemma
					if counts.has_key(key) then
						counts[key] += 1.0
					else
						# First occurrence of this lemma
						counts[key] = 1.0
					end
				end
			end
		end
		return counts
	end

	# Should `token` be counted by this index?
	#
	# A token is rejected when one of the following holds:
	# * `whitelist_pos` is not empty and does not contain the token POS tag;
	# * `blacklist_pos` contains the token POS tag;
	# * `stoplist` contains the token lemma.
	fun accept_token(token: NLPToken): Bool do
		var tag = token.pos
		var whitelisted = whitelist_pos.is_empty or whitelist_pos.has(tag)
		if not whitelisted then return false
		if blacklist_pos.has(tag) then return false
		return not stoplist.has(token.lemma)
	end

	# POS tags accepted by the index
	#
	# An empty list disables the filter: no tag is rejected by the whitelist.
	var whitelist_pos = new Array[String] is writable

	# POS tags rejected by the index
	var blacklist_pos = new Array[String] is writable

	# Lemmas that are never indexed
	var stoplist = new Array[String] is writable
end
72
# A `FileIndex` that is also a `NLPIndex`
#
# Combines both super classes without adding any member of its own.
class NLPFileIndex
	super NLPIndex
	super FileIndex
end