nlp :: NLPIndex :: accept_token
Is `token` accepted by this index?
nlp :: NLPIndex :: blacklist_pos=
Part-Of-Speech blacklist
nlp :: NLPIndex :: defaultinit
nlp :: NLPIndex :: nlp_processor
NLP Processor used to tokenize, lemmatize and POS tag documents
nlp :: NLPIndex :: nlp_processor=
NLP Processor used to tokenize, lemmatize and POS tag documents
nlp :: NLPIndex :: whitelist_pos=
Part-Of-Speech whitelist
nlp :: NLPIndex :: accept_token
Is `token` accepted by this index?
nlp :: NLPIndex :: blacklist_pos=
Part-Of-Speech blacklist
core :: Object :: class_factory
Implementation used by `get_class` to create the specific class.
nlp :: NLPIndex :: defaultinit
core :: Object :: defaultinit
vsm :: VSMIndex :: defaultinit
vsm :: StringIndex :: defaultinit
vsm :: VSMIndex :: index_document
Index a document
vsm :: StringIndex :: index_string
Index a new Document from `title`, `uri` and string `string`.
vsm :: VSMIndex :: inverse_doc_frequency
Inverse document frequency
vsm :: VSMIndex :: inverse_doc_frequency=
Inverse document frequency
vsm :: VSMIndex :: inversed_index
Inversed index
vsm :: VSMIndex :: inversed_index=
Inversed index
core :: Object :: is_same_instance
Return true if `self` and `other` are the same instance (i.e. same identity).
core :: Object :: is_same_serialized
Is `self` the same as `other` in a serialization context?
core :: Object :: is_same_type
Return true if `self` and `other` have the same dynamic type.
vsm :: StringIndex :: match_string
Match the `query` string against all indexed documents
vsm :: VSMIndex :: match_vector
Match `query` vector to all indexed document vectors
nlp :: NLPIndex :: nlp_processor
NLP Processor used to tokenize, lemmatize and POS tag documents
nlp :: NLPIndex :: nlp_processor=
NLP Processor used to tokenize, lemmatize and POS tag documents
core :: Object :: output_class_name
Display class name on stdout (debug only).
vsm :: StringIndex :: parse_string
Parse the `string` as a Vector
vsm :: VSMIndex :: terms_doc_count
Count for all terms in all indexed documents
vsm :: VSMIndex :: terms_doc_count=
Count for all terms in all indexed documents
nlp :: NLPIndex :: whitelist_pos=
Part-Of-Speech whitelist
# A `StringIndex` that parses and vectorizes strings through a `NLPProcessor`
class NLPIndex
	super StringIndex

	# NLP processor in charge of tokenizing, lemmatizing and POS tagging documents
	var nlp_processor: NLPProcessor

	# Build the term vector of `string`.
	#
	# A `string` that is empty once trimmed yields an empty vector.
	# Otherwise `string` is run through `nlp_processor` and, for every token
	# of every sentence that passes `accept_token`, the entry keyed by the
	# token lemma is increased by 1.0 (starting at 1.0 on first occurrence).
	redef fun parse_string(string) do
		var counts = new Vector
		if string.trim.is_empty then return counts

		var document = nlp_processor.process(string)
		for sentence in document.sentences do
			for token in sentence.tokens do
				if accept_token(token) then
					var term = token.lemma
					if counts.has_key(term) then
						# Lemma already seen: bump its count
						counts[term] += 1.0
					else
						# First occurrence of this lemma
						counts[term] = 1.0
					end
				end
			end
		end
		return counts
	end

	# Should `token` be indexed?
	#
	# A token is rejected when one of the following holds:
	#
	# * `whitelist_pos` is not empty and does not contain the token POS tag
	# * `blacklist_pos` contains the token POS tag
	# * `stoplist` contains the token lemma
	#
	# Any other token is accepted.
	fun accept_token(token: NLPToken): Bool do
		var tag = token.pos
		if whitelist_pos.not_empty then
			if not whitelist_pos.has(tag) then return false
		end
		if blacklist_pos.has(tag) then return false
		return not stoplist.has(token.lemma)
	end

	# POS tags allowed in the index
	#
	# When this list is not empty, only tokens whose POS tag appears in it
	# are accepted. An empty list disables the whitelist filtering.
	var whitelist_pos = new Array[String] is writable

	# POS tags banned from the index
	#
	# Tokens whose POS tag appears in this list are rejected.
	var blacklist_pos = new Array[String] is writable

	# Lemmas banned from the index
	#
	# Tokens whose lemma appears in this list are rejected.
	var stoplist = new Array[String] is writable
end
lib/nlp/nlp.nit:23,1--71,3