# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Natural Language Processor based on the StanfordNLP core.
#
# See http://nlp.stanford.edu/software/corenlp.shtml.
module nlp

import stanford
import vsm

redef class NLPDocument

	# `NLPVector` representing `self`.
	#
	# Maps each kept lemma to its number of occurrences over all the
	# tokens of all the `sentences` of the document.
	#
	# A token contributes only if it passes both `keep_pos_token` and
	# `keep_lemma`.
	# The vector is computed lazily, on first access.
	var vector: NLPVector is lazy do
		var res = new NLPVector
		for sentence in sentences do
			for token in sentence.tokens do
				# Filter on the POS tag first, then on the lemma itself.
				if not keep_pos_token(token) then continue
				var lemma = token.lemma
				if not keep_lemma(lemma) then continue
				# First occurrence initializes the count, next ones increment it.
				if not res.has_key(lemma) then
					res[lemma] = 1
				else
					res[lemma] += 1
				end
			end
		end
		return res
	end

	# Should we keep `token` when composing the vector?
	#
	# Choice is based on the POS tag of the token: returns `true` if the tag
	# starts with at least one of the `allowed_pos_prefixes`.
	private fun keep_pos_token(token: NLPToken): Bool do
		var pos = token.pos
		for prefix in allowed_pos_prefixes do
			if pos.has_prefix(prefix) then return true
		end
		return false
	end

	# Should we keep `lemma` when composing the vector?
	#
	# Returns `false` if `lemma` is listed in `lemma_black_list`,
	# `true` otherwise.
	private fun keep_lemma(lemma: String): Bool do
		return not lemma_black_list.has(lemma)
	end

	# Allowed POS tag prefixes.
	#
	# When building a vector from `self`, only tokens tagged with one of these
	# prefixes are kept.
	# Other tokens are ignored.
	var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable

	# Ignored lemmas.
	#
	# When building a vector from `self`, tokens whose lemma is in this list
	# are ignored, whatever their POS tag.
	var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
end