From 251e50de1de7a8313b473bf5dc352cfa6fc4a92d Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa
Date: Tue, 26 Sep 2017 23:06:20 -0400
Subject: [PATCH] lib/nlp: combine nlp and vsm to create a search engine index

Signed-off-by: Alexandre Terrasa
---
 lib/nlp/examples/nlp_index.nit | 81 ++++++++++++++++++++++++++++++++++++++++
 lib/nlp/nlp.nit                | 58 ++++++++++++++-------------
 2 files changed, 113 insertions(+), 26 deletions(-)
 create mode 100644 lib/nlp/examples/nlp_index.nit

diff --git a/lib/nlp/examples/nlp_index.nit b/lib/nlp/examples/nlp_index.nit
new file mode 100644
index 0000000..72c0b5c
--- /dev/null
+++ b/lib/nlp/examples/nlp_index.nit
@@ -0,0 +1,81 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example showing how to use a NLPFileIndex.
+module nlp_index
+
+import nlp
+import config
+
+redef class Config
+
+	# --whitelist-exts
+	var opt_white_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-w", "--whitelist-exts")
+
+	# --blacklist-exts
+	var opt_black_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-b", "--blacklist-exts")
+
+	# --server
+	var opt_server = new OptionString("StanfordNLP server URI (default is https://localhost:9000)",
+		"-s", "--server")
+
+	# --lang
+	var opt_lang = new OptionString("Language to use (default is fr)", "-l", "--lang")
+
+	redef init do
+		opts.add_option(opt_server, opt_lang, opt_white_exts, opt_black_exts)
+	end
+end
+
+var config = new Config
+config.tool_description = "usage: example_index "
+config.parse_options(args)
+
+if args.length < 1 then
+	config.usage
+	exit 1
+end
+
+var host = config.opt_server.value
+if host == null then host = "http://localhost:9000"
+var lang = config.opt_lang.value
+if lang == null then lang = "en"
+
+var cli = new NLPClient(host)
+cli.language = lang
+
+var bl = config.opt_black_exts.value
+if bl.is_empty then bl = ["CD", "SYM", "-RRB-", "-LRB-", "''", "``", ".", "#", ":", ",", "$", ""]
+
+var index = new NLPFileIndex(cli)
+index.whitelist_exts = config.opt_white_exts.value
+index.blacklist_exts = bl
+
+print "Building index..."
+index.index_files(args, true)
+
+print "Indexed {index.documents.length} documents"
+
+loop
+	print "\nEnter query:"
+	printn "> "
+	var input = sys.stdin.read_line
+	var matches = index.match_string(input)
+
+	for match in matches do
+		print match
+	end
+end
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit
index 0e4dba3..5bad0dc 100644
--- a/lib/nlp/nlp.nit
+++ b/lib/nlp/nlp.nit
@@ -20,16 +20,21 @@ module nlp
 import stanford
 import vsm
 
-redef class NLPDocument
+# A StringIndex using a NLPProcessor to parse and vectorize strings
+class NLPIndex
+	super StringIndex
 
-	# `NLPVector` representing `self`.
-	var vector: Vector is lazy do
+	# NLP Processor used to tokenize, lemmatize and POS tag documents
+	var nlp_processor: NLPProcessor
+
+	redef fun parse_string(string) do
 		var vector = new Vector
-		for sentence in sentences do
+		if string.trim.is_empty then return vector
+		var doc = nlp_processor.process(string)
+		for sentence in doc.sentences do
 			for token in sentence.tokens do
-				if not keep_pos_token(token) then continue
+				if not accept_token(token) then continue
 				var lemma = token.lemma
-				if lemma_black_list.has(lemma) then continue
 				if not vector.has_key(lemma) then
 					vector[lemma] = 1.0
 				else
@@ -40,32 +45,33 @@ redef class NLPDocument
 		return vector
 	end
 
-	# Should we keep `token` when composing the vector?
+	# Is `token` accepted by this index?
 	#
-	# Choice is based on the POS tag of the token.
-	# See `allowed_pos_prefixes`.
-	private fun keep_pos_token(token: NLPToken): Bool do
+	# See `whitelist_pos` and `blacklist_pos`.
+	fun accept_token(token: NLPToken): Bool do
 		var pos = token.pos
-		for prefix in allowed_pos_prefixes do
-			if pos.has_prefix(prefix) then return true
-		end
-		return false
+		if whitelist_pos.not_empty and not whitelist_pos.has(pos) then return false
+		if blacklist_pos.has(pos) then return false
+		if stoplist.has(token.lemma) then return false
+		return true
 	end
 
-	# Should we keep `lemma` when composing the vector?
+	# Part-Of-Speech whitelist
 	#
-	# See `lemma_black_list`.
-	private fun keep_lemma(lemma: String): Bool do
-		return true
-	end
+	# If not empty, the index accept only the POS tags contained in this list.
+	var whitelist_pos = new Array[String] is writable
 
-	# Allowed POS tag prefixes.
+	# Part-Of-Speech blacklist
 	#
-	# When building a vector from `self`, only tokens tagged with one of these
-	# prefixes are kept.
-	# Other tokens are ignored.
-	var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable
+	# Reject POS tags contained in this list.
+	var blacklist_pos = new Array[String] is writable
+
+	# List of lemmas that must not be indexed
+	var stoplist = new Array[String] is writable
+end
 
-	# Ignored lemmas.
-	var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
+# A FileIndex based using a NLPProcessor
+class NLPFileIndex
+	super NLPIndex
+	super FileIndex
 end
-- 
1.7.9.5