lib/nlp: combine nlp and vsm to create a search engine index

author Alexandre Terrasa <alexandre@moz-code.org>

Wed, 27 Sep 2017 03:06:20 +0000 (23:06 -0400)

committer Alexandre Terrasa <alexandre@moz-code.org>

Thu, 12 Oct 2017 00:49:00 +0000 (20:49 -0400)
author Alexandre Terrasa <alexandre@moz-code.org>
Wed, 27 Sep 2017 03:06:20 +0000 (23:06 -0400)
committer Alexandre Terrasa <alexandre@moz-code.org>
Thu, 12 Oct 2017 00:49:00 +0000 (20:49 -0400)
diff --git a/lib/nlp/examples/nlp_index.nit b/lib/nlp/examples/nlp_index.nit

new file mode 100644 (file)

index 0000000..72c0b5c
--- /dev/null
+++ b/lib/nlp/examples/nlp_index.nit
@@ -0,0 +1,81 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example showing how to use a NLPFileIndex.
+module nlp_index
+
+import nlp
+import config
+
+# Command-line options of the `nlp_index` example.
+redef class Config
+       # --whitelist-exts: file extensions accepted by the file index
+       var opt_white_exts = new OptionArray("Allowed file extensions (default is [])",
+               "-w", "--whitelist-exts")
+
+       # --blacklist-exts: file extensions rejected by the file index
+       var opt_black_exts = new OptionArray("Ignored file extensions (default is [])",
+               "-b", "--blacklist-exts")
+
+       # --server: URI of the StanfordNLP server (the example falls back to plain http)
+       var opt_server = new OptionString("StanfordNLP server URI (default is http://localhost:9000)",
+               "-s", "--server")
+
+       # --lang: language given to the NLP client (the example falls back to `en`)
+       var opt_lang = new OptionString("Language to use (default is en)", "-l", "--lang")
+
+       # Register the options above so they are parsed and listed in the usage.
+       redef init do
+               opts.add_option(opt_server, opt_lang, opt_white_exts, opt_black_exts)
+       end
+end
+
+# Parse the command line; the remaining positional arguments are the files to index.
+var config = new Config
+config.tool_description = "usage: nlp_index [options] <files>"
+config.parse_options(args)
+
+# Use the arguments left after option parsing, not the raw program `args`,
+# so that option flags and their values are never indexed as files.
+var files = config.opts.rest
+if files.is_empty then
+       config.usage
+       exit 1
+end
+
+var host = config.opt_server.value
+if host == null then host = "http://localhost:9000"
+var lang = config.opt_lang.value
+if lang == null then lang = "en"
+
+var cli = new NLPClient(host)
+cli.language = lang
+
+var index = new NLPFileIndex(cli)
+index.whitelist_exts = config.opt_white_exts.value
+index.blacklist_exts = config.opt_black_exts.value
+# NOTE(review): these are POS tags, not file extensions, so they go to `blacklist_pos`.
+index.blacklist_pos = ["CD", "SYM", "-RRB-", "-LRB-", "''", "``", ".", "#", ":", ",", "$", ""]
+
+print "Building index..."
+index.index_files(files, true)
+print "Indexed {index.documents.length} documents"
+
+# Interactive query loop: stop at end of input instead of prompting forever.
+loop
+       print "\nEnter query:"
+       printn "> "
+       if sys.stdin.eof then break
+       var input = sys.stdin.read_line
+       for match in index.match_string(input) do print match
+end
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit

index 0e4dba3..5bad0dc 100644 (file)
--- a/lib/nlp/nlp.nit
+++ b/lib/nlp/nlp.nit
@@ -20,16 +20,21 @@ module nlp
  import stanford
  import vsm
  
-redef class NLPDocument
+# A StringIndex using a NLPProcessor to parse and vectorize strings
+class NLPIndex
+       super StringIndex
  
-       # `NLPVector` representing `self`.
-       var vector: Vector is lazy do
+       # NLP Processor used to tokenize, lemmatize and POS tag documents
+       var nlp_processor: NLPProcessor
+
+       redef fun parse_string(string) do
                 var vector = new Vector
-               for sentence in sentences do
+               if string.trim.is_empty then return vector
+               var doc = nlp_processor.process(string)
+               for sentence in doc.sentences do
                         for token in sentence.tokens do
-                               if not keep_pos_token(token) then continue
+                               if not accept_token(token) then continue
                                 var lemma = token.lemma
-                               if lemma_black_list.has(lemma) then continue
                                 if not vector.has_key(lemma) then
                                         vector[lemma] = 1.0
                                 else
@@ -40,32 +45,33 @@ redef class NLPDocument
                 return vector
         end
  
-       # Should we keep `token` when composing the vector?
+       # Is `token` accepted by this index?
         #
-       # Choice is based on the POS tag of the token.
-       # See `allowed_pos_prefixes`.
-       private fun keep_pos_token(token: NLPToken): Bool do
+       # See `whitelist_pos`, `blacklist_pos` and `stoplist`.
+       fun accept_token(token: NLPToken): Bool do
                 var pos = token.pos
-               for prefix in allowed_pos_prefixes do
-                       if pos.has_prefix(prefix) then return true
-               end
-               return false
+               if whitelist_pos.not_empty and not whitelist_pos.has(pos) then return false
+               if blacklist_pos.has(pos) then return false
+               if stoplist.has(token.lemma) then return false
+               return true
         end
  
-       # Should we keep `lemma` when composing the vector?
+       # Part-Of-Speech whitelist
         #
-       # See `lemma_black_list`.
-       private fun keep_lemma(lemma: String): Bool do
-               return true
-       end
+       # If not empty, the index accepts only the POS tags contained in this list.
+       var whitelist_pos = new Array[String] is writable
  
-       # Allowed POS tag prefixes.
+       # Part-Of-Speech blacklist
         #
-       # When building a vector from `self`,  only tokens tagged with one of these
-       # prefixes are kept.
-       # Other tokens are ignored.
-       var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable
+       # Reject POS tags contained in this list.
+       var blacklist_pos = new Array[String] is writable
+
+       # List of lemmas that must not be indexed
+       var stoplist = new Array[String] is writable
+end
  
-       # Ignored lemmas.
-       var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
+# A FileIndex based on a NLPProcessor
+class NLPFileIndex
+       super NLPIndex
+       super FileIndex
  end
author	Alexandre Terrasa <alexandre@moz-code.org>
	Wed, 27 Sep 2017 03:06:20 +0000 (23:06 -0400)
committer	Alexandre Terrasa <alexandre@moz-code.org>
	Thu, 12 Oct 2017 00:49:00 +0000 (20:49 -0400)
lib/nlp/examples/nlp_index.nit	[new file with mode: 0644]	patch \| blob
lib/nlp/nlp.nit		patch \| blob \| history