From 251e50de1de7a8313b473bf5dc352cfa6fc4a92d Mon Sep 17 00:00:00 2001
From: Alexandre Terrasa
Date: Tue, 26 Sep 2017 23:06:20 -0400
Subject: [PATCH] lib/nlp: combine nlp and vsm to create a search engine index

Signed-off-by: Alexandre Terrasa
---
 lib/nlp/examples/nlp_index.nit | 81 ++++++++++++++++++++++++++++++++++++++++
 lib/nlp/nlp.nit                | 58 ++++++++++++++-------------
 2 files changed, 113 insertions(+), 26 deletions(-)
 create mode 100644 lib/nlp/examples/nlp_index.nit

diff --git a/lib/nlp/examples/nlp_index.nit b/lib/nlp/examples/nlp_index.nit
new file mode 100644
index 0000000..72c0b5c
--- /dev/null
+++ b/lib/nlp/examples/nlp_index.nit
@@ -0,0 +1,81 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example showing how to use a NLPFileIndex.
+module nlp_index
+
+import nlp
+import config
+
+redef class Config
+
+	# --whitelist-exts
+	var opt_white_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-w", "--whitelist-exts")
+
+	# --blacklist-exts
+	var opt_black_exts = new OptionArray("Allowed file extensions (default is [])",
+		"-b", "--blacklist-exts")
+
+	# --server
+	var opt_server = new OptionString("StanfordNLP server URI (default is https://localhost:9000)",
+		"-s", "--server")
+
+	# --lang
+	var opt_lang = new OptionString("Language to use (default is fr)", "-l", "--lang")
+
+	redef init do
+		opts.add_option(opt_server, opt_lang, opt_white_exts, opt_black_exts)
+	end
+end
+
+var config = new Config
+config.tool_description = "usage: example_index "
+config.parse_options(args)
+
+if args.length < 1 then
+	config.usage
+	exit 1
+end
+
+var host = config.opt_server.value
+if host == null then host = "http://localhost:9000"
+var lang = config.opt_lang.value
+if lang == null then lang = "en"
+
+var cli = new NLPClient(host)
+cli.language = lang
+
+var bl = config.opt_black_exts.value
+if bl.is_empty then bl = ["CD", "SYM", "-RRB-", "-LRB-", "''", "``", ".", "#", ":", ",", "$", ""]
+
+var index = new NLPFileIndex(cli)
+index.whitelist_exts = config.opt_white_exts.value
+index.blacklist_exts = bl
+
+print "Building index..."
+index.index_files(args, true)
+
+print "Indexed {index.documents.length} documents"
+
+loop
+	print "\nEnter query:"
+	printn "> "
+	var input = sys.stdin.read_line
+	var matches = index.match_string(input)
+
+	for match in matches do
+		print match
+	end
+end
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit
index 0e4dba3..5bad0dc 100644
--- a/lib/nlp/nlp.nit
+++ b/lib/nlp/nlp.nit
@@ -20,16 +20,21 @@ module nlp
 import stanford
 import vsm
 
-redef class NLPDocument
+# A StringIndex using a NLPProcessor to parse and vectorize strings
+class NLPIndex
+	super StringIndex
 
-	# `NLPVector` representing `self`.
-	var vector: Vector is lazy do
+	# NLP Processor used to tokenize, lemmatize and POS tag documents
+	var nlp_processor: NLPProcessor
+
+	redef fun parse_string(string) do
 		var vector = new Vector
-		for sentence in sentences do
+		if string.trim.is_empty then return vector
+		var doc = nlp_processor.process(string)
+		for sentence in doc.sentences do
 			for token in sentence.tokens do
-				if not keep_pos_token(token) then continue
+				if not accept_token(token) then continue
 				var lemma = token.lemma
-				if lemma_black_list.has(lemma) then continue
 				if not vector.has_key(lemma) then
 					vector[lemma] = 1.0
 				else
@@ -40,32 +45,33 @@ redef class NLPDocument
 		return vector
 	end
 
-	# Should we keep `token` when composing the vector?
+	# Is `token` accepted by this index?
 	#
-	# Choice is based on the POS tag of the token.
-	# See `allowed_pos_prefixes`.
-	private fun keep_pos_token(token: NLPToken): Bool do
+	# See `whitelist_pos` and `blacklist_pos`.
+	fun accept_token(token: NLPToken): Bool do
 		var pos = token.pos
-		for prefix in allowed_pos_prefixes do
-			if pos.has_prefix(prefix) then return true
-		end
-		return false
+		if whitelist_pos.not_empty and not whitelist_pos.has(pos) then return false
+		if blacklist_pos.has(pos) then return false
+		if stoplist.has(token.lemma) then return false
+		return true
 	end
 
-	# Should we keep `lemma` when composing the vector?
+	# Part-Of-Speech whitelist
 	#
-	# See `lemma_black_list`.
-	private fun keep_lemma(lemma: String): Bool do
-		return true
-	end
+	# If not empty, the index accept only the POS tags contained in this list.
+	var whitelist_pos = new Array[String] is writable
 
-	# Allowed POS tag prefixes.
+	# Part-Of-Speech blacklist
 	#
-	# When building a vector from `self`, only tokens tagged with one of these
-	# prefixes are kept.
-	# Other tokens are ignored.
-	var allowed_pos_prefixes: Array[String] = ["NN", "VB", "RB"] is writable
+	# Reject POS tags contained in this list.
+	var blacklist_pos = new Array[String] is writable
+
+	# List of lemmas that must not be indexed
+	var stoplist = new Array[String] is writable
+end
 
-	# Ignored lemmas.
-	var lemma_black_list: Array[String] = ["module", "class", "method"] is writable
+# A FileIndex based using a NLPProcessor
+class NLPFileIndex
+	super NLPIndex
+	super FileIndex
 end
-- 
1.7.9.5