Merge: Vector Space Model

author Jean Privat <jean@pryen.org>

Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)

committer Jean Privat <jean@pryen.org>

Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)
author Jean Privat <jean@pryen.org>
Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)
committer Jean Privat <jean@pryen.org>
Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)
diff --git a/lib/nlp/README.md b/lib/nlp/README.md

index c7196c4..9c718a9 100644 (file)
--- a/lib/nlp/README.md
+++ b/lib/nlp/README.md
@@ -61,14 +61,6 @@ For ease of use, this wrapper introduce a Nit model to handle CoreNLP XML result
  [[doc: nlp::NLPProcessor::process_file]]
  [[doc: nlp::NLPProcessor::process_files]]
  
-## Vector Space Model
-
-[[doc: NLPVector]]
-
-[[doc: vector]]
-
-[[doc: nlp::NLPVector::cosine_similarity]]
-
  ## NitNLP binary
  
  The `nitnlp` binary is given as an example of NitNLP client.
diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit

index 4dd7cc9..0e4dba3 100644 (file)
--- a/lib/nlp/nlp.nit
+++ b/lib/nlp/nlp.nit
@@ -23,17 +23,17 @@ import vsm
  redef class NLPDocument
  
         # `NLPVector` representing `self`.
-       var vector: NLPVector is lazy do
-               var vector = new NLPVector
+       var vector: Vector is lazy do
+               var vector = new Vector
                 for sentence in sentences do
                         for token in sentence.tokens do
                                 if not keep_pos_token(token) then continue
                                 var lemma = token.lemma
                                 if lemma_black_list.has(lemma) then continue
                                 if not vector.has_key(lemma) then
-                                       vector[lemma] = 1
+                                       vector[lemma] = 1.0
                                 else
-                                       vector[lemma] += 1
+                                       vector[lemma] += 1.0
                                 end
                         end
                 end
diff --git a/lib/nlp/vsm.nit b/lib/nlp/vsm.nit

deleted file mode 100644 (file)

index 7fe0a84..0000000
--- a/lib/nlp/vsm.nit
+++ /dev/null
@@ -1,92 +0,0 @@
-# This file is part of NIT ( http://www.nitlanguage.org ).
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NLPVector Space Model.
-#
-# The Vector Space Model (VSM) is used to compare natural language texts.
-# Texts are translated to multidimensionnal vectors then compared by cosine
-# similarity.
-module vsm
-
-import counter
-
-# A multi-dimensional vector.
-class NLPVector
-       super Counter[String]
-
-       # Cosine similarity of `self` and `other`.
-       #
-       # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
-       # two vectors are orthogonal and 1.0 means that they are identical.
-       #
-       # ~~~
-       # var v1 = new NLPVector
-       # v1["x"] = 1
-       # v1["y"] = 2
-       # v1["z"] = 3
-       #
-       # var v2 = new NLPVector
-       # v2["x"] = 1
-       # v2["y"] = 2
-       # v2["z"] = 3
-       #
-       # var v3 = new NLPVector
-       # v3["a"] = 1
-       # v3["b"] = 2
-       # v3["c"] = 3
-       #
-       # print v1.cosine_similarity(v2)
-       # #assert v1.cosine_similarity(v2) == 1.0
-       # print v1.cosine_similarity(v3)
-       # assert v1.cosine_similarity(v3) == 0.0
-       # ~~~
-       fun cosine_similarity(other: SELF): Float do
-               # Collect terms
-               var terms = new HashSet[String]
-               for k in self.keys do terms.add k
-               for k in other.keys do terms.add k
-
-               # Get dot product of two verctors
-               var dot = 0
-               for term in terms do
-                       dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
-               end
-
-               return dot.to_f / (self.norm * other.norm)
-       end
-
-       # The norm of the vector.
-       #
-       # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
-       #
-       # ~~~
-       # var v = new NLPVector
-       # v["x"] = 1
-       # v["y"] = 1
-       # v["z"] = 1
-       # v["t"] = 1
-       # assert v.norm.is_approx(2.0, 0.001)
-       #
-       # v["x"] = 1
-       # v["y"] = 2
-       # v["z"] = 3
-       # v["t"] = 0
-       # assert v.norm.is_approx(3.742, 0.001)
-       # ~~~
-       fun norm: Float do
-               var sum = 0
-               for v in self.values do sum += v ** 2
-               return sum.to_f.sqrt
-       end
-end
diff --git a/lib/vsm/README.md b/lib/vsm/README.md

new file mode 100644 (file)

index 0000000..ea50132
--- /dev/null
+++ b/lib/vsm/README.md
@@ -0,0 +1,103 @@
+# Vector Space Model
+
+Vector Space Model (VSM) is an algebraic model for representing text documents
+(and any objects, in general) as vectors of identifiers, such as, for example,
+index terms.
+
+It is used in information filtering, information retrieval, indexing and
+relevancy rankings.
+
+The `vsm` package provides the following features:
+* Vector comparison with cosine similarity.
+* Vector indexing and matching with tf * idf.
+* File indexing and matching to free text queries.
+
+## Vectors
+
+With VSM, documents are represented by a n-dimensions vector.
+Each dimension represent an attribute of the document or object.
+
+For text document, the count of each term found in the document if often used to
+build vectors.
+
+### Creating a vector
+
+~~~
+var vector = new Vector
+vector["term1"] = 2.0
+vector["term2"] = 1.0
+assert vector["term1"] == 2.0
+assert vector["term2"] == 1.0
+assert vector.norm.is_approx(2.236, 0.001)
+~~~
+
+### Comparing vectors
+
+~~~
+var v1 = new Vector
+v1["term1"] = 1.0
+v1["term2"] = 2.0
+
+var v2 = new Vector
+v2["term2"] = 1.0
+v2["term3"] = 3.0
+
+var query = new Vector
+query["term2"] = 1.0
+
+var s1 = query.cosine_similarity(v1)
+var s2 = query.cosine_similarity(v2)
+assert s1 > s2
+~~~
+
+## VSMIndex
+
+VSMIndex is a Document index based on VSM.
+
+Using VSMIndex you can index documents associated with their vector.
+Documents can then be matched to query vectors.
+
+This represents a minimalistic search engine.
+
+~~~
+var index = new VSMIndex
+
+var d1 = new Document("Doc 1", "/uri/1", v1)
+index.index_document(d1)
+
+var d2 = new Document("Doc 2", "/uri/2", v2)
+index.index_document(d2)
+
+assert index.documents.length == 2
+
+query = new Vector
+query["term1"] = 1.0
+
+var matches = index.match_vector(query)
+assert matches.first.document == d1
+~~~
+
+## StringIndex
+
+The StringIndex provides usefull services to index and match strings.
+
+~~~
+index = new StringIndex
+
+d1 = index.index_string("Doc 1", "/uri/1", "this is a sample")
+d2 = index.index_string("Doc 2", "/uri/2", "this and this is another example")
+assert index.documents.length == 2
+
+matches = index.match_string("this sample")
+assert matches.first.document == d1
+~~~
+
+## FileIndex
+
+The FileIndex is a StringIndex able to index and retrieve files.
+
+~~~nit
+index = new FileIndex
+
+index.index_files(["/path/to/doc/1", "/path/to/doc/2"])
+~~~
diff --git a/lib/vsm/examples/example_vsm.nit b/lib/vsm/examples/example_vsm.nit

new file mode 100644 (file)

index 0000000..5f1192b
--- /dev/null
+++ b/lib/vsm/examples/example_vsm.nit
@@ -0,0 +1,65 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example using a `FileIndex`
+#
+# This example shows of to index files from the system and retrieve them
+# with text queries.
+module example_vsm
+
+import vsm
+import config
+
+redef class Config
+
+       # --whitelist-exts
+       var opt_white_exts = new OptionArray("Allowed file extensions (default is [])",
+               "-w", "--whitelist-exts")
+
+       # --blacklist-exts
+       var opt_black_exts = new OptionArray("Allowed file extensions (default is [])",
+               "-b", "--blacklist-exts")
+
+       redef init do
+               opts.add_option(opt_white_exts, opt_black_exts)
+       end
+end
+
+var config = new Config
+config.tool_description = "usage: example_vsm <files>"
+config.parse_options(args)
+
+if args.length < 1 then
+       config.usage
+       exit 1
+end
+
+var index = new FileIndex
+index.whitelist_exts = config.opt_white_exts.value
+index.blacklist_exts = config.opt_black_exts.value
+
+print "Building index..."
+index.index_files(args, true)
+print "Indexed {index.documents.length} documents"
+
+loop
+       print "\nEnter query:"
+       printn "> "
+       var input = sys.stdin.read_line
+       var matches = index.match_string(input)
+       printn ""
+       for match in matches do
+               print match
+       end
+end
diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit

new file mode 100644 (file)

index 0000000..41d5152
--- /dev/null
+++ b/lib/vsm/vsm.nit
@@ -0,0 +1,374 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Vector Space Model
+#
+# Vector Space Model (VSM) is an algebraic model for representing text documents
+# (and any objects, in general) as vectors of identifiers, such as, for example,
+# index terms.
+#
+# It is used in information filtering, information retrieval, indexing and
+# relevancy rankings.
+module vsm
+
+import counter
+
+# A n-dimensions vector
+#
+# *n-dimensions* vectors are used to represent a text document or an object.
+class Vector
+       super HashMap[nullable Object, Float]
+
+       # Cosine similarity of `self` and `other`.
+       #
+       # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
+       # two vectors are orthogonal and 1.0 means that they are identical.
+       #
+       # ~~~
+       # var v1 = new Vector
+       # v1["x"] = 1.0
+       # v1["y"] = 2.0
+       # v1["z"] = 3.0
+       #
+       # var v2 = new Vector
+       # v2["x"] = 1.0
+       # v2["y"] = 2.0
+       # v2["z"] = 3.0
+       #
+       # var v3 = new Vector
+       # v3["a"] = 1.0
+       # v3["b"] = 2.0
+       # v3["c"] = 3.0
+       #
+       # print v1.cosine_similarity(v2)
+       # assert v1.cosine_similarity(v2) == 1.0
+       # print v1.cosine_similarity(v3)
+       # assert v1.cosine_similarity(v3) == 0.0
+       # ~~~
+       fun cosine_similarity(other: SELF): Float do
+               # Collect terms
+               var terms = new HashSet[nullable Object]
+               for k in self.keys do terms.add k
+               for k in other.keys do terms.add k
+
+               # Get dot product of two vectors
+               var dot = 0.0
+               for term in terms do
+                       dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
+               end
+               var cos = dot.to_f / (self.norm * other.norm)
+               if cos.is_nan then return 0.0
+               return cos
+       end
+
+       # The norm of the vector.
+       #
+       # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
+       #
+       # ~~~
+       # var v = new Vector
+       # v["x"] = 1.0
+       # v["y"] = 1.0
+       # v["z"] = 1.0
+       # v["t"] = 1.0
+       # assert v.norm.is_approx(2.0, 0.001)
+       #
+       # v["x"] = 1.0
+       # v["y"] = 2.0
+       # v["z"] = 3.0
+       # v["t"] = 0.0
+       # assert v.norm.is_approx(3.742, 0.001)
+       # ~~~
+       fun norm: Float do
+               var sum = 0.0
+               for v in self.values do sum += v.pow(2.0)
+               return sum.to_f.sqrt
+       end
+
+       redef fun to_s do
+               return "[{join(", ", ":")}]"
+       end
+end
+
+# A Document index based on VSM
+#
+# Using VSMIndex you can index documents associated with their vector.
+# Documents can then be matched to query vectors.
+class VSMIndex
+
+       # Documents index
+       #
+       # TODO use a more efficient representation.
+       var documents = new HashSet[Document]
+
+       # Count for all terms in all indexed documents
+       #
+       # Used to compute the `inverse_doc_frequency`.
+       var terms_doc_count = new Vector
+
+       # Inverse document frequency
+       #
+       # The inverse document frequency is a measure of how much information a term
+       # provides, that is, whether the term is common or rare across all documents.
+       var inverse_doc_frequency = new Vector
+
+       # Used to sort matches
+       #
+       # See `IndexMatch`.
+       var sorter = new IndexMatchSorter
+
+       # Match `query` vector to all index document vectors
+       #
+       # Returns an `IndexMatch` for each indexed document.
+       # Results are ordered by descending similarity.
+       fun match_vector(query: Vector): Array[IndexMatch] do
+               var matches = new Array[IndexMatch]
+               for doc in documents do
+                       var sim = query.cosine_similarity(doc.tfidf)
+                       if sim == 0.0 then continue
+                       matches.add new IndexMatch(doc, sim)
+               end
+               sorter.sort(matches)
+               return matches
+       end
+
+       # Index a document
+       #
+       # With each new document, the `inverse_doc_frequency` must be updated.
+       # By default, the method `update_index` is called after each call to
+       # `index_document`.
+       #
+       # When processing batch documents, use `auto_update = false` to disable
+       # the auto update of the index.
+       fun index_document(doc: Document, auto_update: nullable Bool) do
+               for term, count in doc.terms_count do
+                       if not terms_doc_count.has_key(term) then
+                               terms_doc_count[term] = 1.0
+                       else
+                               terms_doc_count[term] += 1.0
+                       end
+               end
+               documents.add doc
+               if auto_update == null or auto_update then update_index
+       end
+
+       # Update the index
+       #
+       # Recompute the `inverse_doc_frequency` values.
+       # Must be called manually after indexing new document with the option
+       # `auto_update = false`.
+       fun update_index do
+               for doc in documents do
+                       for term, ccount in doc.terms_count do
+                               inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
+                       end
+               end
+               for doc in documents do
+                       for term, freq in doc.terms_frequency do
+                               doc.tfidf[term] = freq * inverse_doc_frequency[term]
+                       end
+               end
+       end
+end
+
+# A VSM index to store strings
+class StringIndex
+       super VSMIndex
+
+       # Index a new Document from `title`, `uri` and string `string`.
+       #
+       # Return the Document created.
+       #
+       # See `index_document`.
+       fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do
+               var vector = parse_string(string)
+               var doc = new Document(title, uri, vector)
+               index_document(doc, auto_update)
+               return doc
+       end
+
+       # Match the `query` string against all indexed documents
+       #
+       # See `match_vector`.
+       fun match_string(query: String): Array[IndexMatch] do
+               var vector = parse_string(query)
+               return match_vector(vector)
+       end
+
+       # Parse the `string` as a Vector
+       #
+       # Returns a vector containing the terms of `string`.
+       fun parse_string(string: String): Vector do
+               var reader = new StringReader(string)
+               var vector = new Vector
+               loop
+                       var token = reader.read_word
+                       if token == "" then break
+
+                       if not vector.has_key(token) then
+                               vector[token] = 1.0
+                       else
+                               vector[token] += 1.0
+                       end
+               end
+               return vector
+       end
+end
+
+# A VSMIndex to index files
+class FileIndex
+       super StringIndex
+
+       # Index a file from its `path`.
+       #
+       # Return the created document or null if `path` is not accepted by `accept_file`.
+       #
+       # See `index_document`.
+       fun index_file(path: String, auto_update: nullable Bool): nullable Document do
+               if not accept_file(path) then return null
+               var vector = parse_file(path)
+               var doc = new Document(path, path, vector)
+               index_document(doc, auto_update)
+               return doc
+       end
+
+       # Index multiple files
+       #
+       # The recursive method `index_dir` will be called for each directory found
+       # in `paths`.
+       #
+       # See `index_file`
+       fun index_files(paths: Collection[String], auto_update: nullable Bool) do
+               for path in paths do
+                       if path.to_path.is_dir then
+                               index_dir(path, false)
+                       else
+                               index_file(path, false)
+                       end
+               end
+               if auto_update != null and auto_update then update_index
+       end
+
+       # Index all files in `dir` recursively
+       #
+       # See `index_file`.
+       fun index_dir(dir: String, auto_update: nullable Bool) do
+               if not dir.to_path.is_dir then return
+               for file in dir.files do
+                       var path = dir / file
+                       if path.to_path.is_dir then
+                               index_dir(path, false)
+                       else
+                               index_file(path, false)
+                       end
+               end
+               if auto_update != null and auto_update then update_index
+       end
+
+       # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
+       fun accept_file(path: String): Bool do
+               var ext = path.file_extension
+               if ext != null then
+                       ext = ext.to_lower
+                       if blacklist_exts.has(ext) then return false
+                       if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
+               end
+               return whitelist_exts.is_empty
+       end
+
+       # Parse the `file` content as a Vector
+       #
+       # See `parse_string`.
+       fun parse_file(file: String): Vector do
+               return parse_string(file.to_path.read_all)
+       end
+
+       # File extensions white list
+       #
+       # If not empty, only files with these extensions will be indexed.
+       #
+       # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
+       # blacklist will prevail and the file will be ignored.
+       var whitelist_exts = new Array[String] is writable
+
+       # File extensions black list
+       #
+       # Files with these extensions will not be indexed.
+       var blacklist_exts = new Array[String] is writable
+end
+
+# A Document to add in a VSMIndex
+class Document
+
+       # Document title
+       var title: String
+
+       # Document URI
+       var uri: String
+
+       # Count of all terms found in the document
+       #
+       # Used to compute the document `terms_frequency`.
+       var terms_count: Vector
+
+       # Frequency of each term found in the document
+       #
+       # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
+       var terms_frequency: Vector is lazy do
+               var all_terms = 0.0
+               for t, c in terms_count do all_terms += c
+
+               var vector = new Vector
+               for t, c in terms_count do
+                       vector[t] = c / all_terms
+               end
+               return vector
+       end
+
+       # Term frequency–Inverse document frequency for each term
+       #
+       # A high weight in tf–idf is reached by a high term frequency
+       # (in the given document) and a low document frequency of the term in the
+       # whole collection of documents
+       var tfidf = new Vector
+
+       redef fun to_s do return "{title}"
+end
+
+# A match to a `request` in an `Index`
+class IndexMatch
+       super Comparable
+
+       # Document matching the `request_vector`
+       var document: Document
+
+       # Similarity between the `request` and the `doc`.
+       #
+       # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
+       # means perfect similarity.
+       var similarity: Float
+
+       redef fun to_s do return "{document} ({similarity})"
+end
+
+# Sort matches by similarity
+class IndexMatchSorter
+       super DefaultComparator
+
+       redef type COMPARED: IndexMatch
+
+       redef fun compare(a, b) do
+               return b.similarity <=> a.similarity
+       end
+end
diff --git a/tests/sav/example_vsm.res b/tests/sav/example_vsm.res

new file mode 100644 (file)

index 0000000..0e44311
--- /dev/null
+++ b/tests/sav/example_vsm.res
@@ -0,0 +1,4 @@
+usage: example_vsm <files>
+  -h, --help             Show this help message
+  -w, --whitelist-exts   Allowed file extensions (default is [])
+  -b, --blacklist-exts   Allowed file extensions (default is [])
author	Jean Privat <jean@pryen.org>
	Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)
committer	Jean Privat <jean@pryen.org>
	Tue, 10 Oct 2017 16:52:01 +0000 (12:52 -0400)
lib/nlp/README.md		patch \| blob \| history
lib/nlp/nlp.nit		patch \| blob \| history
lib/nlp/vsm.nit	[deleted file]	patch \| blob \| history
lib/vsm/README.md	[new file with mode: 0644]	patch \| blob
lib/vsm/examples/example_vsm.nit	[new file with mode: 0644]	patch \| blob
lib/vsm/vsm.nit	[new file with mode: 0644]	patch \| blob
tests/sav/example_vsm.res	[new file with mode: 0644]	patch \| blob