lib/nlp: move vsm.nit to its own package
author Alexandre Terrasa <alexandre@moz-code.org>
Wed, 20 Sep 2017 22:23:09 +0000 (18:23 -0400)
committer Alexandre Terrasa <alexandre@moz-code.org>
Fri, 29 Sep 2017 18:48:31 +0000 (14:48 -0400)
We don't need nlp to use vsm

Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>

lib/nlp/nlp.nit
lib/vsm/vsm.nit [moved from lib/nlp/vsm.nit with 65% similarity]

index 4dd7cc9..bc96938 100644 (file)
@@ -23,8 +23,8 @@ import vsm
 redef class NLPDocument
 
        # `NLPVector` representing `self`.
-       var vector: NLPVector is lazy do
-               var vector = new NLPVector
+       var vector: Vector[String] is lazy do
+               var vector = new Vector[String]
                for sentence in sentences do
                        for token in sentence.tokens do
                                if not keep_pos_token(token) then continue
similarity index 65%
rename from lib/nlp/vsm.nit
rename to lib/vsm/vsm.nit
index 7fe0a84..d7b5027 100644 (file)
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# NLPVector Space Model.
+# Vector Space Model
 #
-# The Vector Space Model (VSM) is used to compare natural language texts.
-# Texts are translated to multidimensionnal vectors then compared by cosine
-# similarity.
+# Vector Space Model (VSM) is an algebraic model for representing text documents
+# (and any objects, in general) as vectors of identifiers, such as, for example,
+# index terms.
+#
+# It is used in information filtering, information retrieval, indexing and
+# relevancy rankings.
 module vsm
 
 import counter
 
-# A multi-dimensional vector.
-class NLPVector
+# An n-dimensional vector
+#
+# *n-dimensional* vectors are used to represent a text document or an object.
+class Vector
        super Counter[String]
 
        # Cosine similarity of `self` and `other`.
@@ -31,23 +36,23 @@ class NLPVector
        # two vectors are orthogonal and 1.0 means that they are identical.
        #
        # ~~~
-       # var v1 = new NLPVector
-       # v1["x"] = 1
-       # v1["y"] = 2
-       # v1["z"] = 3
+       # var v1 = new Vector
+       # v1["x"] = 1.0
+       # v1["y"] = 2.0
+       # v1["z"] = 3.0
        #
-       # var v2 = new NLPVector
-       # v2["x"] = 1
-       # v2["y"] = 2
-       # v2["z"] = 3
+       # var v2 = new Vector
+       # v2["x"] = 1.0
+       # v2["y"] = 2.0
+       # v2["z"] = 3.0
        #
-       # var v3 = new NLPVector
-       # v3["a"] = 1
-       # v3["b"] = 2
-       # v3["c"] = 3
+       # var v3 = new Vector
+       # v3["a"] = 1.0
+       # v3["b"] = 2.0
+       # v3["c"] = 3.0
        #
        # print v1.cosine_similarity(v2)
-       # #assert v1.cosine_similarity(v2) == 1.0
+       # assert v1.cosine_similarity(v2) == 1.0
        # print v1.cosine_similarity(v3)
        # assert v1.cosine_similarity(v3) == 0.0
        # ~~~
@@ -57,7 +62,7 @@ class NLPVector
                for k in self.keys do terms.add k
                for k in other.keys do terms.add k
 
-               # Get dot product of two verctors
+               # Get dot product of two vectors
                var dot = 0
                for term in terms do
                        dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
@@ -71,17 +76,17 @@ class NLPVector
        # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
        #
        # ~~~
-       # var v = new NLPVector
-       # v["x"] = 1
-       # v["y"] = 1
-       # v["z"] = 1
-       # v["t"] = 1
+       # var v = new Vector
+       # v["x"] = 1.0
+       # v["y"] = 1.0
+       # v["z"] = 1.0
+       # v["t"] = 1.0
        # assert v.norm.is_approx(2.0, 0.001)
        #
-       # v["x"] = 1
-       # v["y"] = 2
-       # v["z"] = 3
-       # v["t"] = 0
+       # v["x"] = 1.0
+       # v["y"] = 2.0
+       # v["z"] = 3.0
+       # v["t"] = 0.0
        # assert v.norm.is_approx(3.742, 0.001)
        # ~~~
        fun norm: Float do