From: Alexandre Terrasa
Date: Wed, 20 Sep 2017 22:23:40 +0000 (-0400)
Subject: lib/vsm: accept anything as a dimension
X-Git-Url: http://nitlanguage.org

lib/vsm: accept anything as a dimension

Signed-off-by: Alexandre Terrasa
---
Note (review): this patch had been flattened onto a single line. Line breaks
were restored so that every hunk matches its declared line counts. The tab
indentation and the position of the blank context line in the
lib/nlp/nlp.nit hunk are inferred -- verify against the repository before
applying.

diff --git a/lib/nlp/nlp.nit b/lib/nlp/nlp.nit
index bc96938..7f9510a 100644
--- a/lib/nlp/nlp.nit
+++ b/lib/nlp/nlp.nit
@@ -23,8 +23,8 @@ import vsm
 redef class NLPDocument
 
 	# `NLPVector` representing `self`.
-	var vector: Vector[String] is lazy do
-		var vector = new Vector[String]
+	var vector: Vector is lazy do
+		var vector = new Vector
 		for sentence in sentences do
 			for token in sentence.tokens do
 				if not keep_pos_token(token) then continue
diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit
index d7b5027..e34eb85 100644
--- a/lib/vsm/vsm.nit
+++ b/lib/vsm/vsm.nit
@@ -28,7 +28,7 @@ import counter
 #
 # *n-dimensions* vectors are used to represent a text document or an object.
 class Vector
-	super Counter[String]
+	super HashMap[nullable Object, Float]
 
 	# Cosine similarity of `self` and `other`.
 	#
@@ -58,17 +58,18 @@ class Vector
 	# ~~~
 	fun cosine_similarity(other: SELF): Float do
 		# Collect terms
-		var terms = new HashSet[String]
+		var terms = new HashSet[nullable Object]
 		for k in self.keys do terms.add k
 		for k in other.keys do terms.add k
 
 		# Get dot product of two vectors
-		var dot = 0
+		var dot = 0.0
 		for term in terms do
-			dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
+			dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
 		end
-
-		return dot.to_f / (self.norm * other.norm)
+		var cos = dot.to_f / (self.norm * other.norm)
+		if cos.is_nan then return 0.0
+		return cos
 	end
 
 	# The norm of the vector.
@@ -90,8 +91,12 @@ class Vector
 	# assert v.norm.is_approx(3.742, 0.001)
 	# ~~~
 	fun norm: Float do
-		var sum = 0
-		for v in self.values do sum += v ** 2
+		var sum = 0.0
+		for v in self.values do sum += v.pow(2.0)
 		return sum.to_f.sqrt
 	end
+
+	redef fun to_s do
+		return "[{join(", ", ":")}]"
+	end
 end