vsm :: Vector :: defaultinit
# An n-dimensional vector
#
# *n-dimensional* vectors are used to represent a text document or an object.
# Each key is a dimension (a term) and each value is the weight of that term.
# Reading a term that was never set gives `0.0` (see `[]`).
class Vector
	super HashMap[nullable Object, Float]

	# Cosine similarity of `self` and `other`.
	#
	# Computed as `dot(self, other) / (self.norm * other.norm)`.
	#
	# For vectors holding only non-negative weights the result lies in the
	# range `[0.0 .. 1.0]`, where 0.0 means that the two vectors are orthogonal
	# (they share no term) and 1.0 means that they point in the same direction
	# (e.g. they are identical).
	#
	# Returns 0.0 when the division yields NaN, which is the case when one of
	# the vectors is empty or contains only zeros.
	#
	# ~~~
	# var v1 = new Vector
	# v1["x"] = 1.0
	# v1["y"] = 2.0
	# v1["z"] = 3.0
	#
	# var v2 = new Vector
	# v2["x"] = 1.0
	# v2["y"] = 2.0
	# v2["z"] = 3.0
	#
	# var v3 = new Vector
	# v3["a"] = 1.0
	# v3["b"] = 2.0
	# v3["c"] = 3.0
	#
	# print v1.cosine_similarity(v2)
	# assert v1.cosine_similarity(v2) == 1.0
	# print v1.cosine_similarity(v3)
	# assert v1.cosine_similarity(v3) == 0.0
	# assert v1.cosine_similarity(new Vector) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Dot product of the two vectors.
		#
		# Only the terms of `self` need to be visited: a term that is absent
		# from `self` has a weight of 0.0 and adds nothing to the sum, so there
		# is no need to build the union of both key sets.
		var dot = 0.0
		for term, weight in self do
			dot += weight * other.get_or_default(term, 0.0)
		end

		var cos = dot / (self.norm * other.norm)
		# A zero norm gives 0.0 / 0.0, that is NaN: report "no similarity".
		if cos.is_nan then return 0.0
		return cos
	end

	# The weight associated to the term `k`, or 0.0 if `k` is not in the vector.
	#
	# Unlike a plain `HashMap`, reading a missing key is therefore not an error.
	redef fun [](k) do
		if has_key(k) then return super
		return 0.0
	end

	# Increment value for `obj` term
	#
	# If the term isn't already in the vector, the new value is 1.0.
	fun inc(obj: nullable Object) do
		# `self[obj]` reads as 0.0 for a missing term (see `[]`), so a single
		# statement covers both the "new term" and "known term" cases.
		self[obj] += 1.0
	end

	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# The norm of an empty vector is 0.0.
	#
	# ~~~
	# var v = new Vector
	# v["x"] = 1.0
	# v["y"] = 1.0
	# v["z"] = 1.0
	# v["t"] = 1.0
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1.0
	# v["y"] = 2.0
	# v["z"] = 3.0
	# v["t"] = 0.0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	fun norm: Float do
		var sum = 0.0
		for v in self.values do sum += v.pow(2.0)
		return sum.sqrt
	end

	# Textual form of the vector: its entries joined with `", "` (between
	# entries) and `":"` (between a key and its value), enclosed in brackets.
	redef fun to_s do
		return "[{join(", ", ":")}]"
	end
end
lib/vsm/vsm.nit:27,1--118,3