1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # An n-dimensional vector
29 # *n-dimensional* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` (for vectors with non-negative components) where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they point in the same direction (identical vectors give 1.0).
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
# Build the set of every key found in `self` or in `other`; using a set
# means a key present in both vectors is only recorded once.
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
# Each factor is looked up with `get_or_default(term, 0.0)`, i.e. 0.0 is
# passed as the fallback value for both vectors.
# NOTE(review): the declaration of `dot` and the loop that binds `term`
# are not visible in this excerpt; presumably `dot` starts at 0.0 and
# `term` iterates over `terms` (confirm against the full file).
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
# Divide the dot product by the product of the two norms.
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
# A NaN quotient is reported as 0.0 instead of being returned as-is.
# NOTE(review): presumably NaN arises when a norm is 0.0 (0.0 / 0.0),
# e.g. for an empty vector (confirm).
71 if cos
.is_nan
then return 0.0
75 # The norm of the vector.
77 # `||x|| = (x1 ** 2 + ... + xn ** 2).sqrt`
85 # assert v.norm.is_approx(2.0, 0.001)
91 # assert v.norm.is_approx(3.742, 0.001)
95 for v
in self.values
do sum
+= v
.pow
(2.0)
100 return "[{join(", ", ":")}]"