1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Vector Space Model.
17 # The Vector Space Model (VSM) is used to compare natural language texts.
18 # Texts are translated to multidimensional vectors then compared by cosine
24 # A multi-dimensional vector.
28 # Cosine similarity of `self` and `other`.
30 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
31 # two vectors are orthogonal and 1.0 means that they point in the same direction.
34 # var v1 = new NLPVector
39 # var v2 = new NLPVector
44 # var v3 = new NLPVector
49 # print v1.cosine_similarity(v2)
50 # #assert v1.cosine_similarity(v2) == 1.0
51 # print v1.cosine_similarity(v3)
52 # assert v1.cosine_similarity(v3) == 0.0
fun cosine_similarity(other: SELF): Float do
	# Returns `(self . other) / (||self|| * ||other||)`, or 0.0 when that
	# quotient is undefined because one of the norms is zero.

	# Collect the union of the keys (dimensions) used by either vector.
	var terms = new HashSet[String]
	for k in self.keys do terms.add k
	for k in other.keys do terms.add k

	# Get dot product of two vectors.
	# A key missing from one vector defaults to 0 and so contributes nothing.
	var dot = 0
	for term in terms do
		dot += self.get_or_default(term, 0) * other.get_or_default(term, 0)
	end

	# cos(a, b) = (a . b) / (||a|| * ||b||)
	var denom = self.norm * other.norm

	# A zero denominator would make the division below yield NaN instead of
	# a similarity value, so report "no similarity" explicitly.
	if denom == 0.0 then return 0.0

	return dot.to_f / denom
end
69 # The norm of the vector.
71 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
74 # var v = new NLPVector
79 # assert v.norm.is_approx(2.0, 0.001)
85 # assert v.norm.is_approx(3.742, 0.001)
89 for v
in self.values
do sum
+= v
** 2