lib/nlp: introduce Vector Space Model with NLPVectors
[nit.git] / lib / nlp / vsm.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Vector Space Model.
16 #
17 # The Vector Space Model (VSM) is used to compare natural language texts.
18 # Texts are translated to multidimensional vectors then compared by cosine
19 # similarity.
20 module vsm
21
22 import counter
23
24 # A multi-dimensional vector.
class NLPVector
	super Counter[String]

	# Cosine similarity of `self` and `other`.
	#
	# Gives the proximity in the range `[0.0 .. 1.0]` (for non-negative
	# counts) where 0.0 means that the two vectors are orthogonal and 1.0
	# means that they point in the same direction.
	#
	# ~~~
	# var v1 = new NLPVector
	# v1["x"] = 1
	# v1["y"] = 2
	# v1["z"] = 3
	#
	# var v2 = new NLPVector
	# v2["x"] = 1
	# v2["y"] = 2
	# v2["z"] = 3
	#
	# var v3 = new NLPVector
	# v3["a"] = 1
	# v3["b"] = 2
	# v3["c"] = 3
	#
	# assert v1.cosine_similarity(v2).is_approx(1.0, 0.001)
	# assert v1.cosine_similarity(v3) == 0.0
	# ~~~
	#
	# A vector whose norm is zero (empty, or holding only zero counts) has no
	# direction, so its similarity with any vector is defined as 0.0 instead
	# of the `NaN` a plain `0.0 / 0.0` division would produce:
	#
	# ~~~
	# var v = new NLPVector
	# v["x"] = 1
	# var empty = new NLPVector
	# assert v.cosine_similarity(empty) == 0.0
	# assert empty.cosine_similarity(v) == 0.0
	# assert empty.cosine_similarity(empty) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Guard against the division by zero below
		var norms = self.norm * other.norm
		if norms == 0.0 then return 0.0

		# Get the dot product of the two vectors.
		#
		# A term absent from `self` would be multiplied by 0, so walking the
		# terms of `self` alone gives the same sum as walking the union of
		# both key sets.
		var dot = 0
		for term, count in self do
			dot += count * other.get_or_default(term, 0)
		end

		return dot.to_f / norms
	end

	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# ~~~
	# var v = new NLPVector
	# v["x"] = 1
	# v["y"] = 1
	# v["z"] = 1
	# v["t"] = 1
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1
	# v["y"] = 2
	# v["z"] = 3
	# v["t"] = 0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	#
	# An empty vector has a norm of 0.0:
	#
	# ~~~
	# assert (new NLPVector).norm == 0.0
	# ~~~
	fun norm: Float do
		# Sum the squares as integers, then convert once before the root
		var sum = 0
		for v in self.values do sum += v ** 2
		return sum.to_f.sqrt
	end
end