lib/vsm: accept anything as a dimension
[nit.git] / lib / vsm / vsm.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Vector Space Model
16 #
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
19 # index terms.
20 #
21 # It is used in information filtering, information retrieval, indexing and
22 # relevancy rankings.
23 module vsm
24
25 import counter
26
# An n-dimensional vector
#
# *n-dimensional* vectors are used to represent a text document or an object.
#
# Each key is a dimension (any object, `null` included) and each value is the
# coordinate along that dimension. `cosine_similarity` treats a dimension that
# is absent from a vector as having the coordinate `0.0`.
class Vector
	super HashMap[nullable Object, Float]

	# Cosine similarity of `self` and `other`.
	#
	# Gives the proximity of the two vectors, where 0.0 means that they are
	# orthogonal and 1.0 means that they point in the same direction
	# (so a vector and any of its positive multiples have a similarity of 1.0).
	#
	# The result lies in the range `[0.0 .. 1.0]` as long as all the
	# coordinates are non-negative; negative coordinates can give a
	# negative similarity.
	#
	# When the similarity is undefined (the computation gives NaN, e.g. when
	# one of the vectors has a norm of 0.0), 0.0 is returned.
	#
	# ~~~
	# var v1 = new Vector
	# v1["x"] = 1.0
	# v1["y"] = 2.0
	# v1["z"] = 3.0
	#
	# var v2 = new Vector
	# v2["x"] = 1.0
	# v2["y"] = 2.0
	# v2["z"] = 3.0
	#
	# var v3 = new Vector
	# v3["a"] = 1.0
	# v3["b"] = 2.0
	# v3["c"] = 3.0
	#
	# print v1.cosine_similarity(v2)
	# assert v1.cosine_similarity(v2) == 1.0
	# print v1.cosine_similarity(v3)
	# assert v1.cosine_similarity(v3) == 0.0
	#
	# var empty = new Vector
	# assert v1.cosine_similarity(empty) == 0.0
	# ~~~
	fun cosine_similarity(other: SELF): Float do
		# Dot product of the two vectors.
		#
		# A dimension missing from one of the vectors contributes `0.0` to
		# the sum, so visiting the dimensions of `self` is enough: there is
		# no need to build the union of both key sets.
		var dot = 0.0
		for dim, value in self do
			dot += value * other.get_or_default(dim, 0.0)
		end

		var cos = dot / (self.norm * other.norm)
		# `0.0 / 0.0` (at least one vector has a null norm) gives NaN:
		# report it as "no similarity" instead of leaking NaN to the caller.
		if cos.is_nan then return 0.0
		return cos
	end

	# The norm of the vector.
	#
	# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
	#
	# The norm of an empty vector is 0.0.
	#
	# ~~~
	# var v = new Vector
	# v["x"] = 1.0
	# v["y"] = 1.0
	# v["z"] = 1.0
	# v["t"] = 1.0
	# assert v.norm.is_approx(2.0, 0.001)
	#
	# v["x"] = 1.0
	# v["y"] = 2.0
	# v["z"] = 3.0
	# v["t"] = 0.0
	# assert v.norm.is_approx(3.742, 0.001)
	# ~~~
	fun norm: Float do
		# Sum of the squared coordinates
		var sum = 0.0
		for v in self.values do sum += v * v
		return sum.sqrt
	end

	# Textual form `[dim1:value1, dim2:value2, ...]`.
	redef fun to_s do
		return "[{join(", ", ":")}]"
	end
end