lib/vsm: default tfidf values are extracted from terms frequencies
[nit.git] / lib / vsm / vsm.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Vector Space Model
16 #
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
19 # index terms.
20 #
21 # It is used in information filtering, information retrieval, indexing and
22 # relevancy rankings.
23 module vsm
24
25 import counter
26
27 # A n-dimensions vector
28 #
29 # *n-dimensions* vectors are used to represent a text document or an object.
30 class Vector
31 super HashMap[nullable Object, Float]
32
33 # Cosine similarity of `self` and `other`.
34 #
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
37 #
38 # ~~~
39 # var v1 = new Vector
40 # v1["x"] = 1.0
41 # v1["y"] = 2.0
42 # v1["z"] = 3.0
43 #
44 # var v2 = new Vector
45 # v2["x"] = 1.0
46 # v2["y"] = 2.0
47 # v2["z"] = 3.0
48 #
49 # var v3 = new Vector
50 # v3["a"] = 1.0
51 # v3["b"] = 2.0
52 # v3["c"] = 3.0
53 #
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
58 # ~~~
59 fun cosine_similarity(other: SELF): Float do
60 # Collect terms
61 var terms = new HashSet[nullable Object]
62 for k in self.keys do terms.add k
63 for k in other.keys do terms.add k
64
65 # Get dot product of two vectors
66 var dot = 0.0
67 for term in terms do
68 dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
69 end
70 var cos = dot.to_f / (self.norm * other.norm)
71 if cos.is_nan then return 0.0
72 return cos
73 end
74
75 redef fun [](k) do
76 if not has_key(k) then return 0.0
77 return super
78 end
79
80 # Increment value for `obj` term
81 #
82 # If the term isn't already in the vector, the new value is 1.0.
83 fun inc(obj: nullable Object) do
84 if has_key(obj) then
85 self[obj] += 1.0
86 else
87 self[obj] = 1.0
88 end
89 end
90
91 # The norm of the vector.
92 #
93 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
94 #
95 # ~~~
96 # var v = new Vector
97 # v["x"] = 1.0
98 # v["y"] = 1.0
99 # v["z"] = 1.0
100 # v["t"] = 1.0
101 # assert v.norm.is_approx(2.0, 0.001)
102 #
103 # v["x"] = 1.0
104 # v["y"] = 2.0
105 # v["z"] = 3.0
106 # v["t"] = 0.0
107 # assert v.norm.is_approx(3.742, 0.001)
108 # ~~~
109 fun norm: Float do
110 var sum = 0.0
111 for v in self.values do sum += v.pow(2.0)
112 return sum.to_f.sqrt
113 end
114
115 redef fun to_s do
116 return "[{join(", ", ":")}]"
117 end
118 end
119
120 # A Document index based on VSM
121 #
122 # Using VSMIndex you can index documents associated with their vector.
123 # Documents can then be matched to query vectors.
124 class VSMIndex
125
126 # Kind of documents stored in this index
127 #
128 # Clients can redefine this type to specialize the index.
129 type DOC: Document
130
131 # Documents index
132 var documents = new HashSet[DOC]
133
134 # Inversed index
135 #
136 # Link documents to existing terms.
137 var inversed_index = new HashMap[nullable Object, Array[DOC]]
138
139 # Count for all terms in all indexed documents
140 #
141 # Used to compute the `inverse_doc_frequency`.
142 var terms_doc_count = new Vector
143
144 # Inverse document frequency
145 #
146 # The inverse document frequency is a measure of how much information a term
147 # provides, that is, whether the term is common or rare across all documents.
148 var inverse_doc_frequency = new Vector
149
150 # Used to sort matches
151 #
152 # See `IndexMatch`.
153 var sorter = new IndexMatchSorter
154
155 # Match `query` vector to all index document vectors
156 #
157 # Returns an `IndexMatch` for each indexed document.
158 # Results are ordered by descending similarity.
159 fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
160 var documents = new HashSet[DOC]
161 for term, count in query do
162 if inversed_index.has_key(term) then
163 documents.add_all inversed_index[term]
164 end
165 end
166 var matches = new Array[IndexMatch[DOC]]
167 for doc in documents do
168 var sim = query.cosine_similarity(doc.tfidf)
169 if sim == 0.0 then continue
170 matches.add new IndexMatch[DOC](doc, sim)
171 end
172 sorter.sort(matches)
173 return matches
174 end
175
176 # Index a document
177 #
178 # With each new document, the `inverse_doc_frequency` must be updated.
179 # By default, the method `update_index` is called after each call to
180 # `index_document`.
181 #
182 # When processing batch documents, use `auto_update = false` to disable
183 # the auto update of the index.
184 fun index_document(doc: DOC, auto_update: nullable Bool) do
185 for term, count in doc.terms_count do
186 terms_doc_count.inc(term)
187 if not inversed_index.has_key(term) then
188 inversed_index[term] = new Array[DOC]
189 end
190 inversed_index[term].add doc
191 end
192 documents.add doc
193 if auto_update == null or auto_update then update_index
194 end
195
196 # Update the index
197 #
198 # Recompute the `inverse_doc_frequency` values.
199 # Must be called manually after indexing new document with the option
200 # `auto_update = false`.
201 fun update_index do
202 for doc in documents do
203 for term, ccount in doc.terms_count do
204 inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
205 end
206 end
207 for doc in documents do
208 for term, freq in doc.terms_frequency do
209 doc.tfidf[term] = freq * inverse_doc_frequency[term]
210 end
211 end
212 end
213 end
214
215 # A VSM index to store strings
216 class StringIndex
217 super VSMIndex
218
219 # Index a new Document from `title`, `uri` and string `string`.
220 #
221 # Return the Document created.
222 #
223 # See `index_document`.
224 fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
225 var vector = parse_string(string)
226 var doc = new Document(title, uri, vector)
227 index_document(doc, auto_update)
228 return doc
229 end
230
231 # Match the `query` string against all indexed documents
232 #
233 # See `match_vector`.
234 fun match_string(query: String): Array[IndexMatch[DOC]] do
235 var vector = parse_string(query)
236 var doc = new Document("", "", vector)
237 return match_vector(doc.terms_frequency)
238 end
239
240 # Parse the `string` as a Vector
241 #
242 # Returns a vector containing the terms of `string`.
243 fun parse_string(string: String): Vector do
244 var reader = new StringReader(string)
245 var vector = new Vector
246 loop
247 var token = reader.read_word
248 if token == "" then break
249 vector.inc(token)
250 end
251 return vector
252 end
253 end
254
255 # A VSMIndex to index files
256 class FileIndex
257 super StringIndex
258
259 # Index a file from its `path`.
260 #
261 # Return the created document or null if `path` is not accepted by `accept_file`.
262 #
263 # See `index_document`.
264 fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
265 if not accept_file(path) then return null
266 var vector = parse_file(path)
267 var doc = new Document(path, path, vector)
268 index_document(doc, auto_update)
269 return doc
270 end
271
272 # Index multiple files
273 #
274 # The recursive method `index_dir` will be called for each directory found
275 # in `paths`.
276 #
277 # See `index_file`
278 fun index_files(paths: Collection[String], auto_update: nullable Bool) do
279 for path in paths do
280 if path.to_path.is_dir then
281 index_dir(path, false)
282 else
283 index_file(path, false)
284 end
285 end
286 if auto_update != null and auto_update then update_index
287 end
288
289 # Index all files in `dir` recursively
290 #
291 # See `index_file`.
292 fun index_dir(dir: String, auto_update: nullable Bool) do
293 if not dir.to_path.is_dir then return
294 for file in dir.files do
295 var path = dir / file
296 if path.to_path.is_dir then
297 index_dir(path, false)
298 else
299 index_file(path, false)
300 end
301 end
302 if auto_update != null and auto_update then update_index
303 end
304
305 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
306 fun accept_file(path: String): Bool do
307 var ext = path.file_extension
308 if ext != null then
309 ext = ext.to_lower
310 if blacklist_exts.has(ext) then return false
311 if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
312 end
313 return whitelist_exts.is_empty
314 end
315
316 # Parse the `file` content as a Vector
317 #
318 # See `parse_string`.
319 fun parse_file(file: String): Vector do
320 return parse_string(file.to_path.read_all)
321 end
322
323 # File extensions white list
324 #
325 # If not empty, only files with these extensions will be indexed.
326 #
327 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
328 # blacklist will prevail and the file will be ignored.
329 var whitelist_exts = new Array[String] is writable
330
331 # File extensions black list
332 #
333 # Files with these extensions will not be indexed.
334 var blacklist_exts = new Array[String] is writable
335 end
336
337 # A Document to add in a VSMIndex
338 class Document
339
340 # Document title
341 var title: String
342
343 # Document URI
344 var uri: String
345
346 # Count of all terms found in the document
347 #
348 # Used to compute the document `terms_frequency`.
349 var terms_count: Vector
350
351 # Frequency of each term found in the document
352 #
353 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
354 var terms_frequency: Vector is lazy do
355 var all_terms = 0.0
356 for t, c in terms_count do all_terms += c
357
358 var vector = new Vector
359 for t, c in terms_count do
360 vector[t] = c / all_terms
361 end
362 return vector
363 end
364
365 # Term frequency–Inverse document frequency for each term
366 #
367 # A high weight in tf–idf is reached by a high term frequency
368 # (in the given document) and a low document frequency of the term in the
369 # whole collection of documents
370 var tfidf: Vector = terms_count is lazy
371
372 redef fun to_s do return "{title}"
373 end
374
375 # A match to a `request` in an `Index`
376 class IndexMatch[DOC: Document]
377 super Comparable
378
379 # Document matching the `request_vector`
380 var document: DOC
381
382 # Similarity between the `request` and the `doc`.
383 #
384 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
385 # means perfect similarity.
386 var similarity: Float
387
388 redef fun to_s do return "{document} ({similarity})"
389 end
390
391 # Sort matches by similarity
392 class IndexMatchSorter
393 super DefaultComparator
394
395 redef type COMPARED: IndexMatch[Document]
396
397 redef fun compare(a, b) do
398 return b.similarity <=> a.similarity
399 end
400 end