lib/vsm: speedup matches using a reverse index
[nit.git] / lib / vsm / vsm.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Vector Space Model
16 #
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
19 # index terms.
20 #
21 # It is used in information filtering, information retrieval, indexing and
22 # relevancy rankings.
23 module vsm
24
25 import counter
26
27 # A n-dimensions vector
28 #
29 # *n-dimensions* vectors are used to represent a text document or an object.
30 class Vector
31 super HashMap[nullable Object, Float]
32
33 # Cosine similarity of `self` and `other`.
34 #
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
37 #
38 # ~~~
39 # var v1 = new Vector
40 # v1["x"] = 1.0
41 # v1["y"] = 2.0
42 # v1["z"] = 3.0
43 #
44 # var v2 = new Vector
45 # v2["x"] = 1.0
46 # v2["y"] = 2.0
47 # v2["z"] = 3.0
48 #
49 # var v3 = new Vector
50 # v3["a"] = 1.0
51 # v3["b"] = 2.0
52 # v3["c"] = 3.0
53 #
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
58 # ~~~
59 fun cosine_similarity(other: SELF): Float do
60 # Collect terms
61 var terms = new HashSet[nullable Object]
62 for k in self.keys do terms.add k
63 for k in other.keys do terms.add k
64
65 # Get dot product of two vectors
66 var dot = 0.0
67 for term in terms do
68 dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
69 end
70 var cos = dot.to_f / (self.norm * other.norm)
71 if cos.is_nan then return 0.0
72 return cos
73 end
74
75 redef fun [](k) do
76 if not has_key(k) then return 0.0
77 return super
78 end
79
80 # The norm of the vector.
81 #
82 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
83 #
84 # ~~~
85 # var v = new Vector
86 # v["x"] = 1.0
87 # v["y"] = 1.0
88 # v["z"] = 1.0
89 # v["t"] = 1.0
90 # assert v.norm.is_approx(2.0, 0.001)
91 #
92 # v["x"] = 1.0
93 # v["y"] = 2.0
94 # v["z"] = 3.0
95 # v["t"] = 0.0
96 # assert v.norm.is_approx(3.742, 0.001)
97 # ~~~
98 fun norm: Float do
99 var sum = 0.0
100 for v in self.values do sum += v.pow(2.0)
101 return sum.to_f.sqrt
102 end
103
104 redef fun to_s do
105 return "[{join(", ", ":")}]"
106 end
107 end
108
109 # A Document index based on VSM
110 #
111 # Using VSMIndex you can index documents associated with their vector.
112 # Documents can then be matched to query vectors.
113 class VSMIndex
114
115 # Kind of documents stored in this index
116 #
117 # Clients can redefine this type to specialize the index.
118 type DOC: Document
119
120 # Documents index
121 var documents = new HashSet[DOC]
122
123 # Inversed index
124 #
125 # Link documents to existing terms.
126 var inversed_index = new HashMap[nullable Object, Array[DOC]]
127
128 # Count for all terms in all indexed documents
129 #
130 # Used to compute the `inverse_doc_frequency`.
131 var terms_doc_count = new Vector
132
133 # Inverse document frequency
134 #
135 # The inverse document frequency is a measure of how much information a term
136 # provides, that is, whether the term is common or rare across all documents.
137 var inverse_doc_frequency = new Vector
138
139 # Used to sort matches
140 #
141 # See `IndexMatch`.
142 var sorter = new IndexMatchSorter
143
144 # Match `query` vector to all index document vectors
145 #
146 # Returns an `IndexMatch` for each indexed document.
147 # Results are ordered by descending similarity.
148 fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
149 var documents = new HashSet[DOC]
150 for term, count in query do
151 if inversed_index.has_key(term) then
152 documents.add_all inversed_index[term]
153 end
154 end
155 var matches = new Array[IndexMatch[DOC]]
156 for doc in documents do
157 var sim = query.cosine_similarity(doc.tfidf)
158 if sim == 0.0 then continue
159 matches.add new IndexMatch[DOC](doc, sim)
160 end
161 sorter.sort(matches)
162 return matches
163 end
164
165 # Index a document
166 #
167 # With each new document, the `inverse_doc_frequency` must be updated.
168 # By default, the method `update_index` is called after each call to
169 # `index_document`.
170 #
171 # When processing batch documents, use `auto_update = false` to disable
172 # the auto update of the index.
173 fun index_document(doc: DOC, auto_update: nullable Bool) do
174 for term, count in doc.terms_count do
175 if not terms_doc_count.has_key(term) then
176 terms_doc_count[term] = 1.0
177 else
178 terms_doc_count[term] += 1.0
179 end
180 if not inversed_index.has_key(term) then
181 inversed_index[term] = new Array[DOC]
182 end
183 inversed_index[term].add doc
184 end
185 documents.add doc
186 if auto_update == null or auto_update then update_index
187 end
188
189 # Update the index
190 #
191 # Recompute the `inverse_doc_frequency` values.
192 # Must be called manually after indexing new document with the option
193 # `auto_update = false`.
194 fun update_index do
195 for doc in documents do
196 for term, ccount in doc.terms_count do
197 inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
198 end
199 end
200 for doc in documents do
201 for term, freq in doc.terms_frequency do
202 doc.tfidf[term] = freq * inverse_doc_frequency[term]
203 end
204 end
205 end
206 end
207
208 # A VSM index to store strings
209 class StringIndex
210 super VSMIndex
211
212 # Index a new Document from `title`, `uri` and string `string`.
213 #
214 # Return the Document created.
215 #
216 # See `index_document`.
217 fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
218 var vector = parse_string(string)
219 var doc = new Document(title, uri, vector)
220 index_document(doc, auto_update)
221 return doc
222 end
223
224 # Match the `query` string against all indexed documents
225 #
226 # See `match_vector`.
227 fun match_string(query: String): Array[IndexMatch[DOC]] do
228 var vector = parse_string(query)
229 var doc = new Document("", "", vector)
230 return match_vector(doc.terms_frequency)
231 end
232
233 # Parse the `string` as a Vector
234 #
235 # Returns a vector containing the terms of `string`.
236 fun parse_string(string: String): Vector do
237 var reader = new StringReader(string)
238 var vector = new Vector
239 loop
240 var token = reader.read_word
241 if token == "" then break
242
243 if not vector.has_key(token) then
244 vector[token] = 1.0
245 else
246 vector[token] += 1.0
247 end
248 end
249 return vector
250 end
251 end
252
253 # A VSMIndex to index files
254 class FileIndex
255 super StringIndex
256
257 # Index a file from its `path`.
258 #
259 # Return the created document or null if `path` is not accepted by `accept_file`.
260 #
261 # See `index_document`.
262 fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
263 if not accept_file(path) then return null
264 var vector = parse_file(path)
265 var doc = new Document(path, path, vector)
266 index_document(doc, auto_update)
267 return doc
268 end
269
270 # Index multiple files
271 #
272 # The recursive method `index_dir` will be called for each directory found
273 # in `paths`.
274 #
275 # See `index_file`
276 fun index_files(paths: Collection[String], auto_update: nullable Bool) do
277 for path in paths do
278 if path.to_path.is_dir then
279 index_dir(path, false)
280 else
281 index_file(path, false)
282 end
283 end
284 if auto_update != null and auto_update then update_index
285 end
286
287 # Index all files in `dir` recursively
288 #
289 # See `index_file`.
290 fun index_dir(dir: String, auto_update: nullable Bool) do
291 if not dir.to_path.is_dir then return
292 for file in dir.files do
293 var path = dir / file
294 if path.to_path.is_dir then
295 index_dir(path, false)
296 else
297 index_file(path, false)
298 end
299 end
300 if auto_update != null and auto_update then update_index
301 end
302
303 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
304 fun accept_file(path: String): Bool do
305 var ext = path.file_extension
306 if ext != null then
307 ext = ext.to_lower
308 if blacklist_exts.has(ext) then return false
309 if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
310 end
311 return whitelist_exts.is_empty
312 end
313
314 # Parse the `file` content as a Vector
315 #
316 # See `parse_string`.
317 fun parse_file(file: String): Vector do
318 return parse_string(file.to_path.read_all)
319 end
320
321 # File extensions white list
322 #
323 # If not empty, only files with these extensions will be indexed.
324 #
325 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
326 # blacklist will prevail and the file will be ignored.
327 var whitelist_exts = new Array[String] is writable
328
329 # File extensions black list
330 #
331 # Files with these extensions will not be indexed.
332 var blacklist_exts = new Array[String] is writable
333 end
334
335 # A Document to add in a VSMIndex
336 class Document
337
338 # Document title
339 var title: String
340
341 # Document URI
342 var uri: String
343
344 # Count of all terms found in the document
345 #
346 # Used to compute the document `terms_frequency`.
347 var terms_count: Vector
348
349 # Frequency of each term found in the document
350 #
351 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
352 var terms_frequency: Vector is lazy do
353 var all_terms = 0.0
354 for t, c in terms_count do all_terms += c
355
356 var vector = new Vector
357 for t, c in terms_count do
358 vector[t] = c / all_terms
359 end
360 return vector
361 end
362
363 # Term frequency–Inverse document frequency for each term
364 #
365 # A high weight in tf–idf is reached by a high term frequency
366 # (in the given document) and a low document frequency of the term in the
367 # whole collection of documents
368 var tfidf = new Vector
369
370 redef fun to_s do return "{title}"
371 end
372
373 # A match to a `request` in an `Index`
374 class IndexMatch[DOC: Document]
375 super Comparable
376
377 # Document matching the `request_vector`
378 var document: DOC
379
380 # Similarity between the `request` and the `doc`.
381 #
382 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
383 # means perfect similarity.
384 var similarity: Float
385
386 redef fun to_s do return "{document} ({similarity})"
387 end
388
389 # Sort matches by similarity
390 class IndexMatchSorter
391 super DefaultComparator
392
393 redef type COMPARED: IndexMatch[Document]
394
395 redef fun compare(a, b) do
396 return b.similarity <=> a.similarity
397 end
398 end