lib/vsm/vsm.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 #     http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14
  15 # Vector Space Model
  16 #
  17 # Vector Space Model (VSM) is an algebraic model for representing text documents
  18 # (and any objects, in general) as vectors of identifiers, such as, for example,
  19 # index terms.
  20 #
  21 # It is used in information filtering, information retrieval, indexing and
  22 # relevancy rankings.
  23 module vsm
  24
  25 import counter
  26
  27 # A n-dimensions vector
  28 #
  29 # *n-dimensions* vectors are used to represent a text document or an object.
  30 class Vector
  31         super HashMap[nullable Object, Float]
  32
  33         # Cosine similarity of `self` and `other`.
  34         #
  35         # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
  36         # two vectors are orthogonal and 1.0 means that they are identical.
  37         #
  38         # ~~~
  39         # var v1 = new Vector
  40         # v1["x"] = 1.0
  41         # v1["y"] = 2.0
  42         # v1["z"] = 3.0
  43         #
  44         # var v2 = new Vector
  45         # v2["x"] = 1.0
  46         # v2["y"] = 2.0
  47         # v2["z"] = 3.0
  48         #
  49         # var v3 = new Vector
  50         # v3["a"] = 1.0
  51         # v3["b"] = 2.0
  52         # v3["c"] = 3.0
  53         #
  54         # print v1.cosine_similarity(v2)
  55         # assert v1.cosine_similarity(v2) == 1.0
  56         # print v1.cosine_similarity(v3)
  57         # assert v1.cosine_similarity(v3) == 0.0
  58         # ~~~
  59         fun cosine_similarity(other: SELF): Float do
  60                 # Collect terms
  61                 var terms = new HashSet[nullable Object]
  62                 for k in self.keys do terms.add k
  63                 for k in other.keys do terms.add k
  64
  65                 # Get dot product of two vectors
  66                 var dot = 0.0
  67                 for term in terms do
  68                         dot += self.get_or_default(term, 0.0) * other.get_or_default(term, 0.0)
  69                 end
  70                 var cos = dot.to_f / (self.norm * other.norm)
  71                 if cos.is_nan then return 0.0
  72                 return cos
  73         end
  74
  75         redef fun [](k) do
  76                 if not has_key(k) then return 0.0
  77                 return super
  78         end
  79
  80         # Increment value for `obj` term
  81         #
  82         # If the term isn't already in the vector, the new value is 1.0.
  83         fun inc(obj: nullable Object) do
  84                 if has_key(obj) then
  85                         self[obj] += 1.0
  86                 else
  87                         self[obj] = 1.0
  88                 end
  89         end
  90
  91         # The norm of the vector.
  92         #
  93         # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
  94         #
  95         # ~~~
  96         # var v = new Vector
  97         # v["x"] = 1.0
  98         # v["y"] = 1.0
  99         # v["z"] = 1.0
 100         # v["t"] = 1.0
 101         # assert v.norm.is_approx(2.0, 0.001)
 102         #
 103         # v["x"] = 1.0
 104         # v["y"] = 2.0
 105         # v["z"] = 3.0
 106         # v["t"] = 0.0
 107         # assert v.norm.is_approx(3.742, 0.001)
 108         # ~~~
 109         fun norm: Float do
 110                 var sum = 0.0
 111                 for v in self.values do sum += v.pow(2.0)
 112                 return sum.to_f.sqrt
 113         end
 114
 115         redef fun to_s do
 116                 return "[{join(", ", ":")}]"
 117         end
 118 end
 119
 120 # A Document index based on VSM
 121 #
 122 # Using VSMIndex you can index documents associated with their vector.
 123 # Documents can then be matched to query vectors.
 124 class VSMIndex
 125
 126         # Kind of documents stored in this index
 127         #
 128         # Clients can redefine this type to specialize the index.
 129         type DOC: Document
 130
 131         # Documents index
 132         var documents = new HashSet[DOC]
 133
 134         # Inversed index
 135         #
 136         # Link documents to existing terms.
 137         var inversed_index = new HashMap[nullable Object, Array[DOC]]
 138
 139         # Count for all terms in all indexed documents
 140         #
 141         # Used to compute the `inverse_doc_frequency`.
 142         var terms_doc_count = new Vector
 143
 144         # Inverse document frequency
 145         #
 146         # The inverse document frequency is a measure of how much information a term
 147         # provides, that is, whether the term is common or rare across all documents.
 148         var inverse_doc_frequency = new Vector
 149
 150         # Used to sort matches
 151         #
 152         # See `IndexMatch`.
 153         var sorter = new IndexMatchSorter
 154
 155         # Match `query` vector to all index document vectors
 156         #
 157         # Returns an `IndexMatch` for each indexed document.
 158         # Results are ordered by descending similarity.
 159         fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
 160                 var documents = new HashSet[DOC]
 161                 for term, count in query do
 162                         if inversed_index.has_key(term) then
 163                                 documents.add_all inversed_index[term]
 164                         end
 165                 end
 166                 var matches = new Array[IndexMatch[DOC]]
 167                 for doc in documents do
 168                         var sim = query.cosine_similarity(doc.tfidf)
 169                         if sim == 0.0 then continue
 170                         matches.add new IndexMatch[DOC](doc, sim)
 171                 end
 172                 sorter.sort(matches)
 173                 return matches
 174         end
 175
 176         # Index a document
 177         #
 178         # With each new document, the `inverse_doc_frequency` must be updated.
 179         # By default, the method `update_index` is called after each call to
 180         # `index_document`.
 181         #
 182         # When processing batch documents, use `auto_update = false` to disable
 183         # the auto update of the index.
 184         fun index_document(doc: DOC, auto_update: nullable Bool) do
 185                 for term, count in doc.terms_count do
 186                         terms_doc_count.inc(term)
 187                         if not inversed_index.has_key(term) then
 188                                 inversed_index[term] = new Array[DOC]
 189                         end
 190                         inversed_index[term].add doc
 191                 end
 192                 documents.add doc
 193                 if auto_update == null or auto_update then update_index
 194         end
 195
 196         # Update the index
 197         #
 198         # Recompute the `inverse_doc_frequency` values.
 199         # Must be called manually after indexing new document with the option
 200         # `auto_update = false`.
 201         fun update_index do
 202                 for doc in documents do
 203                         for term, ccount in doc.terms_count do
 204                                 inverse_doc_frequency[term] = (documents.length.to_f / terms_doc_count[term]).log
 205                         end
 206                 end
 207                 for doc in documents do
 208                         for term, freq in doc.terms_frequency do
 209                                 doc.tfidf[term] = freq * inverse_doc_frequency[term]
 210                         end
 211                 end
 212         end
 213 end
 214
 215 # A VSM index to store strings
 216 class StringIndex
 217         super VSMIndex
 218
 219         # Index a new Document from `title`, `uri` and string `string`.
 220         #
 221         # Return the Document created.
 222         #
 223         # See `index_document`.
 224         fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do
 225                 var vector = parse_string(string)
 226                 var doc = new Document(title, uri, vector)
 227                 index_document(doc, auto_update)
 228                 return doc
 229         end
 230
 231         # Match the `query` string against all indexed documents
 232         #
 233         # See `match_vector`.
 234         fun match_string(query: String): Array[IndexMatch[DOC]] do
 235                 var vector = parse_string(query)
 236                 var doc = new Document("", "", vector)
 237                 return match_vector(doc.terms_frequency)
 238         end
 239
 240         # Parse the `string` as a Vector
 241         #
 242         # Returns a vector containing the terms of `string`.
 243         fun parse_string(string: String): Vector do
 244                 var reader = new StringReader(string)
 245                 var vector = new Vector
 246                 loop
 247                         var token = reader.read_word
 248                         if token == "" then break
 249                         vector.inc(token)
 250                 end
 251                 return vector
 252         end
 253 end
 254
 255 # A VSMIndex to index files
 256 class FileIndex
 257         super StringIndex
 258
 259         # Index a file from its `path`.
 260         #
 261         # Return the created document or null if `path` is not accepted by `accept_file`.
 262         #
 263         # See `index_document`.
 264         fun index_file(path: String, auto_update: nullable Bool): nullable DOC do
 265                 if not accept_file(path) then return null
 266                 var vector = parse_file(path)
 267                 var doc = new Document(path, path, vector)
 268                 index_document(doc, auto_update)
 269                 return doc
 270         end
 271
 272         # Index multiple files
 273         #
 274         # The recursive method `index_dir` will be called for each directory found
 275         # in `paths`.
 276         #
 277         # See `index_file`
 278         fun index_files(paths: Collection[String], auto_update: nullable Bool) do
 279                 for path in paths do
 280                         if path.to_path.is_dir then
 281                                 index_dir(path, false)
 282                         else
 283                                 index_file(path, false)
 284                         end
 285                 end
 286                 if auto_update != null and auto_update then update_index
 287         end
 288
 289         # Index all files in `dir` recursively
 290         #
 291         # See `index_file`.
 292         fun index_dir(dir: String, auto_update: nullable Bool) do
 293                 if not dir.to_path.is_dir then return
 294                 for file in dir.files do
 295                         var path = dir / file
 296                         if path.to_path.is_dir then
 297                                 index_dir(path, false)
 298                         else
 299                                 index_file(path, false)
 300                         end
 301                 end
 302                 if auto_update != null and auto_update then update_index
 303         end
 304
 305         # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
 306         fun accept_file(path: String): Bool do
 307                 var ext = path.file_extension
 308                 if ext != null then
 309                         ext = ext.to_lower
 310                         if blacklist_exts.has(ext) then return false
 311                         if whitelist_exts.not_empty and not whitelist_exts.has(ext) then return false
 312                 end
 313                 return whitelist_exts.is_empty
 314         end
 315
 316         # Parse the `file` content as a Vector
 317         #
 318         # See `parse_string`.
 319         fun parse_file(file: String): Vector do
 320                 return parse_string(file.to_path.read_all)
 321         end
 322
 323         # File extensions white list
 324         #
 325         # If not empty, only files with these extensions will be indexed.
 326         #
 327         # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
 328         # blacklist will prevail and the file will be ignored.
 329         var whitelist_exts = new Array[String] is writable
 330
 331         # File extensions black list
 332         #
 333         # Files with these extensions will not be indexed.
 334         var blacklist_exts = new Array[String] is writable
 335 end
 336
 337 # A Document to add in a VSMIndex
 338 class Document
 339
 340         # Document title
 341         var title: String
 342
 343         # Document URI
 344         var uri: String
 345
 346         # Count of all terms found in the document
 347         #
 348         # Used to compute the document `terms_frequency`.
 349         var terms_count: Vector
 350
 351         # Frequency of each term found in the document
 352         #
 353         # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
 354         var terms_frequency: Vector is lazy do
 355                 var all_terms = 0.0
 356                 for t, c in terms_count do all_terms += c
 357
 358                 var vector = new Vector
 359                 for t, c in terms_count do
 360                         vector[t] = c / all_terms
 361                 end
 362                 return vector
 363         end
 364
 365         # Term frequency–Inverse document frequency for each term
 366         #
 367         # A high weight in tf–idf is reached by a high term frequency
 368         # (in the given document) and a low document frequency of the term in the
 369         # whole collection of documents
 370         var tfidf: Vector = terms_count is lazy
 371
 372         redef fun to_s do return "{title}"
 373 end
 374
 375 # A match to a `request` in an `Index`
 376 class IndexMatch[DOC: Document]
 377         super Comparable
 378
 379         # Document matching the `request_vector`
 380         var document: DOC
 381
 382         # Similarity between the `request` and the `doc`.
 383         #
 384         # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
 385         # means perfect similarity.
 386         var similarity: Float
 387
 388         redef fun to_s do return "{document} ({similarity})"
 389 end
 390
 391 # Sort matches by similarity
 392 class IndexMatchSorter
 393         super DefaultComparator
 394
 395         redef type COMPARED: IndexMatch[Document]
 396
 397         redef fun compare(a, b) do
 398                 return b.similarity <=> a.similarity
 399         end
 400 end