X-Git-Url: http://nitlanguage.org diff --git a/lib/vsm/vsm.nit b/lib/vsm/vsm.nit index 4a64f99..ad3d928 100644 --- a/lib/vsm/vsm.nit +++ b/lib/vsm/vsm.nit @@ -77,6 +77,17 @@ class Vector return super end + # Increment value for `obj` term + # + # If the term isn't already in the vector, the new value is 1.0. + fun inc(obj: nullable Object) do + if has_key(obj) then + self[obj] += 1.0 + else + self[obj] = 1.0 + end + end + # The norm of the vector. # # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt` @@ -112,10 +123,18 @@ end # Documents can then be matched to query vectors. class VSMIndex + # Kind of documents stored in this index + # + # Clients can redefine this type to specialize the index. + type DOC: Document + # Documents index + var documents = new HashSet[DOC] + + # Inversed index # - # TODO use a more efficient representation. - var documents = new HashSet[Document] + # Link documents to existing terms. + var inversed_index = new HashMap[nullable Object, Array[DOC]] # Count for all terms in all indexed documents # @@ -137,12 +156,18 @@ class VSMIndex # # Returns an `IndexMatch` for each indexed document. # Results are ordered by descending similarity. - fun match_vector(query: Vector): Array[IndexMatch] do - var matches = new Array[IndexMatch] + fun match_vector(query: Vector): Array[IndexMatch[DOC]] do + var documents = new HashSet[DOC] + for term, count in query do + if inversed_index.has_key(term) then + documents.add_all inversed_index[term] + end + end + var matches = new Array[IndexMatch[DOC]] for doc in documents do var sim = query.cosine_similarity(doc.tfidf) if sim == 0.0 then continue - matches.add new IndexMatch(doc, sim) + matches.add new IndexMatch[DOC](doc, sim) end sorter.sort(matches) return matches @@ -156,13 +181,13 @@ class VSMIndex # # When processing batch documents, use `auto_update = false` to disable # the auto update of the index. - fun index_document(doc: Document, auto_update: nullable Bool) do + fun index_document(doc: DOC, auto_update: nullable Bool) do for term, count in doc.terms_count do - if not terms_doc_count.has_key(term) then - terms_doc_count[term] = 1.0 - else - terms_doc_count[term] += 1.0 + terms_doc_count.inc(term) + if not inversed_index.has_key(term) then + inversed_index[term] = new Array[DOC] end + inversed_index[term].add doc end documents.add doc if auto_update == null or auto_update then update_index @@ -196,7 +221,7 @@ class StringIndex # Return the Document created. # # See `index_document`. - fun index_string(title, uri, string: String, auto_update: nullable Bool): Document do + fun index_string(title, uri, string: String, auto_update: nullable Bool): DOC do var vector = parse_string(string) var doc = new Document(title, uri, vector) index_document(doc, auto_update) @@ -206,7 +231,7 @@ class StringIndex # Match the `query` string against all indexed documents # # See `match_vector`. - fun match_string(query: String): Array[IndexMatch] do + fun match_string(query: String): Array[IndexMatch[DOC]] do var vector = parse_string(query) var doc = new Document("", "", vector) return match_vector(doc.terms_frequency) @@ -221,12 +246,7 @@ class StringIndex loop var token = reader.read_word if token == "" then break - - if not vector.has_key(token) then - vector[token] = 1.0 - else - vector[token] += 1.0 - end + vector.inc(token) end return vector end @@ -241,7 +261,7 @@ class FileIndex # Return the created document or null if `path` is not accepted by `accept_file`. # # See `index_document`. - fun index_file(path: String, auto_update: nullable Bool): nullable Document do + fun index_file(path: String, auto_update: nullable Bool): nullable DOC do if not accept_file(path) then return null var vector = parse_file(path) var doc = new Document(path, path, vector) @@ -347,17 +367,17 @@ class Document # A high weight in tf–idf is reached by a high term frequency # (in the given document) and a low document frequency of the term in the # whole collection of documents - var tfidf = new Vector + var tfidf: Vector = terms_count is lazy redef fun to_s do return "{title}" end # A match to a `request` in an `Index` -class IndexMatch +class IndexMatch[DOC: Document] super Comparable # Document matching the `request_vector` - var document: Document + var document: DOC # Similarity between the `request` and the `doc`. # @@ -372,7 +392,7 @@ end class IndexMatchSorter super DefaultComparator - redef type COMPARED: IndexMatch + redef type COMPARED: IndexMatch[Document] redef fun compare(a, b) do return b.similarity <=> a.similarity