nitlanguage
/
nit.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
lib/vsm: default tfidf values are extracted from terms frequencies
[nit.git]
/
lib
/
vsm
/
vsm.nit
diff --git
a/lib/vsm/vsm.nit
b/lib/vsm/vsm.nit
index
d589e89
..
ad3d928
100644
(file)
--- a/
lib/vsm/vsm.nit
+++ b/
lib/vsm/vsm.nit
@@
-77,6
+77,17
@@
class Vector
return super
end
return super
end
+ # Increment value for `obj` term
+ #
+ # If the term isn't already in the vector, the new value is 1.0.
+ fun inc(obj: nullable Object) do
+ if has_key(obj) then
+ self[obj] += 1.0
+ else
+ self[obj] = 1.0
+ end
+ end
+
# The norm of the vector.
#
# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
# The norm of the vector.
#
# `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
@@
-120,6
+131,11
@@
class VSMIndex
# Documents index
var documents = new HashSet[DOC]
# Documents index
var documents = new HashSet[DOC]
+ # Inversed index
+ #
+ # Link documents to existing terms.
+ var inversed_index = new HashMap[nullable Object, Array[DOC]]
+
# Count for all terms in all indexed documents
#
# Used to compute the `inverse_doc_frequency`.
# Count for all terms in all indexed documents
#
# Used to compute the `inverse_doc_frequency`.
@@
-141,6
+157,12
@@
class VSMIndex
# Returns an `IndexMatch` for each indexed document.
# Results are ordered by descending similarity.
fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
# Returns an `IndexMatch` for each indexed document.
# Results are ordered by descending similarity.
fun match_vector(query: Vector): Array[IndexMatch[DOC]] do
+ var documents = new HashSet[DOC]
+ for term, count in query do
+ if inversed_index.has_key(term) then
+ documents.add_all inversed_index[term]
+ end
+ end
var matches = new Array[IndexMatch[DOC]]
for doc in documents do
var sim = query.cosine_similarity(doc.tfidf)
var matches = new Array[IndexMatch[DOC]]
for doc in documents do
var sim = query.cosine_similarity(doc.tfidf)
@@
-161,11
+183,11
@@
class VSMIndex
# the auto update of the index.
fun index_document(doc: DOC, auto_update: nullable Bool) do
for term, count in doc.terms_count do
# the auto update of the index.
fun index_document(doc: DOC, auto_update: nullable Bool) do
for term, count in doc.terms_count do
- if not terms_doc_count.has_key(term) then
- terms_doc_count[term] = 1.0
- else
- terms_doc_count[term] += 1.0
+ terms_doc_count.inc(term)
+ if not inversed_index.has_key(term) then
+ inversed_index[term] = new Array[DOC]
end
end
+ inversed_index[term].add doc
end
documents.add doc
if auto_update == null or auto_update then update_index
end
documents.add doc
if auto_update == null or auto_update then update_index
@@
-224,12
+246,7
@@
class StringIndex
loop
var token = reader.read_word
if token == "" then break
loop
var token = reader.read_word
if token == "" then break
-
- if not vector.has_key(token) then
- vector[token] = 1.0
- else
- vector[token] += 1.0
- end
+ vector.inc(token)
end
return vector
end
end
return vector
end
@@
-350,7
+367,7
@@
class Document
# A high weight in tf–idf is reached by a high term frequency
# (in the given document) and a low document frequency of the term in the
# whole collection of documents
# A high weight in tf–idf is reached by a high term frequency
# (in the given document) and a low document frequency of the term in the
# whole collection of documents
- var tfidf = new Vector
+ var tfidf: Vector = terms_count is lazy
redef fun to_s do return "{title}"
end
redef fun to_s do return "{title}"
end