6cd1e9280d024c939c09df5aa01b5d5a63996e23
1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # A n-dimensions vector
29 # *n-dimensions* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
71 if cos
.is_nan
then return 0.0
76 if not has_key
(k
) then return 0.0
80 # The norm of the vector.
82 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
90 # assert v.norm.is_approx(2.0, 0.001)
96 # assert v.norm.is_approx(3.742, 0.001)
100 for v
in self.values
do sum
+= v
.pow
(2.0)
105 return "[{join(", ", ":")}]"
109 # A Document index based on VSM
111 # Using VSMIndex you can index documents associated with their vector.
112 # Documents can then be matched to query vectors.
115 # Kind of documents stored in this index
117 # Clients can redefine this type to specialize the index.
121 var documents
= new HashSet[DOC]
125 # Link documents to existing terms.
126 var inversed_index
= new HashMap[nullable Object, Array[DOC]]
128 # Count for all terms in all indexed documents
130 # Used to compute the `inverse_doc_frequency`.
131 var terms_doc_count
= new Vector
133 # Inverse document frequency
135 # The inverse document frequency is a measure of how much information a term
136 # provides, that is, whether the term is common or rare across all documents.
137 var inverse_doc_frequency
= new Vector
139 # Used to sort matches
142 var sorter
= new IndexMatchSorter
144 # Match `query` vector to all index document vectors
146 # Returns an `IndexMatch` for each indexed document.
147 # Results are ordered by descending similarity.
148 fun match_vector
(query
: Vector): Array[IndexMatch[DOC]] do
149 var documents
= new HashSet[DOC]
150 for term
, count
in query
do
151 if inversed_index
.has_key
(term
) then
152 documents
.add_all inversed_index
[term
]
155 var matches
= new Array[IndexMatch[DOC]]
156 for doc
in documents
do
157 var sim
= query
.cosine_similarity
(doc
.tfidf
)
158 if sim
== 0.0 then continue
159 matches
.add
new IndexMatch[DOC](doc
, sim
)
167 # With each new document, the `inverse_doc_frequency` must be updated.
168 # By default, the method `update_index` is called after each call to
171 # When processing batch documents, use `auto_update = false` to disable
172 # the auto update of the index.
173 fun index_document
(doc
: DOC, auto_update
: nullable Bool) do
174 for term
, count
in doc
.terms_count
do
175 if not terms_doc_count
.has_key
(term
) then
176 terms_doc_count
[term
] = 1.0
178 terms_doc_count
[term
] += 1.0
180 if not inversed_index
.has_key
(term
) then
181 inversed_index
[term
] = new Array[DOC]
183 inversed_index
[term
].add doc
186 if auto_update
== null or auto_update
then update_index
191 # Recompute the `inverse_doc_frequency` values.
192 # Must be called manually after indexing new document with the option
193 # `auto_update = false`.
195 for doc
in documents
do
196 for term
, ccount
in doc
.terms_count
do
197 inverse_doc_frequency
[term
] = (documents
.length
.to_f
/ terms_doc_count
[term
]).log
200 for doc
in documents
do
201 for term
, freq
in doc
.terms_frequency
do
202 doc
.tfidf
[term
] = freq
* inverse_doc_frequency
[term
]
208 # A VSM index to store strings
212 # Index a new Document from `title`, `uri` and string `string`.
214 # Return the Document created.
216 # See `index_document`.
217 fun index_string
(title
, uri
, string
: String, auto_update
: nullable Bool): DOC do
218 var vector
= parse_string
(string
)
219 var doc
= new Document(title
, uri
, vector
)
220 index_document
(doc
, auto_update
)
224 # Match the `query` string against all indexed documents
226 # See `match_vector`.
227 fun match_string
(query
: String): Array[IndexMatch[DOC]] do
228 var vector
= parse_string
(query
)
229 var doc
= new Document("", "", vector
)
230 return match_vector
(doc
.terms_frequency
)
233 # Parse the `string` as a Vector
235 # Returns a vector containing the terms of `string`.
236 fun parse_string
(string
: String): Vector do
237 var reader
= new StringReader(string
)
238 var vector
= new Vector
240 var token
= reader
.read_word
241 if token
== "" then break
243 if not vector
.has_key
(token
) then
253 # A VSMIndex to index files
257 # Index a file from its `path`.
259 # Return the created document or null if `path` is not accepted by `accept_file`.
261 # See `index_document`.
262 fun index_file
(path
: String, auto_update
: nullable Bool): nullable DOC do
263 if not accept_file
(path
) then return null
264 var vector
= parse_file
(path
)
265 var doc
= new Document(path
, path
, vector
)
266 index_document
(doc
, auto_update
)
270 # Index multiple files
272 # The recursive method `index_dir` will be called for each directory found
276 fun index_files
(paths
: Collection[String], auto_update
: nullable Bool) do
278 if path
.to_path
.is_dir
then
279 index_dir
(path
, false)
281 index_file
(path
, false)
284 if auto_update
!= null and auto_update
then update_index
287 # Index all files in `dir` recursively
290 fun index_dir
(dir
: String, auto_update
: nullable Bool) do
291 if not dir
.to_path
.is_dir
then return
292 for file
in dir
.files
do
293 var path
= dir
/ file
294 if path
.to_path
.is_dir
then
295 index_dir
(path
, false)
297 index_file
(path
, false)
300 if auto_update
!= null and auto_update
then update_index
303 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
304 fun accept_file
(path
: String): Bool do
305 var ext
= path
.file_extension
308 if blacklist_exts
.has
(ext
) then return false
309 if whitelist_exts
.not_empty
and not whitelist_exts
.has
(ext
) then return false
311 return whitelist_exts
.is_empty
314 # Parse the `file` content as a Vector
316 # See `parse_string`.
317 fun parse_file
(file
: String): Vector do
318 return parse_string
(file
.to_path
.read_all
)
321 # File extensions white list
323 # If not empty, only files with these extensions will be indexed.
325 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
326 # blacklist will prevail and the file will be ignored.
327 var whitelist_exts
= new Array[String] is writable
329 # File extensions black list
331 # Files with these extensions will not be indexed.
332 var blacklist_exts
= new Array[String] is writable
335 # A Document to add in a VSMIndex
344 # Count of all terms found in the document
346 # Used to compute the document `terms_frequency`.
347 var terms_count
: Vector
349 # Frequency of each term found in the document
351 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
352 var terms_frequency
: Vector is lazy
do
354 for t
, c
in terms_count
do all_terms
+= c
356 var vector
= new Vector
357 for t
, c
in terms_count
do
358 vector
[t
] = c
/ all_terms
363 # Term frequency–Inverse document frequency for each term
365 # A high weight in tf–idf is reached by a high term frequency
366 # (in the given document) and a low document frequency of the term in the
367 # whole collection of documents
368 var tfidf
= new Vector
370 redef fun to_s
do return "{title}"
373 # A match to a `request` in an `Index`
374 class IndexMatch[DOC: Document]
377 # Document matching the `request_vector`
380 # Similarity between the `request` and the `doc`.
382 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
383 # means perfect similarity.
384 var similarity
: Float
386 redef fun to_s
do return "{document} ({similarity})"
389 # Sort matches by similarity
390 class IndexMatchSorter
391 super DefaultComparator
393 redef type COMPARED: IndexMatch[Document]
395 redef fun compare
(a
, b
) do
396 return b
.similarity
<=> a
.similarity