d589e89e5a6fe205d4c9f8884bb2a161a43aefc5
1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # A n-dimensions vector
29 # *n-dimensions* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
71 if cos
.is_nan
then return 0.0
76 if not has_key
(k
) then return 0.0
80 # The norm of the vector.
82 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
90 # assert v.norm.is_approx(2.0, 0.001)
96 # assert v.norm.is_approx(3.742, 0.001)
100 for v
in self.values
do sum
+= v
.pow
(2.0)
105 return "[{join(", ", ":")}]"
109 # A Document index based on VSM
111 # Using VSMIndex you can index documents associated with their vector.
112 # Documents can then be matched to query vectors.
115 # Kind of documents stored in this index
117 # Clients can redefine this type to specialize the index.
121 var documents
= new HashSet[DOC]
123 # Count for all terms in all indexed documents
125 # Used to compute the `inverse_doc_frequency`.
126 var terms_doc_count
= new Vector
128 # Inverse document frequency
130 # The inverse document frequency is a measure of how much information a term
131 # provides, that is, whether the term is common or rare across all documents.
132 var inverse_doc_frequency
= new Vector
134 # Used to sort matches
137 var sorter
= new IndexMatchSorter
139 # Match `query` vector to all index document vectors
141 # Returns an `IndexMatch` for each indexed document.
142 # Results are ordered by descending similarity.
143 fun match_vector
(query
: Vector): Array[IndexMatch[DOC]] do
144 var matches
= new Array[IndexMatch[DOC]]
145 for doc
in documents
do
146 var sim
= query
.cosine_similarity
(doc
.tfidf
)
147 if sim
== 0.0 then continue
148 matches
.add
new IndexMatch[DOC](doc
, sim
)
156 # With each new document, the `inverse_doc_frequency` must be updated.
157 # By default, the method `update_index` is called after each call to
160 # When processing batch documents, use `auto_update = false` to disable
161 # the auto update of the index.
162 fun index_document
(doc
: DOC, auto_update
: nullable Bool) do
163 for term
, count
in doc
.terms_count
do
164 if not terms_doc_count
.has_key
(term
) then
165 terms_doc_count
[term
] = 1.0
167 terms_doc_count
[term
] += 1.0
171 if auto_update
== null or auto_update
then update_index
176 # Recompute the `inverse_doc_frequency` values.
177 # Must be called manually after indexing new document with the option
178 # `auto_update = false`.
180 for doc
in documents
do
181 for term
, ccount
in doc
.terms_count
do
182 inverse_doc_frequency
[term
] = (documents
.length
.to_f
/ terms_doc_count
[term
]).log
185 for doc
in documents
do
186 for term
, freq
in doc
.terms_frequency
do
187 doc
.tfidf
[term
] = freq
* inverse_doc_frequency
[term
]
193 # A VSM index to store strings
197 # Index a new Document from `title`, `uri` and string `string`.
199 # Return the Document created.
201 # See `index_document`.
202 fun index_string
(title
, uri
, string
: String, auto_update
: nullable Bool): DOC do
203 var vector
= parse_string
(string
)
204 var doc
= new Document(title
, uri
, vector
)
205 index_document
(doc
, auto_update
)
209 # Match the `query` string against all indexed documents
211 # See `match_vector`.
212 fun match_string
(query
: String): Array[IndexMatch[DOC]] do
213 var vector
= parse_string
(query
)
214 var doc
= new Document("", "", vector
)
215 return match_vector
(doc
.terms_frequency
)
218 # Parse the `string` as a Vector
220 # Returns a vector containing the terms of `string`.
221 fun parse_string
(string
: String): Vector do
222 var reader
= new StringReader(string
)
223 var vector
= new Vector
225 var token
= reader
.read_word
226 if token
== "" then break
228 if not vector
.has_key
(token
) then
238 # A VSMIndex to index files
242 # Index a file from its `path`.
244 # Return the created document or null if `path` is not accepted by `accept_file`.
246 # See `index_document`.
247 fun index_file
(path
: String, auto_update
: nullable Bool): nullable DOC do
248 if not accept_file
(path
) then return null
249 var vector
= parse_file
(path
)
250 var doc
= new Document(path
, path
, vector
)
251 index_document
(doc
, auto_update
)
255 # Index multiple files
257 # The recursive method `index_dir` will be called for each directory found
261 fun index_files
(paths
: Collection[String], auto_update
: nullable Bool) do
263 if path
.to_path
.is_dir
then
264 index_dir
(path
, false)
266 index_file
(path
, false)
269 if auto_update
!= null and auto_update
then update_index
272 # Index all files in `dir` recursively
275 fun index_dir
(dir
: String, auto_update
: nullable Bool) do
276 if not dir
.to_path
.is_dir
then return
277 for file
in dir
.files
do
278 var path
= dir
/ file
279 if path
.to_path
.is_dir
then
280 index_dir
(path
, false)
282 index_file
(path
, false)
285 if auto_update
!= null and auto_update
then update_index
288 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
289 fun accept_file
(path
: String): Bool do
290 var ext
= path
.file_extension
293 if blacklist_exts
.has
(ext
) then return false
294 if whitelist_exts
.not_empty
and not whitelist_exts
.has
(ext
) then return false
296 return whitelist_exts
.is_empty
299 # Parse the `file` content as a Vector
301 # See `parse_string`.
302 fun parse_file
(file
: String): Vector do
303 return parse_string
(file
.to_path
.read_all
)
306 # File extensions white list
308 # If not empty, only files with these extensions will be indexed.
310 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
311 # blacklist will prevail and the file will be ignored.
312 var whitelist_exts
= new Array[String] is writable
314 # File extensions black list
316 # Files with these extensions will not be indexed.
317 var blacklist_exts
= new Array[String] is writable
320 # A Document to add in a VSMIndex
329 # Count of all terms found in the document
331 # Used to compute the document `terms_frequency`.
332 var terms_count
: Vector
334 # Frequency of each term found in the document
336 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
337 var terms_frequency
: Vector is lazy
do
339 for t
, c
in terms_count
do all_terms
+= c
341 var vector
= new Vector
342 for t
, c
in terms_count
do
343 vector
[t
] = c
/ all_terms
348 # Term frequency–Inverse document frequency for each term
350 # A high weight in tf–idf is reached by a high term frequency
351 # (in the given document) and a low document frequency of the term in the
352 # whole collection of documents
353 var tfidf
= new Vector
355 redef fun to_s
do return "{title}"
358 # A match to a `request` in an `Index`
359 class IndexMatch[DOC: Document]
362 # Document matching the `request_vector`
365 # Similarity between the `request` and the `doc`.
367 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
368 # means perfect similarity.
369 var similarity
: Float
371 redef fun to_s
do return "{document} ({similarity})"
374 # Sort matches by similarity
375 class IndexMatchSorter
376 super DefaultComparator
378 redef type COMPARED: IndexMatch[Document]
380 redef fun compare
(a
, b
) do
381 return b
.similarity
<=> a
.similarity