1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # A n-dimensions vector
29 # *n-dimensions* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
71 if cos
.is_nan
then return 0.0
76 if not has_key
(k
) then return 0.0
80 # Increment value for `obj` term
82 # If the term isn't already in the vector, the new value is 1.0.
83 fun inc
(obj
: nullable Object) do
91 # The norm of the vector.
93 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
101 # assert v.norm.is_approx(2.0, 0.001)
107 # assert v.norm.is_approx(3.742, 0.001)
111 for v
in self.values
do sum
+= v
.pow
(2.0)
116 return "[{join(", ", ":")}]"
120 # A Document index based on VSM
122 # Using VSMIndex you can index documents associated with their vector.
123 # Documents can then be matched to query vectors.
126 # Kind of documents stored in this index
128 # Clients can redefine this type to specialize the index.
132 var documents
= new HashSet[DOC]
136 # Link documents to existing terms.
137 var inversed_index
= new HashMap[nullable Object, Array[DOC]]
139 # Count for all terms in all indexed documents
141 # Used to compute the `inverse_doc_frequency`.
142 var terms_doc_count
= new Vector
144 # Inverse document frequency
146 # The inverse document frequency is a measure of how much information a term
147 # provides, that is, whether the term is common or rare across all documents.
148 var inverse_doc_frequency
= new Vector
150 # Used to sort matches
153 var sorter
= new IndexMatchSorter
155 # Match `query` vector to all index document vectors
157 # Returns an `IndexMatch` for each indexed document.
158 # Results are ordered by descending similarity.
159 fun match_vector
(query
: Vector): Array[IndexMatch[DOC]] do
160 var documents
= new HashSet[DOC]
161 for term
, count
in query
do
162 if inversed_index
.has_key
(term
) then
163 documents
.add_all inversed_index
[term
]
166 var matches
= new Array[IndexMatch[DOC]]
167 for doc
in documents
do
168 var sim
= query
.cosine_similarity
(doc
.tfidf
)
169 if sim
== 0.0 then continue
170 matches
.add
new IndexMatch[DOC](doc
, sim
)
178 # With each new document, the `inverse_doc_frequency` must be updated.
179 # By default, the method `update_index` is called after each call to
182 # When processing batch documents, use `auto_update = false` to disable
183 # the auto update of the index.
184 fun index_document
(doc
: DOC, auto_update
: nullable Bool) do
185 for term
, count
in doc
.terms_count
do
186 terms_doc_count
.inc
(term
)
187 if not inversed_index
.has_key
(term
) then
188 inversed_index
[term
] = new Array[DOC]
190 inversed_index
[term
].add doc
193 if auto_update
== null or auto_update
then update_index
198 # Recompute the `inverse_doc_frequency` values.
199 # Must be called manually after indexing new document with the option
200 # `auto_update = false`.
202 for doc
in documents
do
203 for term
, ccount
in doc
.terms_count
do
204 inverse_doc_frequency
[term
] = (documents
.length
.to_f
/ terms_doc_count
[term
]).log
207 for doc
in documents
do
208 for term
, freq
in doc
.terms_frequency
do
209 doc
.tfidf
[term
] = freq
* inverse_doc_frequency
[term
]
215 # A VSM index to store strings
219 # Index a new Document from `title`, `uri` and string `string`.
221 # Return the Document created.
223 # See `index_document`.
224 fun index_string
(title
, uri
, string
: String, auto_update
: nullable Bool): DOC do
225 var vector
= parse_string
(string
)
226 var doc
= new Document(title
, uri
, vector
)
227 index_document
(doc
, auto_update
)
231 # Match the `query` string against all indexed documents
233 # See `match_vector`.
234 fun match_string
(query
: String): Array[IndexMatch[DOC]] do
235 var vector
= parse_string
(query
)
236 var doc
= new Document("", "", vector
)
237 return match_vector
(doc
.terms_frequency
)
240 # Parse the `string` as a Vector
242 # Returns a vector containing the terms of `string`.
243 fun parse_string
(string
: String): Vector do
244 var reader
= new StringReader(string
)
245 var vector
= new Vector
247 var token
= reader
.read_word
248 if token
== "" then break
255 # A VSMIndex to index files
259 # Index a file from its `path`.
261 # Return the created document or null if `path` is not accepted by `accept_file`.
263 # See `index_document`.
264 fun index_file
(path
: String, auto_update
: nullable Bool): nullable DOC do
265 if not accept_file
(path
) then return null
266 var vector
= parse_file
(path
)
267 var doc
= new Document(path
, path
, vector
)
268 index_document
(doc
, auto_update
)
272 # Index multiple files
274 # The recursive method `index_dir` will be called for each directory found
278 fun index_files
(paths
: Collection[String], auto_update
: nullable Bool) do
280 if path
.to_path
.is_dir
then
281 index_dir
(path
, false)
283 index_file
(path
, false)
286 if auto_update
!= null and auto_update
then update_index
289 # Index all files in `dir` recursively
292 fun index_dir
(dir
: String, auto_update
: nullable Bool) do
293 if not dir
.to_path
.is_dir
then return
294 for file
in dir
.files
do
295 var path
= dir
/ file
296 if path
.to_path
.is_dir
then
297 index_dir
(path
, false)
299 index_file
(path
, false)
302 if auto_update
!= null and auto_update
then update_index
305 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
306 fun accept_file
(path
: String): Bool do
307 var ext
= path
.file_extension
310 if blacklist_exts
.has
(ext
) then return false
311 if whitelist_exts
.not_empty
and not whitelist_exts
.has
(ext
) then return false
313 return whitelist_exts
.is_empty
316 # Parse the `file` content as a Vector
318 # See `parse_string`.
319 fun parse_file
(file
: String): Vector do
320 return parse_string
(file
.to_path
.read_all
)
323 # File extensions white list
325 # If not empty, only files with these extensions will be indexed.
327 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
328 # blacklist will prevail and the file will be ignored.
329 var whitelist_exts
= new Array[String] is writable
331 # File extensions black list
333 # Files with these extensions will not be indexed.
334 var blacklist_exts
= new Array[String] is writable
337 # A Document to add in a VSMIndex
346 # Count of all terms found in the document
348 # Used to compute the document `terms_frequency`.
349 var terms_count
: Vector
351 # Frequency of each term found in the document
353 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
354 var terms_frequency
: Vector is lazy
do
356 for t
, c
in terms_count
do all_terms
+= c
358 var vector
= new Vector
359 for t
, c
in terms_count
do
360 vector
[t
] = c
/ all_terms
365 # Term frequency–Inverse document frequency for each term
367 # A high weight in tf–idf is reached by a high term frequency
368 # (in the given document) and a low document frequency of the term in the
369 # whole collection of documents
370 var tfidf
: Vector = terms_count
is lazy
372 redef fun to_s
do return "{title}"
375 # A match to a `request` in an `Index`
376 class IndexMatch[DOC: Document]
379 # Document matching the `request_vector`
382 # Similarity between the `request` and the `doc`.
384 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
385 # means perfect similarity.
386 var similarity
: Float
388 redef fun to_s
do return "{document} ({similarity})"
391 # Sort matches by similarity
392 class IndexMatchSorter
393 super DefaultComparator
395 redef type COMPARED: IndexMatch[Document]
397 redef fun compare
(a
, b
) do
398 return b
.similarity
<=> a
.similarity