1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # A n-dimensions vector
29 # *n-dimensions* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
71 if cos
.is_nan
then return 0.0
76 if not has_key
(k
) then return 0.0
80 # The norm of the vector.
82 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
90 # assert v.norm.is_approx(2.0, 0.001)
96 # assert v.norm.is_approx(3.742, 0.001)
100 for v
in self.values
do sum
+= v
.pow
(2.0)
105 return "[{join(", ", ":")}]"
109 # A Document index based on VSM
111 # Using VSMIndex you can index documents associated with their vector.
112 # Documents can then be matched to query vectors.
117 # TODO use a more efficient representation.
118 var documents
= new HashSet[Document]
120 # Count for all terms in all indexed documents
122 # Used to compute the `inverse_doc_frequency`.
123 var terms_doc_count
= new Vector
125 # Inverse document frequency
127 # The inverse document frequency is a measure of how much information a term
128 # provides, that is, whether the term is common or rare across all documents.
129 var inverse_doc_frequency
= new Vector
131 # Used to sort matches
134 var sorter
= new IndexMatchSorter
136 # Match `query` vector to all index document vectors
138 # Returns an `IndexMatch` for each indexed document.
139 # Results are ordered by descending similarity.
140 fun match_vector
(query
: Vector): Array[IndexMatch] do
141 var matches
= new Array[IndexMatch]
142 for doc
in documents
do
143 var sim
= query
.cosine_similarity
(doc
.tfidf
)
144 if sim
== 0.0 then continue
145 matches
.add
new IndexMatch(doc
, sim
)
153 # With each new document, the `inverse_doc_frequency` must be updated.
154 # By default, the method `update_index` is called after each call to
157 # When processing batch documents, use `auto_update = false` to disable
158 # the auto update of the index.
159 fun index_document
(doc
: Document, auto_update
: nullable Bool) do
160 for term
, count
in doc
.terms_count
do
161 if not terms_doc_count
.has_key
(term
) then
162 terms_doc_count
[term
] = 1.0
164 terms_doc_count
[term
] += 1.0
168 if auto_update
== null or auto_update
then update_index
173 # Recompute the `inverse_doc_frequency` values.
174 # Must be called manually after indexing new document with the option
175 # `auto_update = false`.
177 for doc
in documents
do
178 for term
, ccount
in doc
.terms_count
do
179 inverse_doc_frequency
[term
] = (documents
.length
.to_f
/ terms_doc_count
[term
]).log
182 for doc
in documents
do
183 for term
, freq
in doc
.terms_frequency
do
184 doc
.tfidf
[term
] = freq
* inverse_doc_frequency
[term
]
190 # A VSM index to store strings
194 # Index a new Document from `title`, `uri` and string `string`.
196 # Return the Document created.
198 # See `index_document`.
199 fun index_string
(title
, uri
, string
: String, auto_update
: nullable Bool): Document do
200 var vector
= parse_string
(string
)
201 var doc
= new Document(title
, uri
, vector
)
202 index_document
(doc
, auto_update
)
206 # Match the `query` string against all indexed documents
208 # See `match_vector`.
209 fun match_string
(query
: String): Array[IndexMatch] do
210 var vector
= parse_string
(query
)
211 var doc
= new Document("", "", vector
)
212 return match_vector
(doc
.terms_frequency
)
215 # Parse the `string` as a Vector
217 # Returns a vector containing the terms of `string`.
218 fun parse_string
(string
: String): Vector do
219 var reader
= new StringReader(string
)
220 var vector
= new Vector
222 var token
= reader
.read_word
223 if token
== "" then break
225 if not vector
.has_key
(token
) then
235 # A VSMIndex to index files
239 # Index a file from its `path`.
241 # Return the created document or null if `path` is not accepted by `accept_file`.
243 # See `index_document`.
244 fun index_file
(path
: String, auto_update
: nullable Bool): nullable Document do
245 if not accept_file
(path
) then return null
246 var vector
= parse_file
(path
)
247 var doc
= new Document(path
, path
, vector
)
248 index_document
(doc
, auto_update
)
252 # Index multiple files
254 # The recursive method `index_dir` will be called for each directory found
258 fun index_files
(paths
: Collection[String], auto_update
: nullable Bool) do
260 if path
.to_path
.is_dir
then
261 index_dir
(path
, false)
263 index_file
(path
, false)
266 if auto_update
!= null and auto_update
then update_index
269 # Index all files in `dir` recursively
272 fun index_dir
(dir
: String, auto_update
: nullable Bool) do
273 if not dir
.to_path
.is_dir
then return
274 for file
in dir
.files
do
275 var path
= dir
/ file
276 if path
.to_path
.is_dir
then
277 index_dir
(path
, false)
279 index_file
(path
, false)
282 if auto_update
!= null and auto_update
then update_index
285 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
286 fun accept_file
(path
: String): Bool do
287 var ext
= path
.file_extension
290 if blacklist_exts
.has
(ext
) then return false
291 if whitelist_exts
.not_empty
and not whitelist_exts
.has
(ext
) then return false
293 return whitelist_exts
.is_empty
296 # Parse the `file` content as a Vector
298 # See `parse_string`.
299 fun parse_file
(file
: String): Vector do
300 return parse_string
(file
.to_path
.read_all
)
303 # File extensions white list
305 # If not empty, only files with these extensions will be indexed.
307 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
308 # blacklist will prevail and the file will be ignored.
309 var whitelist_exts
= new Array[String] is writable
311 # File extensions black list
313 # Files with these extensions will not be indexed.
314 var blacklist_exts
= new Array[String] is writable
317 # A Document to add in a VSMIndex
326 # Count of all terms found in the document
328 # Used to compute the document `terms_frequency`.
329 var terms_count
: Vector
331 # Frequency of each term found in the document
333 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
334 var terms_frequency
: Vector is lazy
do
336 for t
, c
in terms_count
do all_terms
+= c
338 var vector
= new Vector
339 for t
, c
in terms_count
do
340 vector
[t
] = c
/ all_terms
345 # Term frequency–Inverse document frequency for each term
347 # A high weight in tf–idf is reached by a high term frequency
348 # (in the given document) and a low document frequency of the term in the
349 # whole collection of documents
350 var tfidf
= new Vector
352 redef fun to_s
do return "{title}"
355 # A match to a `request` in an `Index`
359 # Document matching the `request_vector`
360 var document
: Document
362 # Similarity between the `request` and the `doc`.
364 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
365 # means perfect similarity.
366 var similarity
: Float
368 redef fun to_s
do return "{document} ({similarity})"
371 # Sort matches by similarity
372 class IndexMatchSorter
373 super DefaultComparator
375 redef type COMPARED: IndexMatch
377 redef fun compare
(a
, b
) do
378 return b
.similarity
<=> a
.similarity