1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
17 # Vector Space Model (VSM) is an algebraic model for representing text documents
18 # (and any objects, in general) as vectors of identifiers, such as, for example,
21 # It is used in information filtering, information retrieval, indexing and
27 # A n-dimensions vector
29 # *n-dimensions* vectors are used to represent a text document or an object.
31 super HashMap[nullable Object, Float]
33 # Cosine similarity of `self` and `other`.
35 # Gives the proximity in the range `[0.0 .. 1.0]` where 0.0 means that the
36 # two vectors are orthogonal and 1.0 means that they are identical.
54 # print v1.cosine_similarity(v2)
55 # assert v1.cosine_similarity(v2) == 1.0
56 # print v1.cosine_similarity(v3)
57 # assert v1.cosine_similarity(v3) == 0.0
59 fun cosine_similarity
(other
: SELF): Float do
61 var terms
= new HashSet[nullable Object]
62 for k
in self.keys
do terms
.add k
63 for k
in other
.keys
do terms
.add k
65 # Get dot product of two vectors
68 dot
+= self.get_or_default
(term
, 0.0) * other
.get_or_default
(term
, 0.0)
70 var cos
= dot
.to_f
/ (self.norm
* other
.norm
)
71 if cos
.is_nan
then return 0.0
75 # The norm of the vector.
77 # `||x|| = (x1 ** 2 ... + xn ** 2).sqrt`
85 # assert v.norm.is_approx(2.0, 0.001)
91 # assert v.norm.is_approx(3.742, 0.001)
95 for v
in self.values
do sum
+= v
.pow
(2.0)
100 return "[{join(", ", ":")}]"
104 # A Document index based on VSM
106 # Using VSMIndex you can index documents associated with their vector.
107 # Documents can then be matched to query vectors.
112 # TODO use a more efficient representation.
113 var documents
= new HashSet[Document]
115 # Count for all terms in all indexed documents
117 # Used to compute the `inverse_doc_frequency`.
118 var terms_doc_count
= new Vector
120 # Inverse document frequency
122 # The inverse document frequency is a measure of how much information a term
123 # provides, that is, whether the term is common or rare across all documents.
124 var inverse_doc_frequency
= new Vector
126 # Used to sort matches
129 var sorter
= new IndexMatchSorter
131 # Match `query` vector to all index document vectors
133 # Returns an `IndexMatch` for each indexed document.
134 # Results are ordered by descending similarity.
135 fun match_vector
(query
: Vector): Array[IndexMatch] do
136 var matches
= new Array[IndexMatch]
137 for doc
in documents
do
138 var sim
= query
.cosine_similarity
(doc
.tfidf
)
139 if sim
== 0.0 then continue
140 matches
.add
new IndexMatch(doc
, sim
)
148 # With each new document, the `inverse_doc_frequency` must be updated.
149 # By default, the method `update_index` is called after each call to
152 # When processing batch documents, use `auto_update = false` to disable
153 # the auto update of the index.
154 fun index_document
(doc
: Document, auto_update
: nullable Bool) do
155 for term
, count
in doc
.terms_count
do
156 if not terms_doc_count
.has_key
(term
) then
157 terms_doc_count
[term
] = 1.0
159 terms_doc_count
[term
] += 1.0
163 if auto_update
== null or auto_update
then update_index
168 # Recompute the `inverse_doc_frequency` values.
169 # Must be called manually after indexing new document with the option
170 # `auto_update = false`.
172 for doc
in documents
do
173 for term
, ccount
in doc
.terms_count
do
174 inverse_doc_frequency
[term
] = (documents
.length
.to_f
/ terms_doc_count
[term
]).log
177 for doc
in documents
do
178 for term
, freq
in doc
.terms_frequency
do
179 doc
.tfidf
[term
] = freq
* inverse_doc_frequency
[term
]
185 # A VSM index to store strings
189 # Index a new Document from `title`, `uri` and string `string`.
191 # Return the Document created.
193 # See `index_document`.
194 fun index_string
(title
, uri
, string
: String, auto_update
: nullable Bool): Document do
195 var vector
= parse_string
(string
)
196 var doc
= new Document(title
, uri
, vector
)
197 index_document
(doc
, auto_update
)
201 # Match the `query` string against all indexed documents
203 # See `match_vector`.
204 fun match_string
(query
: String): Array[IndexMatch] do
205 var vector
= parse_string
(query
)
206 return match_vector
(vector
)
209 # Parse the `string` as a Vector
211 # Returns a vector containing the terms of `string`.
212 fun parse_string
(string
: String): Vector do
213 var reader
= new StringReader(string
)
214 var vector
= new Vector
216 var token
= reader
.read_word
217 if token
== "" then break
219 if not vector
.has_key
(token
) then
229 # A VSMIndex to index files
233 # Index a file from its `path`.
235 # Return the created document or null if `path` is not accepted by `accept_file`.
237 # See `index_document`.
238 fun index_file
(path
: String, auto_update
: nullable Bool): nullable Document do
239 if not accept_file
(path
) then return null
240 var vector
= parse_file
(path
)
241 var doc
= new Document(path
, path
, vector
)
242 index_document
(doc
, auto_update
)
246 # Index multiple files
248 # The recursive method `index_dir` will be called for each directory found
252 fun index_files
(paths
: Collection[String], auto_update
: nullable Bool) do
254 if path
.to_path
.is_dir
then
255 index_dir
(path
, false)
257 index_file
(path
, false)
260 if auto_update
!= null and auto_update
then update_index
263 # Index all files in `dir` recursively
266 fun index_dir
(dir
: String, auto_update
: nullable Bool) do
267 if not dir
.to_path
.is_dir
then return
268 for file
in dir
.files
do
269 var path
= dir
/ file
270 if path
.to_path
.is_dir
then
271 index_dir
(path
, false)
273 index_file
(path
, false)
276 if auto_update
!= null and auto_update
then update_index
279 # Is `path` accepted depending on `whitelist_exts` and `blacklist_exts`?
280 fun accept_file
(path
: String): Bool do
281 var ext
= path
.file_extension
284 if blacklist_exts
.has
(ext
) then return false
285 if whitelist_exts
.not_empty
and not whitelist_exts
.has
(ext
) then return false
287 return whitelist_exts
.is_empty
290 # Parse the `file` content as a Vector
292 # See `parse_string`.
293 fun parse_file
(file
: String): Vector do
294 return parse_string
(file
.to_path
.read_all
)
297 # File extensions white list
299 # If not empty, only files with these extensions will be indexed.
301 # If an extension is in both `whitelist_exts` and `blacklist_exts`, the
302 # blacklist will prevail and the file will be ignored.
303 var whitelist_exts
= new Array[String] is writable
305 # File extensions black list
307 # Files with these extensions will not be indexed.
308 var blacklist_exts
= new Array[String] is writable
311 # A Document to add in a VSMIndex
320 # Count of all terms found in the document
322 # Used to compute the document `terms_frequency`.
323 var terms_count
: Vector
325 # Frequency of each term found in the document
327 # Used to match the document against the `VSMIndex::inverse_doc_frequency`.
328 var terms_frequency
: Vector is lazy
do
330 for t
, c
in terms_count
do all_terms
+= c
332 var vector
= new Vector
333 for t
, c
in terms_count
do
334 vector
[t
] = c
/ all_terms
339 # Term frequency–Inverse document frequency for each term
341 # A high weight in tf–idf is reached by a high term frequency
342 # (in the given document) and a low document frequency of the term in the
343 # whole collection of documents
344 var tfidf
= new Vector
346 redef fun to_s
do return "{title}"
349 # A match to a `request` in an `Index`
353 # Document matching the `request_vector`
354 var document
: Document
356 # Similarity between the `request` and the `doc`.
358 # Result is in the range 0.0 .. 1.1 where 0.0 means no similarity and 1.0
359 # means perfect similarity.
360 var similarity
: Float
362 redef fun to_s
do return "{document} ({similarity})"
365 # Sort matches by similarity
366 class IndexMatchSorter
367 super DefaultComparator
369 redef type COMPARED: IndexMatch
371 redef fun compare
(a
, b
) do
372 return b
.similarity
<=> a
.similarity