Merge: Use a Buffer in StringWriter
[nit.git] / lib / counter.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Simple numerical statistical analysis and presentation
16 module counter
17
18 import poset
19
20 # A counter counts occurrences of things
21 # Use this instead of a `HashMap[E, Int]`
22 #
23 # ~~~
24 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
25 # assert c["a"] == 2
26 # assert c["b"] == 3
27 # assert c["c"] == 1
28 # assert c["d"] == 0
29 # ~~~
30 #
31 # The counter class can also be used to gather statistical informations.
32 #
33 # ~~~~
34 # assert c.length == 3 # because 3 distinct values
35 # assert c.max == "b" # because "b" has the most count (3)
36 # assert c.avg == 2.0 # because it is the mean of the counts
37 # ~~~~
38 class Counter[E]
39 super Map[E, Int]
40
41 # Total number of counted occurrences
42 #
43 # ~~~
44 # var c = new Counter[String]
45 # assert c.sum == 0
46 # c.inc_all(["a", "a", "b", "b", "b", "c"])
47 # assert c.sum == 6
48 # ~~~
49 var sum: Int = 0
50
51 private var map = new HashMap[E, Int]
52
53 redef fun iterator do return map.iterator
54
55 # The number of counted occurrences of `e`
56 redef fun [](e)
57 do
58 var map = self.map
59 if map.has_key(e) then return map[e]
60 return 0
61 end
62
63 redef fun []=(e, value)
64 do
65 sum -= self[e]
66 self.map[e] = value
67 sum += value
68 end
69
70 redef fun keys do return map.keys
71
72 redef fun values do return map.values
73
74 redef fun length do return map.length
75
76 redef fun is_empty do return map.is_empty
77
78 redef fun clear do
79 sum = 0
80 map.clear
81 end
82
83 # Count one more occurrence of `e`
84 fun inc(e: E)
85 do
86 self.map[e] = self[e] + 1
87 sum += 1
88 end
89
90 # Count one more for each element of `es`
91 fun inc_all(es: Collection[E])
92 do
93 for e in es do inc(e)
94 end
95
96 # Decrement the value of `e` by 1
97 fun dec(e: E) do
98 if not has_key(e) then
99 self.map[e] = 0
100 else
101 self.map[e] = self[e] - 1
102 sum += - 1
103 end
104 end
105
106 # Decrement the value for each element of `es`
107 fun dec_all(es: Collection[E])
108 do
109 for e in es do dec(e)
110 end
111
112 # A new Counter initialized with `inc_all`.
113 init from(es: Collection[E])
114 do
115 inc_all(es)
116 end
117
118 # Return an array of elements sorted by occurrences
119 #
120 # ~~~
121 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
122 # assert c.sort == ["c", "a", "b"]
123 # ~~~
124 fun sort: Array[E]
125 do
126 var res = map.keys.to_a
127 var sorter = new CounterComparator[E](self)
128 sorter.sort(res)
129 return res
130 end
131
132 # The method used to display an element
133 # @toimplement by default just call `to_s` on the element
134 protected fun element_to_s(e: E): String
135 do
136 return e.to_s
137 end
138
139 # Display statistical information
140 fun print_summary
141 do
142 var list = self.sort
143 print " population: {list.length}"
144 if list.is_empty then return
145 print " minimum value: {self[list.first]}"
146 print " maximum value: {self[list.last]}"
147 print " total value: {self.sum}"
148 print " average value: {div(self.sum,list.length)}"
149 print " distribution:"
150 var count = 0
151 var sum = 0
152 var limit = self[list.first]
153 for t in list do
154 if self[t] > limit then
155 print " <={limit}: sub-population={count} ({div(count*100,list.length)}%); cumulated value={sum} ({div(sum*100,self.sum)}%)"
156 count = 0
157 sum = 0
158 while self[t] > limit do
159 limit = limit * 2
160 if limit == 0 then limit = 1
161 end
162 end
163 count += 1
164 sum += self[t]
165 end
166 print " <={limit}: sub-population={count} ({div(count*100,list.length)}%); cumulated value={sum} ({div(sum*100,self.sum)}%)"
167 end
168
169 # Display up to `count` most used elements and `count` least used elements
170 # Use `element_to_s` to display the element
171 fun print_elements(count: Int)
172 do
173 print " list:"
174 var list = self.sort
175 var min = count
176 if list.length <= count*2 then min = list.length
177 for i in [0..min[ do
178 var t = list[list.length-i-1]
179 print " {element_to_s(t)}: {self[t]} ({div(self[t]*100,self.sum)}%)"
180 end
181 if list.length <= count*2 then return
182 print " ..."
183 for i in [0..min[ do
184 var t = list[min-i-1]
185 print " {element_to_s(t)}: {self[t]} ({div(self[t]*100,self.sum)}%)"
186 end
187 end
188
189 # Return the element with the highest value (aka. the mode)
190 #
191 # ~~~
192 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
193 # assert c.max == "b"
194 # ~~~
195 #
196 # If more than one max exists, the first one is returned.
197 fun max: nullable E do
198 var max: nullable Int = null
199 var elem: nullable E = null
200 for e in map.keys do
201 var v = map[e]
202 if max == null or v > max then
203 max = v
204 elem = e
205 end
206 end
207 return elem
208 end
209
210 # Return the couple with the lowest value
211 #
212 # ~~~
213 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
214 # assert c.min == "c"
215 # ~~~
216 #
217 # If more than one min exists, the first one is returned.
218 fun min: nullable E do
219 var min: nullable Int = null
220 var elem: nullable E = null
221 for e in map.keys do
222 var v = map[e]
223 if min == null or v < min then
224 min = v
225 elem = e
226 end
227 end
228 return elem
229 end
230
231 # Values average (aka. arithmetic mean)
232 #
233 # ~~~
234 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
235 # assert c.avg == 2.0
236 # ~~~
237 fun avg: Float do
238 if values.is_empty then return 0.0
239 return (sum / values.length).to_f
240 end
241
242 # The standard derivation of the counter values
243 #
244 # ~~~
245 # var c = new Counter[String].from(["a", "a", "b", "b", "b", "c"])
246 # assert c.std_dev > 0.81
247 # assert c.std_dev < 0.82
248 # ~~~
249 fun std_dev: Float do
250 var avg = self.avg
251 var sum = 0.0
252 for value in map.values do
253 sum += (value.to_f - avg).pow(2.0)
254 end
255 return (sum / map.length.to_f).sqrt
256 end
257
258 # The information entropy (Shannon entropy) of the elements in the counter (in bits).
259 fun entropy: Float
260 do
261 var res = 0.0
262 var sum = self.sum.to_f
263 for k, v in self do
264 var f = v.to_f / sum
265 res = res - f * f.log_base(2.0)
266 end
267 return res
268 end
269
270 # Prints the content of the counter along with statistics
271 #
272 # Content is printed in order (if available) from lowest to highest on the keys.
273 # Else, it is printed as-is
274 fun print_content do
275 var a = keys.to_a
276 if a isa Array[Comparable] then default_comparator.sort(a)
277 var subtotal = 0
278 for i in a do
279 subtotal += self[i]
280 printn("* ", i or else "null", " = ", self[i], " => occurences ", self[i].to_f / sum.to_f * 100.0, "%, cumulative ", subtotal.to_f / sum.to_f * 100.0, "% \n")
281 end
282 end
283
284 # Packs elements into separate arrays based on their occurences
285 #
286 # ~~~nit
287 # var x = "aaaabbbeeecccddhhgjt"
288 # var c = new Counter[Char]
289 # for i in x do c.inc i
290 # var ret = c.pack
291 # assert ret.join(", ") == """[t,g,j], [d,h], [c,b,e], [a]"""
292 # ~~~
293 fun pack: Array[Array[E]] do
294 var ret = new Array[Array[E]]
295 var base = self.sort
296 if base.is_empty then return ret
297 var curr = new Array[E]
298 curr.push base.first
299 var curr_score = self[base.first]
300 base.shift
301 for i in base do
302 if self[i] == curr_score then
303 curr.push i
304 continue
305 end
306 curr_score = self[i]
307 ret.push curr
308 curr = new Array[E]
309 curr.push i
310 end
311 if not curr.is_empty then ret.push curr
312 return ret
313 end
314 end
315
316 redef class Collection[E]
317 # Create and fill up a counter with the elements of `self.
318 #
319 # ~~~
320 # var cpt = "abaa".chars.to_counter
321 # assert cpt['a'] == 3
322 # assert cpt['b'] == 1
323 # assert cpt.length == 2
324 # assert cpt.sum == 4
325 # ~~~
326 fun to_counter: Counter[E]
327 do
328 var res = new Counter[E]
329 res.inc_all(self)
330 return res
331 end
332 end
333
334 private class CounterComparator[E]
335 super Comparator
336 redef type COMPARED: E
337 var counter: Counter[E]
338 redef fun compare(a,b) do return self.counter.map[a] <=> self.counter.map[b]
339 end
340
341 redef class POSet[E]
342 private fun show_counter(c: Counter[Int])
343 do
344 var list = c.sort
345 default_comparator.sort(list)
346 for e in list do
347 print " {e} -> {c[e]} times ({div(c[e]*100, c.sum)}%)"
348 end
349 end
350
351 # Display exhaustive metrics about the poset
352 fun print_metrics
353 do
354 var nb_greaters = new Counter[E]
355 var nb_direct_greaters = new Counter[E]
356 var nb_smallers = new Counter[E]
357 var nb_direct_smallers = new Counter[E]
358 var nb_direct_edges = 0
359 var nb_edges = 0
360 for n in self do
361 var ne = self[n]
362 nb_edges += ne.greaters.length
363 nb_direct_edges += ne.direct_greaters.length
364 nb_greaters[n] = ne.greaters.length
365 nb_direct_greaters[n] = ne.direct_greaters.length
366 nb_smallers[n] = ne.smallers.length
367 nb_direct_smallers[n] = ne.direct_smallers.length
368 end
369 print "Number of nodes: {self.length}"
370 print "Number of edges: {nb_edges} ({div(nb_edges,self.length)} per node)"
371 print "Number of direct edges: {nb_direct_edges} ({div(nb_direct_edges,self.length)} per node)"
372 print "Distribution of greaters"
373 nb_greaters.print_summary
374 print "Distribution of direct greaters"
375 nb_direct_greaters.print_summary
376 print "Distribution of smallers"
377 nb_smallers.print_summary
378 print "Distribution of direct smallers"
379 nb_direct_smallers.print_summary
380 end
381 end
382
383 # Helper function to display `n/d` and handle division by 0
384 fun div(n: Int, d: Int): String
385 do
386 if d == 0 then return "na"
387 return ((100*n/d).to_f/100.0).to_precision(2)
388 end