tesseract 4.1.1
Loading...
Searching...
No Matches
statistc.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: statistc.h (Formerly stats.h)
3 * Description: Class description for STATS class.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef TESSERACT_CCSTRUCT_STATISTC_H_
20#define TESSERACT_CCSTRUCT_STATISTC_H_
21
22#include <cstdio>
23#include "kdpair.h"
24#include "scrollview.h"
25
26template <typename T> class GenericVector;
27
28
29// Simple histogram-based statistics for integer values in a known
30// range, such that the range is small compared to the number of samples.
31class STATS {
32 public:
33 // The histogram buckets are in the range
34 // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
35 // [min_bucket_value, max_bucket_value].
36 // Any data under min_bucket value is silently mapped to min_bucket_value,
37 // and likewise, any data over max_bucket_value is silently mapped to
38 // max_bucket_value.
39 // In the internal array, min_bucket_value maps to 0 and
40 // max_bucket_value_plus_1 - min_bucket_value to the array size.
41 // TODO(rays) This is ugly. Convert the second argument to
42 // max_bucket_value and all the code that uses it.
43 STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
44 STATS() = default; // empty for arrays
45
46 ~STATS();
47
48 // (Re)Sets the range and clears the counts.
49 // See the constructor for info on max and min values.
50 bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
51
52 void clear(); // empty buckets
53
54 void add(int32_t value, int32_t count);
55
56 // "Accessors" return various statistics on the data.
57 int32_t mode() const; // get mode of samples
58 double mean() const; // get mean of samples
59 double sd() const; // standard deviation
60 // Returns the fractile value such that frac fraction (in [0,1]) of samples
61 // has a value less than the return value.
62 double ile(double frac) const;
63 // Returns the minimum used entry in the histogram (ie the minimum of the
64 // data, NOT the minimum of the supplied range, nor is it an index.)
65 // Would normally be called min(), but that is a reserved word in VC++.
66 int32_t min_bucket() const; // Find min
67 // Returns the maximum used entry in the histogram (ie the maximum of the
68 // data, NOT the maximum of the supplied range, nor is it an index.)
69 int32_t max_bucket() const; // Find max
70 // Finds a more useful estimate of median than ile(0.5).
71 // Overcomes a problem with ile() - if the samples are, for example,
72 // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
73 // between 6 and 13 = 9.5
74 double median() const; // get median of samples
75 // Returns the count of the given value.
76 int32_t pile_count(int32_t value) const {
77 if (value <= rangemin_)
78 return buckets_[0];
79 if (value >= rangemax_ - 1)
80 return buckets_[rangemax_ - rangemin_ - 1];
81 return buckets_[value - rangemin_];
82 }
83 // Returns the total count of all buckets.
84 int32_t get_total() const {
85 return total_count_; // total of all piles
86 }
87 // Returns true if x is a local min.
88 bool local_min(int32_t x) const;
89
90 // Apply a triangular smoothing filter to the stats.
91 // This makes the modes a bit more useful.
92 // The factor gives the height of the triangle, i.e. the weight of the
93 // centre.
94 void smooth(int32_t factor);
95
96 // Cluster the samples into max_cluster clusters.
97 // Each call runs one iteration. The array of clusters must be
98 // max_clusters+1 in size as cluster 0 is used to indicate which samples
99 // have been used.
100 // The return value is the current number of clusters.
101 int32_t cluster(float lower, // thresholds
102 float upper,
103 float multiple, // distance threshold
104 int32_t max_clusters, // max no to make
105 STATS *clusters); // array of clusters
106
107// Finds (at most) the top max_modes modes, well actually the whole peak around
108// each mode, returning them in the given modes vector as a <mean of peak,
109// total count of peak> pair in order of decreasing total count.
110// Since the mean is the key and the count the data in the pair, a single call
111// to sort on the output will re-sort by increasing mean of peak if that is
112// more useful than decreasing total count.
113// Returns the actual number of modes found.
114 int top_n_modes(
115 int max_modes,
117
118 // Prints a summary and table of the histogram.
119 void print() const;
120 // Prints summary stats only of the histogram.
121 void print_summary() const;
122
123 #ifndef GRAPHICS_DISABLED
124 // Draws the histogram as a series of rectangles.
125 void plot(ScrollView* window, // window to draw in
126 float xorigin, // origin of histo
127 float yorigin, // gram
128 float xscale, // size of one unit
129 float yscale, // size of one uint
130 ScrollView::Color colour) const; // colour to draw in
131
132 // Draws a line graph of the histogram.
133 void plotline(ScrollView* window, // window to draw in
134 float xorigin, // origin of histo
135 float yorigin, // gram
136 float xscale, // size of one unit
137 float yscale, // size of one uint
138 ScrollView::Color colour) const; // colour to draw in
139 #endif // GRAPHICS_DISABLED
140
141 private:
142 int32_t rangemin_ = 0; // min of range
143 // rangemax_ is not well named as it is really one past the max.
144 int32_t rangemax_ = 0; // max of range
145 int32_t total_count_ = 0; // no of samples
146 int32_t* buckets_ = nullptr; // array of cells
147};
148
// Picks the item that would be nth if the array were ordered, without
// actually ordering it, in linear time.
// The contents of array are shuffled by the call.
//   index: index to choose
//   array: array of items
//   count: number of items
// NOTE(review): the return type is int32_t while the items are float, so
// the result is presumably a position in array - confirm in statistc.cpp.
int32_t choose_nth_item(int32_t index, float* array, int32_t count);
// Generic overload of choose_nth_item: ordering is decided by a supplied
// comparator with qsort semantics.
//   index:  index to choose
//   array:  array of items
//   count:  number of items
//   size:   element size
//   compar: comparator
int32_t choose_nth_item(int32_t index, void* array, int32_t count, size_t size,
                        int (*compar)(const void*, const void*));
// Exchanges two entries of an array in place.
//   array:          array of entries
//   size:           size of one entry
//   index1, index2: the entries to swap
void swap_entries(void* array, size_t size, int32_t index1, int32_t index2);
166
167#endif // TESSERACT_CCSTRUCT_STATISTC_H_
void swap_entries(void *array, size_t size, int32_t index1, int32_t index2)
Definition: statistc.cpp:758
int32_t choose_nth_item(int32_t index, float *array, int32_t count)
Definition: statistc.cpp:630
int count(LIST var_list)
Definition: oldlist.cpp:95
STATS
Definition: statistc.h:31
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:577
int32_t max_bucket() const
Definition: statistc.cpp:213
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:312
void smooth(int32_t factor)
Definition: statistc.cpp:281
void clear()
Definition: statistc.cpp:75
int32_t pile_count(int32_t value) const
Definition: statistc.h:76
double mean() const
Definition: statistc.cpp:127
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double sd() const
Definition: statistc.cpp:143
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:56
double median() const
Definition: statistc.cpp:231
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:461
int32_t get_total() const
Definition: statistc.h:84
void print() const
Definition: statistc.cpp:526
STATS()=default
int32_t min_bucket() const
Definition: statistc.cpp:198
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:604
double ile(double frac) const
Definition: statistc.cpp:166
~STATS()
Definition: statistc.cpp:86
bool local_min(int32_t x) const
Definition: statistc.cpp:254
void print_summary() const
Definition: statistc.cpp:552
int32_t mode() const
Definition: statistc.cpp:107