tesseract 4.1.1
Loading...
Searching...
No Matches
linlsq.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: linlsq.cpp (Formerly llsq.c)
3 * Description: Linear Least squares fitting code.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include <cstdio>
20#include <cmath> // for std::sqrt
21#include "errcode.h"
22#include "linlsq.h"
23
24constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ");
25
26/**********************************************************************
27 * LLSQ::clear
28 *
29 * Function to initialize a LLSQ.
30 **********************************************************************/
31
32void LLSQ::clear() { // initialize
33 total_weight = 0.0; // no elements
34 sigx = 0.0; // update accumulators
35 sigy = 0.0;
36 sigxx = 0.0;
37 sigxy = 0.0;
38 sigyy = 0.0;
39}
40
41
42/**********************************************************************
43 * LLSQ::add
44 *
45 * Add an element to the accumulator.
46 **********************************************************************/
47
48void LLSQ::add(double x, double y) { // add an element
49 total_weight++; // count elements
50 sigx += x; // update accumulators
51 sigy += y;
52 sigxx += x * x;
53 sigxy += x * y;
54 sigyy += y * y;
55}
56// Adds an element with a specified weight.
57void LLSQ::add(double x, double y, double weight) {
58 total_weight += weight;
59 sigx += x * weight; // update accumulators
60 sigy += y * weight;
61 sigxx += x * x * weight;
62 sigxy += x * y * weight;
63 sigyy += y * y * weight;
64}
65// Adds a whole LLSQ.
66void LLSQ::add(const LLSQ& other) {
67 total_weight += other.total_weight;
68 sigx += other.sigx; // update accumulators
69 sigy += other.sigy;
70 sigxx += other.sigxx;
71 sigxy += other.sigxy;
72 sigyy += other.sigyy;
73}
74
75
76/**********************************************************************
77 * LLSQ::remove
78 *
79 * Delete an element from the acculuator.
80 **********************************************************************/
81
82void LLSQ::remove(double x, double y) { // delete an element
83 if (total_weight <= 0.0) // illegal
84 EMPTY_LLSQ.error("LLSQ::remove", ABORT, nullptr);
85 total_weight--; // count elements
86 sigx -= x; // update accumulators
87 sigy -= y;
88 sigxx -= x * x;
89 sigxy -= x * y;
90 sigyy -= y * y;
91}
92
93
94/**********************************************************************
95 * LLSQ::m
96 *
97 * Return the gradient of the line fit.
98 **********************************************************************/
99
100double LLSQ::m() const { // get gradient
101 double covar = covariance();
102 double x_var = x_variance();
103 if (x_var != 0.0)
104 return covar / x_var;
105 else
106 return 0.0; // too little
107}
108
109
110/**********************************************************************
111 * LLSQ::c
112 *
113 * Return the constant of the line fit.
114 **********************************************************************/
115
116double LLSQ::c(double m) const { // get constant
117 if (total_weight > 0.0)
118 return (sigy - m * sigx) / total_weight;
119 else
120 return 0; // too little
121}
122
123
124/**********************************************************************
125 * LLSQ::rms
126 *
127 * Return the rms error of the fit.
128 **********************************************************************/
129
130double LLSQ::rms(double m, double c) const { // get error
131 double error; // total error
132
133 if (total_weight > 0) {
134 error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c *
135 (total_weight * c - 2 * sigy);
136 if (error >= 0)
137 error = std::sqrt(error / total_weight); // sqrt of mean
138 else
139 error = 0;
140 } else {
141 error = 0; // too little
142 }
143 return error;
144}
145
146
147/**********************************************************************
148 * LLSQ::pearson
149 *
150 * Return the pearson product moment correlation coefficient.
151 **********************************************************************/
152
153double LLSQ::pearson() const { // get correlation
154 double r = 0.0; // Correlation is 0 if insufficient data.
155
156 double covar = covariance();
157 if (covar != 0.0) {
158 double var_product = x_variance() * y_variance();
159 if (var_product > 0.0)
160 r = covar / std::sqrt(var_product);
161 }
162 return r;
163}
164
165// Returns the x,y means as an FCOORD.
167 if (total_weight > 0.0) {
168 return FCOORD(sigx / total_weight, sigy / total_weight);
169 } else {
170 return FCOORD(0.0f, 0.0f);
171 }
172}
173
174// Returns the sqrt of the mean squared error measured perpendicular from the
175// line through mean_point() in the direction dir.
176//
177// Derivation:
178// Lemma: Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices).
179// Let % be dot product and ' be transpose. Note that:
180// Sum[i=1..N] (v % x_i)^2
181// = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
182// If x_i have average 0 we have:
183// = v * (N * COVARIANCE_MATRIX(X)) * v'
184// Expanded for the case that k = 2, where we treat the dimensions
185// as x_i and y_i, this is:
186// = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
187// Now, we are trying to calculate the mean squared error, where v is
188// perpendicular to our line of interest:
189// Mean squared error
190// = E [ (v % (x_i - x_avg))) ^2 ]
191// = Sum (v % (x_i - x_avg))^2 / N
192// = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
193// = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
194// = code below
195double LLSQ::rms_orth(const FCOORD &dir) const {
196 FCOORD v = !dir;
197 v.normalise();
198 return std::sqrt(x_variance() * v.x() * v.x() +
199 2 * covariance() * v.x() * v.y() +
200 y_variance() * v.y() * v.y());
201}
202
203// Returns the direction of the fitted line as a unit vector, using the
204// least mean squared perpendicular distance. The line runs through the
205// mean_point, i.e. a point p on the line is given by:
206// p = mean_point() + lambda * vector_fit() for some real number lambda.
207// Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous
208// and may be negated without changing its meaning.
209// Fitting a line m + ๐œ†v to a set of N points Pi = (xi, yi), where
210// m is the mean point (๐, ๐‚) and
211// v is the direction vector (cos๐œƒ, sin๐œƒ)
212// The perpendicular distance of each Pi from the line is:
213// (Pi - m) x v, where x is the scalar cross product.
214// Total squared error is thus:
215// E = โˆ‘((xi - ๐)sin๐œƒ - (yi - ๐‚)cos๐œƒ)ยฒ
216// = โˆ‘(xi - ๐)ยฒsinยฒ๐œƒ - 2โˆ‘(xi - ๐)(yi - ๐‚)sin๐œƒ cos๐œƒ + โˆ‘(yi - ๐‚)ยฒcosยฒ๐œƒ
217// = NVar(xi)sinยฒ๐œƒ - 2NCovar(xi, yi)sin๐œƒ cos๐œƒ + NVar(yi)cosยฒ๐œƒ (Eq 1)
218// where Var(xi) is the variance of xi,
219// and Covar(xi, yi) is the covariance of xi, yi.
220// Taking the derivative wrt ๐œƒ and setting to 0 to obtain the min/max:
221// 0 = 2NVar(xi)sin๐œƒ cos๐œƒ -2NCovar(xi, yi)(cosยฒ๐œƒ - sinยฒ๐œƒ) -2NVar(yi)sin๐œƒ cos๐œƒ
222// => Covar(xi, yi)(cosยฒ๐œƒ - sinยฒ๐œƒ) = (Var(xi) - Var(yi))sin๐œƒ cos๐œƒ
223// Using double angles:
224// 2Covar(xi, yi)cos2๐œƒ = (Var(xi) - Var(yi))sin2๐œƒ (Eq 2)
225// So ๐œƒ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3)
226
227// Because it involves 2๐œƒ , Eq 2 has 2 solutions 90 degrees apart, but which
228// is the min and which is the max? From Eq1:
229// E/N = Var(xi)sinยฒ๐œƒ - 2Covar(xi, yi)sin๐œƒ cos๐œƒ + Var(yi)cosยฒ๐œƒ
230// and 90 degrees away, using sin/cos equivalences:
231// E'/N = Var(xi)cosยฒ๐œƒ + 2Covar(xi, yi)sin๐œƒ cos๐œƒ + Var(yi)sinยฒ๐œƒ
232// The second error is smaller (making it the minimum) iff
233// E'/N < E/N ie:
234// (Var(xi) - Var(yi))(cosยฒ๐œƒ - sinยฒ๐œƒ) < -4Covar(xi, yi)sin๐œƒ cos๐œƒ
235// Using double angles:
236// (Var(xi) - Var(yi))cos2๐œƒ < -2Covar(xi, yi)sin2๐œƒ (InEq 1)
237// But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2๐œƒ such that:
238// sgn(cos2๐œƒ) = sgn(Var(xi) - Var(yi)) and sgn(sin2๐œƒ) = sgn(Covar(xi, yi))
239// so InEq1 can *never* be true, making the atan2 result *always* the min!
240// In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
241// the 2 solutions have equal error and the inequality is still false.
242// Therefore the solution really is as trivial as Eq 3.
243
244// This is equivalent to returning the Principal Component in PCA, or the
245// eigenvector corresponding to the largest eigenvalue in the covariance
246// matrix. However, atan2 is much simpler! The one reference I found that
247// uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
248// that is still a much more complex derivation. It seems Pearson had already
249// found this simple solution in 1901.
250// http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
252 double x_var = x_variance();
253 double y_var = y_variance();
254 double covar = covariance();
255 double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
256 FCOORD result(cos(theta), sin(theta));
257 return result;
258}
constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ")
@ ABORT
Definition: errcode.h:29
Definition: linlsq.h:28
double y_variance() const
Definition: linlsq.h:87
double m() const
Definition: linlsq.cpp:100
double pearson() const
Definition: linlsq.cpp:153
void remove(double x, double y)
Definition: linlsq.cpp:82
double c(double m) const
Definition: linlsq.cpp:116
double x_variance() const
Definition: linlsq.h:81
double rms(double m, double c) const
Definition: linlsq.cpp:130
double covariance() const
Definition: linlsq.h:75
void clear()
Definition: linlsq.cpp:32
void add(double x, double y)
Definition: linlsq.cpp:48
double rms_orth(const FCOORD &dir) const
Definition: linlsq.cpp:195
FCOORD mean_point() const
Definition: linlsq.cpp:166
FCOORD vector_fit() const
Definition: linlsq.cpp:251
Definition: points.h:189
float y() const
Definition: points.h:210
bool normalise()
Convert to unit vec.
Definition: points.cpp:29
float x() const
Definition: points.h:207
void error(const char *caller, TessErrorLogCode action, const char *format,...) const
Definition: errcode.cpp:35