tesseract 4.1.1
Loading...
Searching...
No Matches
fixxht.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: fixxht.cpp (Formerly fixxht.c)
3 * Description: Improve x_ht and look out for case inconsistencies
4 * Author: Phil Cheatle
5 * Created: Thu Aug 5 14:11:08 BST 1993
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#include <algorithm>
21#include <cstring>
22#include <cctype>
23#include "params.h"
24#include "float2int.h"
25#include "tesseractclass.h"
26
27namespace tesseract {
28
29// Fixxht overview.
30// Premise: Initial estimate of x-height is adequate most of the time, but
31// occasionally it is incorrect. Most notable causes of failure are:
32// 1. Small caps, where the top of the caps is the same as the body text
33// xheight. For small caps words the xheight needs to be reduced to correctly
34// recognize the caps in the small caps word.
35// 2. All-xheight lines, such as "summer". Here the initial estimate will have
36// guessed that the blob tops are caps and will have placed the xheight too low.
37// 3. Noise/logos beside words, or changes in font size on a line. Such
38// things can blow the statistics and cause an incorrect estimate.
39// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
40// In this case the x-height is often still correct.
41//
42// Algorithm.
43// Compare the vertical position (top only) of alphanumerics in a word with
44// the range of positions in training data (in the unicharset).
45// See CountMisfitTops. If any characters disagree sufficiently with the
46// initial xheight estimate, then recalculate the xheight, re-run OCR on
47// the word, and if the number of vertical misfits goes down, along with
48// either the word rating or certainty, then keep the new xheight.
49// The new xheight is calculated as follows (see ComputeCompatibleXheight):
50// For each alphanumeric character that has a vertically misplaced top
51// (a misfit), yet its bottom is within the acceptable range (ie it is not
52// likely a sub- or super-script) calculate the range of acceptable xheight
53// positions from its range of tops, and give each value in the range a
54// number of votes equal to the distance of its top from its acceptance range.
55// The x-height position with the median of the votes becomes the new
56// x-height. This assumes that most characters will be correctly recognized
57// even if the x-height is incorrect. This is not a terrible assumption, but
58// it is not great. An improvement would be to use a classifier that does
59// not care about vertical position or scaling at all.
60// Separately collect stats on shifted baselines and apply the same logic to
61// computing a best-fit shift to fix the error. If the baseline needs to be
62// shifted, but the x-height is OK, returns the original x-height along with
63// the baseline shift to indicate that recognition needs to re-run.
64
65// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
66// then the char top cannot be used to judge misfits or suggest a new top.
67const int kMaxCharTopRange = 48;
68
69// Returns the number of misfit blob tops in this word.
71 int bad_blobs = 0;
72 int num_blobs = word_res->rebuild_word->NumBlobs();
73 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
74 TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
75 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
76 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
77 int top = blob->bounding_box().top();
78 if (top >= INT_FEAT_RANGE)
79 top = INT_FEAT_RANGE - 1;
80 int min_bottom, max_bottom, min_top, max_top;
81 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
82 &min_top, &max_top);
83 if (max_top - min_top > kMaxCharTopRange)
84 continue;
85 bool bad = top < min_top - x_ht_acceptance_tolerance ||
86 top > max_top + x_ht_acceptance_tolerance;
87 if (bad)
88 ++bad_blobs;
89 if (debug_x_ht_level >= 1) {
90 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
91 unicharset.id_to_unichar(class_id),
92 bad ? "Misfit" : "OK", top, min_top, max_top,
93 static_cast<int>(x_ht_acceptance_tolerance));
94 }
95 }
96 }
97 return bad_blobs;
98}
99
100// Returns a new x-height maximally compatible with the result in word_res.
101// See comment above for overall algorithm.
103 float* baseline_shift) {
104 STATS top_stats(0, UINT8_MAX);
105 STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106 int bottom_shift = 0;
107 int num_blobs = word_res->rebuild_word->NumBlobs();
108 do {
109 top_stats.clear();
110 shift_stats.clear();
111 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
112 TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
113 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
114 if (unicharset.get_isalpha(class_id) ||
115 unicharset.get_isdigit(class_id)) {
116 int top = blob->bounding_box().top() + bottom_shift;
117 // Clip the top to the limit of normalized feature space.
118 if (top >= INT_FEAT_RANGE)
119 top = INT_FEAT_RANGE - 1;
120 int bottom = blob->bounding_box().bottom() + bottom_shift;
121 int min_bottom, max_bottom, min_top, max_top;
122 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
123 &min_top, &max_top);
124 // Chars with a wild top range would mess up the result so ignore them.
125 if (max_top - min_top > kMaxCharTopRange)
126 continue;
127 int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
128 top - (max_top + x_ht_acceptance_tolerance));
129 int height = top - kBlnBaselineOffset;
130 if (debug_x_ht_level >= 2) {
131 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
132 unicharset.id_to_unichar(class_id),
133 height, min_bottom, max_bottom, min_top, max_top,
134 bottom, top);
135 }
136 // Use only chars that fit in the expected bottom range, and where
137 // the range of tops is sensibly near the xheight.
138 if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
139 bottom - x_ht_acceptance_tolerance <= max_bottom &&
140 min_top > kBlnBaselineOffset &&
141 max_top - kBlnBaselineOffset >= kBlnXHeight &&
142 misfit_dist > 0) {
143 // Compute the x-height position using proportionality between the
144 // actual height and expected height.
145 int min_xht = DivRounded(height * kBlnXHeight,
146 max_top - kBlnBaselineOffset);
147 int max_xht = DivRounded(height * kBlnXHeight,
148 min_top - kBlnBaselineOffset);
149 if (debug_x_ht_level >= 2) {
150 tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
151 }
152 // The range of expected heights gets a vote equal to the distance
153 // of the actual top from the expected top.
154 for (int y = min_xht; y <= max_xht; ++y)
155 top_stats.add(y, misfit_dist);
156 } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
157 bottom - x_ht_acceptance_tolerance > max_bottom) &&
158 bottom_shift == 0) {
159 // Get the range of required bottom shift.
160 int min_shift = min_bottom - bottom;
161 int max_shift = max_bottom - bottom;
162 if (debug_x_ht_level >= 2) {
163 tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
164 }
165 // The range of expected shifts gets a vote equal to the min distance
166 // of the actual bottom from the expected bottom, spread over the
167 // range of its acceptance.
168 int misfit_weight = abs(min_shift);
169 if (max_shift > min_shift)
170 misfit_weight /= max_shift - min_shift;
171 for (int y = min_shift; y <= max_shift; ++y)
172 shift_stats.add(y, misfit_weight);
173 } else {
174 if (bottom_shift == 0) {
175 // Things with bottoms that are already ok need to say so, on the
176 // 1st iteration only.
177 shift_stats.add(0, kBlnBaselineOffset);
178 }
179 if (debug_x_ht_level >= 2) {
180 tprintf(" already OK\n");
181 }
182 }
183 }
184 }
185 if (shift_stats.get_total() > top_stats.get_total()) {
186 bottom_shift = IntCastRounded(shift_stats.median());
187 if (debug_x_ht_level >= 2) {
188 tprintf("Applying bottom shift=%d\n", bottom_shift);
189 }
190 }
191 } while (bottom_shift != 0 &&
192 top_stats.get_total() < shift_stats.get_total());
193 // Baseline shift is opposite sign to the bottom shift.
194 *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
195 if (debug_x_ht_level >= 2) {
196 tprintf("baseline shift=%g\n", *baseline_shift);
197 }
198 if (top_stats.get_total() == 0)
199 return bottom_shift != 0 ? word_res->x_height : 0.0f;
200 // The new xheight is just the median vote, which is then scaled out
201 // of BLN space back to pixel space to get the x-height in pixel space.
202 float new_xht = top_stats.median();
203 if (debug_x_ht_level >= 2) {
204 tprintf("Median xht=%f\n", new_xht);
205 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
206 new_xht, new_xht / word_res->denorm.y_scale());
207 }
208 // The xheight must change by at least x_ht_min_change to be used.
209 if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
210 return new_xht / word_res->denorm.y_scale();
211 else
212 return bottom_shift != 0 ? word_res->x_height : 0.0f;
213}
214
215} // namespace tesseract
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
int DivRounded(int a, int b)
Definition: helpers.h:167
int IntCastRounded(double x)
Definition: helpers.h:175
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
#define INT_FEAT_RANGE
Definition: float2int.h:27
const int kMaxCharTopRange
Definition: fixxht.cpp:67
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
float y_scale() const
Definition: normalis.h:270
DENORM denorm
Definition: pageres.h:201
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE * best_choice
Definition: pageres.h:241
float x_height
Definition: pageres.h:316
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65
Definition: statistc.h:31
void clear()
Definition: statistc.cpp:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
int32_t get_total() const
Definition: statistc.h:84
UNICHARSET unicharset
Definition: ccutil.h:73
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568