tesseract 4.1.1
Loading...
Searching...
No Matches
edgblob.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: edgblob.cpp (Formerly edgeloop.c)
3 * Description: Functions to clean up an outline before approximation.
4 * Author: Ray Smith
5 *
6 *(C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0(the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include "scanedg.h"
20#include "edgloop.h"
21#include "edgblob.h"
22
23// Include automatically generated configuration file if running autoconf.
24#ifdef HAVE_CONFIG_H
25#include "config_auto.h"
26#endif
27
28// Control parameters used in outline_complexity(), which rejects an outline
29// if any one of the 3 conditions is satisfied:
30// - number of children exceeds edges_max_children_per_outline
31// - number of nested layers exceeds edges_max_children_layers
32// - joint complexity exceeds edges_children_count_limit(as in child_count())
33static BOOL_VAR(edges_use_new_outline_complexity, false,
34 "Use the new outline complexity module");
35static INT_VAR(edges_max_children_per_outline, 10,
36 "Max number of children inside a character outline");
37static INT_VAR(edges_max_children_layers, 5,
38 "Max layers of nested children inside a character outline");
39static BOOL_VAR(edges_debug, false,
40 "turn on debugging for this module");
41
42static INT_VAR(edges_children_per_grandchild, 10,
43 "Importance ratio for chucking outlines");
44static INT_VAR(edges_children_count_limit, 45,
45 "Max holes allowed in blob");
46static BOOL_VAR(edges_children_fix, false,
47 "Remove boxy parents of char-like children");
48static INT_VAR(edges_min_nonhole, 12,
49 "Min pixels for potential char in box");
50static INT_VAR(edges_patharea_ratio, 40,
51 "Max lensq/area for acceptable child outline");
52static double_VAR(edges_childarea, 0.5,
53 "Min area fraction of child outline");
54static double_VAR(edges_boxarea, 0.875,
55 "Min area fraction of grandchild for box");
56
64ICOORD bleft, // corners
65ICOORD tright): bl(bleft), tr(tright) {
66 bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
67 bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
68 // make array
69 buckets.reset(new C_OUTLINE_LIST[bxdim * bydim]);
70 index = 0;
71}
72
73
81C_OUTLINE_LIST *
82OL_BUCKETS::operator()( // array access
83int16_t x, // image coords
84int16_t y) {
85 return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
86}
87
88
110 C_OUTLINE *outline, // parent outline
111 int32_t max_count, // max output
112 int16_t depth // recurion depth
113 ) {
114 int16_t xmin, xmax; // coord limits
115 int16_t ymin, ymax;
116 int16_t xindex, yindex; // current bucket
117 C_OUTLINE *child; // current child
118 int32_t child_count; // no of children
119 int32_t grandchild_count; // no of grandchildren
120 C_OUTLINE_IT child_it; // search iterator
121
122 TBOX olbox = outline->bounding_box();
123 xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
124 xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
125 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
126 ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
127 child_count = 0;
128 grandchild_count = 0;
129 if (++depth > edges_max_children_layers) // nested loops are too deep
130 return max_count + depth;
131
132 for (yindex = ymin; yindex <= ymax; yindex++) {
133 for (xindex = xmin; xindex <= xmax; xindex++) {
134 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
135 if (child_it.empty())
136 continue;
137 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
138 child_it.forward()) {
139 child = child_it.data();
140 if (child == outline || !(*child < *outline))
141 continue;
142 child_count++;
143
144 if (child_count > edges_max_children_per_outline) { // too fragmented
145 if (edges_debug)
146 tprintf("Discard outline on child_count=%d > "
147 "max_children_per_outline=%d\n",
148 child_count,
149 static_cast<int32_t>(edges_max_children_per_outline));
150 return max_count + child_count;
151 }
152
153 // Compute the "complexity" of each child recursively
154 int32_t remaining_count = max_count - child_count - grandchild_count;
155 if (remaining_count > 0)
156 grandchild_count += edges_children_per_grandchild *
157 outline_complexity(child, remaining_count, depth);
158 if (child_count + grandchild_count > max_count) { // too complex
159 if (edges_debug)
160 tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
161 "> max_count=%d\n",
162 child_count, grandchild_count, max_count);
163 return child_count + grandchild_count;
164 }
165 }
166 }
167 }
168 return child_count + grandchild_count;
169}
170
171
177// TODO(rays) Merge with outline_complexity.
178int32_t OL_BUCKETS::count_children( // recursive count
179 C_OUTLINE *outline, // parent outline
180 int32_t max_count // max output
181 ) {
182 bool parent_box; // could it be boxy
183 int16_t xmin, xmax; // coord limits
184 int16_t ymin, ymax;
185 int16_t xindex, yindex; // current bucket
186 C_OUTLINE *child; // current child
187 int32_t child_count; // no of children
188 int32_t grandchild_count; // no of grandchildren
189 int32_t parent_area; // potential box
190 float max_parent_area; // potential box
191 int32_t child_area; // current child
192 int32_t child_length; // current child
193 TBOX olbox;
194 C_OUTLINE_IT child_it; // search iterator
195
196 olbox = outline->bounding_box();
197 xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
198 xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
199 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
200 ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
201 child_count = 0;
202 grandchild_count = 0;
203 parent_area = 0;
204 max_parent_area = 0;
205 parent_box = true;
206 for (yindex = ymin; yindex <= ymax; yindex++) {
207 for (xindex = xmin; xindex <= xmax; xindex++) {
208 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
209 if (child_it.empty())
210 continue;
211 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
212 child_it.forward()) {
213 child = child_it.data();
214 if (child != outline && *child < *outline) {
215 child_count++;
216 if (child_count <= max_count) {
217 int max_grand =(max_count - child_count) /
218 edges_children_per_grandchild;
219 if (max_grand > 0)
220 grandchild_count += count_children(child, max_grand) *
221 edges_children_per_grandchild;
222 else
223 grandchild_count += count_children(child, 1);
224 }
225 if (child_count + grandchild_count > max_count) {
226 if (edges_debug)
227 tprintf("Discarding parent with child count=%d, gc=%d\n",
228 child_count,grandchild_count);
229 return child_count + grandchild_count;
230 }
231 if (parent_area == 0) {
232 parent_area = outline->outer_area();
233 if (parent_area < 0)
234 parent_area = -parent_area;
235 max_parent_area = outline->bounding_box().area() * edges_boxarea;
236 if (parent_area < max_parent_area)
237 parent_box = false;
238 }
239 if (parent_box &&
240 (!edges_children_fix ||
241 child->bounding_box().height() > edges_min_nonhole)) {
242 child_area = child->outer_area();
243 if (child_area < 0)
244 child_area = -child_area;
245 if (edges_children_fix) {
246 if (parent_area - child_area < max_parent_area) {
247 parent_box = false;
248 continue;
249 }
250 if (grandchild_count > 0) {
251 if (edges_debug)
252 tprintf("Discarding parent of area %d, child area=%d, max%g "
253 "with gc=%d\n",
254 parent_area, child_area, max_parent_area,
255 grandchild_count);
256 return max_count + 1;
257 }
258 child_length = child->pathlength();
259 if (child_length * child_length >
260 child_area * edges_patharea_ratio) {
261 if (edges_debug)
262 tprintf("Discarding parent of area %d, child area=%d, max%g "
263 "with child length=%d\n",
264 parent_area, child_area, max_parent_area,
265 child_length);
266 return max_count + 1;
267 }
268 }
269 if (child_area < child->bounding_box().area() * edges_childarea) {
270 if (edges_debug)
271 tprintf("Discarding parent of area %d, child area=%d, max%g "
272 "with child rect=%d\n",
273 parent_area, child_area, max_parent_area,
274 child->bounding_box().area());
275 return max_count + 1;
276 }
277 }
278 }
279 }
280 }
281 }
282 return child_count + grandchild_count;
283}
284
285
286
287
294void OL_BUCKETS::extract_children( // recursive count
295 C_OUTLINE *outline, // parent outline
296 C_OUTLINE_IT *it // destination iterator
297 ) {
298 int16_t xmin, xmax; // coord limits
299 int16_t ymin, ymax;
300 int16_t xindex, yindex; // current bucket
301 TBOX olbox;
302 C_OUTLINE_IT child_it; // search iterator
303
304 olbox = outline->bounding_box();
305 xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
306 xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
307 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
308 ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
309 for (yindex = ymin; yindex <= ymax; yindex++) {
310 for (xindex = xmin; xindex <= xmax; xindex++) {
311 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
312 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
313 child_it.forward()) {
314 if (*child_it.data() < *outline) {
315 it->add_after_then_move(child_it.extract());
316 }
317 }
318 }
319 }
320}
321
322
329void extract_edges(Pix* pix, // thresholded image
330 BLOCK *block) { // block to scan
331 C_OUTLINE_LIST outlines; // outlines in block
332 C_OUTLINE_IT out_it = &outlines;
333
334 block_edges(pix, &(block->pdblk), &out_it);
335 ICOORD bleft; // block box
336 ICOORD tright;
337 block->pdblk.bounding_box(bleft, tright);
338 // make blobs
339 outlines_to_blobs(block, bleft, tright, &outlines);
340}
341
342
349void outlines_to_blobs( // find blobs
350 BLOCK *block, // block to scan
351 ICOORD bleft,
352 ICOORD tright,
353 C_OUTLINE_LIST *outlines) {
354 // make buckets
355 OL_BUCKETS buckets(bleft, tright);
356
357 fill_buckets(outlines, &buckets);
358 empty_buckets(block, &buckets);
359}
360
361
368void fill_buckets( // find blobs
369 C_OUTLINE_LIST *outlines, // outlines in block
370 OL_BUCKETS *buckets // output buckets
371 ) {
372 TBOX ol_box; // outline box
373 C_OUTLINE_IT out_it = outlines; // iterator
374 C_OUTLINE_IT bucket_it; // iterator in bucket
375 C_OUTLINE *outline; // current outline
376
377 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
378 outline = out_it.extract(); // take off list
379 // get box
380 ol_box = outline->bounding_box();
381 bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
382 bucket_it.add_to_end(outline);
383 }
384}
385
386
393void empty_buckets( // find blobs
394 BLOCK *block, // block to scan
395 OL_BUCKETS *buckets // output buckets
396 ) {
397 bool good_blob; // healthy blob
398 C_OUTLINE_LIST outlines; // outlines in block
399 // iterator
400 C_OUTLINE_IT out_it = &outlines;
401 C_OUTLINE_IT bucket_it = buckets->start_scan();
402 C_OUTLINE_IT parent_it; // parent outline
403 C_BLOB_IT good_blobs = block->blob_list();
404 C_BLOB_IT junk_blobs = block->reject_blobs();
405
406 while (!bucket_it.empty()) {
407 out_it.set_to_list(&outlines);
408 do {
409 parent_it = bucket_it; // find outermost
410 do {
411 bucket_it.forward();
412 } while (!bucket_it.at_first() &&
413 !(*parent_it.data() < *bucket_it.data()));
414 } while (!bucket_it.at_first());
415
416 // move to new list
417 out_it.add_after_then_move(parent_it.extract());
418 good_blob = capture_children(buckets, &junk_blobs, &out_it);
419 C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
420 &junk_blobs);
421
422 bucket_it.set_to_list(buckets->scan_next());
423 }
424}
425
426
435bool capture_children( // find children
436 OL_BUCKETS* buckets, // bucket sort clanss
437 C_BLOB_IT* reject_it, // dead grandchildren
438 C_OUTLINE_IT* blob_it // output outlines
439) {
440 C_OUTLINE *outline; // master outline
441 int32_t child_count; // no of children
442
443 outline = blob_it->data();
444 if (edges_use_new_outline_complexity)
445 child_count = buckets->outline_complexity(outline,
446 edges_children_count_limit,
447 0);
448 else
449 child_count = buckets->count_children(outline,
450 edges_children_count_limit);
451 if (child_count > edges_children_count_limit)
452 return false;
453
454 if (child_count > 0)
455 buckets->extract_children(outline, blob_it);
456 return true;
457}
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
#define INT_VAR(name, val, comment)
Definition: params.h:303
#define double_VAR(name, val, comment)
Definition: params.h:312
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:329
void empty_buckets(BLOCK *block, OL_BUCKETS *buckets)
Definition: edgblob.cpp:393
bool capture_children(OL_BUCKETS *buckets, C_BLOB_IT *reject_it, C_OUTLINE_IT *blob_it)
Definition: edgblob.cpp:435
void fill_buckets(C_OUTLINE_LIST *outlines, OL_BUCKETS *buckets)
Definition: edgblob.cpp:368
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:349
#define BUCKETSIZE
Definition: edgblob.h:30
void block_edges(Pix *t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:37
const TBOX & bounding_box() const
Definition: coutln.h:113
int32_t outer_area() const
Definition: coutln.cpp:308
int32_t pathlength() const
Definition: coutln.h:135
Definition: ocrblock.h:31
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:128
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:131
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
integer coordinate
Definition: points.h:32
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:189
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:109
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:63
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:294
C_OUTLINE_LIST * operator()(int16_t x, int16_t y)
Definition: edgblob.cpp:82
C_OUTLINE_LIST * start_scan()
Definition: edgblob.h:45
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:178
C_OUTLINE_LIST * scan_next()
Definition: edgblob.h:51