tesseract 4.1.1
Loading...
Searching...
No Matches
tordmain.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: tordmain.cpp (Formerly textordp.c)
3 * Description: C++ top level textord code.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#define _USE_MATH_DEFINES // for M_PI
20#ifdef HAVE_CONFIG_H
21#include "config_auto.h"
22#endif
23
24#include "tordmain.h"
25#include <cfloat> // for FLT_MAX
26#include <cmath> // for ceil, floor, M_PI
27#include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t
28#include "allheaders.h" // for pixDestroy, pixGetHeight, boxCreate
29#include "arrayaccess.h" // for GET_DATA_BYTE
30#include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B...
31#include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction
32#include "clst.h" // for CLISTIZE
33#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE
34#include "drawtord.h" // for plot_box_list, to_win, create_to_win
35#include "edgblob.h" // for extract_edges
36#include "errcode.h" // for set_global_loc_code, ASSERT_HOST, LOC...
37#include "genericvector.h" // for PointerVector, GenericVector
38#include "makerow.h" // for textord_test_x, textord_test_y, texto...
39#include "morph.h" // for L_BOUNDARY_BG
40#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
41#include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base...
42#include "params.h" // for DoubleParam, BoolParam, IntParam
43#include "pdblock.h" // for PDBLK
44#include "points.h" // for FCOORD, ICOORD
45#include "polyblk.h" // for POLY_BLOCK
46#include "quadratc.h" // for QUAD_COEFFS
47#include "quspline.h" // for QSPLINE, tweak_row_baseline
48#include "rect.h" // for TBOX
49#include "scrollview.h" // for ScrollView, ScrollView::WHITE
50#include "statistc.h" // for STATS
51#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
52#include "textord.h" // for Textord, WordWithBox, WordGrid, WordS...
53#include "tprintf.h" // for tprintf
54#include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP
55
56struct Box;
57
58#define MAX_NEAREST_DIST 600 // In the visible code this is the upper bound of the blob-height histogram (size_stats) in filter_noise_blobs; the original note read "for block skew stats".
59
60namespace tesseract {
61
62CLISTIZE(WordWithBox)
63
64/**********************************************************************
65 * SetBlobStrokeWidth
66 *
67 * Set the horizontal and vertical stroke widths in the blob.
68 **********************************************************************/
69void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
70 // Cut the blob rectangle into a Pix.
71 int pix_height = pixGetHeight(pix);
72 const TBOX& box = blob->bounding_box();
73 int width = box.width();
74 int height = box.height();
75 Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
76 width, height);
77 Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr);
78 boxDestroy(&blob_pix_box);
79 Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
80 pixDestroy(&pix_blob);
81 // Compute the stroke widths.
82 uint32_t* data = pixGetData(dist_pix);
83 int wpl = pixGetWpl(dist_pix);
84 // Horizontal width of stroke.
85 STATS h_stats(0, width + 1);
86 for (int y = 0; y < height; ++y) {
87 uint32_t* pixels = data + y*wpl;
88 int prev_pixel = 0;
89 int pixel = GET_DATA_BYTE(pixels, 0);
90 for (int x = 1; x < width; ++x) {
91 int next_pixel = GET_DATA_BYTE(pixels, x);
92 // We are looking for a pixel that is equal to its vertical neighbours,
93 // yet greater than its left neighbour.
94 if (prev_pixel < pixel &&
95 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
96 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
97 if (pixel > next_pixel) {
98 // Single local max, so an odd width.
99 h_stats.add(pixel * 2 - 1, 1);
100 } else if (pixel == next_pixel && x + 1 < width &&
101 pixel > GET_DATA_BYTE(pixels, x + 1)) {
102 // Double local max, so an even width.
103 h_stats.add(pixel * 2, 1);
104 }
105 }
106 prev_pixel = pixel;
107 pixel = next_pixel;
108 }
109 }
110 // Vertical width of stroke.
111 STATS v_stats(0, height + 1);
112 for (int x = 0; x < width; ++x) {
113 int prev_pixel = 0;
114 int pixel = GET_DATA_BYTE(data, x);
115 for (int y = 1; y < height; ++y) {
116 uint32_t* pixels = data + y*wpl;
117 int next_pixel = GET_DATA_BYTE(pixels, x);
118 // We are looking for a pixel that is equal to its horizontal neighbours,
119 // yet greater than its upper neighbour.
120 if (prev_pixel < pixel &&
121 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
122 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
123 if (pixel > next_pixel) {
124 // Single local max, so an odd width.
125 v_stats.add(pixel * 2 - 1, 1);
126 } else if (pixel == next_pixel && y + 1 < height &&
127 pixel > GET_DATA_BYTE(pixels + wpl, x)) {
128 // Double local max, so an even width.
129 v_stats.add(pixel * 2, 1);
130 }
131 }
132 prev_pixel = pixel;
133 pixel = next_pixel;
134 }
135 }
136 pixDestroy(&dist_pix);
137 // Store the horizontal and vertical width in the blob, keeping both
138 // widths if there is enough information, otherwise only the one with
139 // the most samples.
140 // If there are insufficient samples, store zero, rather than using
141 // 2*area/perimeter, as the numbers that gives do not match the numbers
142 // from the distance method.
143 if (h_stats.get_total() >= (width + height) / 4) {
144 blob->set_horz_stroke_width(h_stats.ile(0.5f));
145 if (v_stats.get_total() >= (width + height) / 4)
146 blob->set_vert_stroke_width(v_stats.ile(0.5f));
147 else
148 blob->set_vert_stroke_width(0.0f);
149 } else {
150 if (v_stats.get_total() >= (width + height) / 4 ||
151 v_stats.get_total() > h_stats.get_total()) {
152 blob->set_horz_stroke_width(0.0f);
153 blob->set_vert_stroke_width(v_stats.ile(0.5f));
154 } else {
155 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
156 : 0.0f);
157 blob->set_vert_stroke_width(0.0f);
158 }
159 }
160}
161
162/**********************************************************************
163 * assign_blobs_to_blocks2
164 *
165 * Make a list of TO_BLOCKs for portrait and landscape orientation.
166 **********************************************************************/
167
// NOTE(review): listing line 168 (the start of this signature: return type,
// function name and first parameter) is missing from this extract. The body
// uses `pix`, which is not declared in the visible lines, so it is presumably
// that missing first parameter - confirm against the real source.
//
// For every BLOCK in `blocks` this creates a TO_BLOCK, takes each C_BLOB out
// of the block's blob_list() and reject_blobs() lists (via extract()), wraps
// it in a new BLOBNBOX whose stroke widths are set by SetBlobStrokeWidth, and
// appends it to the TO_BLOCK's `blobs` / `noise_blobs` list respectively.
// The TO_BLOCKs are appended to `port_blocks` in iteration order.
169 BLOCK_LIST *blocks, // blocks to process
170 TO_BLOCK_LIST *port_blocks) { // output list
171 BLOCK *block; // current block
172 BLOBNBOX *newblob; // created blob
173 C_BLOB *blob; // current blob
174 BLOCK_IT block_it = blocks;
175 C_BLOB_IT blob_it; // iterator over the source C_BLOB list
176 BLOBNBOX_IT port_box_it; // iterator over the destination BLOBNBOX list
177 // destination iterator
178 TO_BLOCK_IT port_block_it = port_blocks;
179 TO_BLOCK *port_block; // created block
180
181 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
182 block = block_it.data();
183 port_block = new TO_BLOCK(block);
184
185 // Move the block's good blobs (block->blob_list()) into port_block->blobs.
186 port_box_it.set_to_list(&port_block->blobs);
187 blob_it.set_to_list(block->blob_list());
188 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
189 blob = blob_it.extract();
190 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
191 SetBlobStrokeWidth(pix, newblob);
192 port_box_it.add_after_then_move(newblob);
193 }
194
195 // Put the rejected outlines in port_block->noise_blobs, which allows them to
196 // be reconsidered and sorted back into rows and recover outlines mistakenly
197 // rejected.
198 port_box_it.set_to_list(&port_block->noise_blobs);
199 blob_it.set_to_list(block->reject_blobs());
200 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
201 blob = blob_it.extract();
202 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
203 SetBlobStrokeWidth(pix, newblob);
204 port_box_it.add_after_then_move(newblob);
205 }
206
207 port_block_it.add_after_then_move(port_block);
208 }
209}
210
211/**********************************************************************
212 * find_components
213 *
214 * Find the C_OUTLINEs of the connected components in each block, put them
215 * in C_BLOBs, and filter them by size, putting the different size
216 * grades on different lists in the matching TO_BLOCK in to_blocks.
217 **********************************************************************/
218
// Runs edge extraction on every block that is text (or has no poly_block),
// converts the resulting blobs into TO_BLOCKs in `to_blocks`, and then
// size-filters them with filter_blobs.
// Returns early, leaving `to_blocks` untouched, if either image dimension
// exceeds INT16_MAX.
219void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
220 TO_BLOCK_LIST *to_blocks) {
221 int width = pixGetWidth(pix);
222 int height = pixGetHeight(pix);
223 if (width > INT16_MAX || height > INT16_MAX) {
224 tprintf("Input image too large! (%d, %d)\n", width, height);
225 return; // Can't handle it.
226 }
227
// NOTE(review): listing line 228 is missing from this extract.
229
230 BLOCK_IT block_it(blocks); // iterator
231 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
232 block_it.forward()) {
233 BLOCK* block = block_it.data();
// Blocks with a poly_block that is not text are skipped here.
234 if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {
235 extract_edges(pix, block);
236 }
237 }
238
239 assign_blobs_to_blocks2(pix, blocks, to_blocks);
// The page's top-right corner is (width, height) of the image.
240 ICOORD page_tr(width, height);
241 filter_blobs(page_tr, to_blocks, !textord_test_landscape);
242}
243
244/**********************************************************************
245 * filter_blobs
246 *
247 * Sort the blobs into sizes in all the blocks for later work.
248 **********************************************************************/
249
// For each TO_BLOCK in `blocks`, calls filter_noise_blobs to redistribute the
// block's blobs between its blobs / noise_blobs / small_blobs / large_blobs
// lists, and stores the returned size estimate as the block's line_size
// (forced to 1 when the estimate is 0).
// When graphics are enabled and `testing_on` is set, the debug window is
// created on demand using `page_tr`.
// NOTE(review): listing lines 270-275, 281 and 286-289 are missing from this
// extract (the rest of the line_spacing expression and the bodies of the two
// debug branches), so they are not described here.
250void Textord::filter_blobs(ICOORD page_tr, // top right
251 TO_BLOCK_LIST* blocks, // output list
252 bool testing_on) { // for plotting
253 TO_BLOCK_IT block_it = blocks; // destination iterator
254 TO_BLOCK *block; // created block
255
256 #ifndef GRAPHICS_DISABLED
// Clear any existing debug window before processing.
257 if (to_win != nullptr)
258 to_win->Clear();
259 #endif // GRAPHICS_DISABLED
260
261 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
262 block_it.forward()) {
263 block = block_it.data();
264 block->line_size = filter_noise_blobs(&block->blobs,
265 &block->noise_blobs,
266 &block->small_blobs,
267 &block->large_blobs);
// Avoid a zero line size.
268 if (block->line_size == 0) block->line_size = 1;
269 block->line_spacing = block->line_size *
// NOTE(review): the remainder of this expression (listing lines 270-275) is missing.
276
277 #ifndef GRAPHICS_DISABLED
278 if (textord_show_blobs && testing_on) {
279 if (to_win == nullptr)
280 create_to_win(page_tr);
282 }
283 if (textord_show_boxes && testing_on) {
284 if (to_win == nullptr)
285 create_to_win(page_tr);
290 }
291 #endif // GRAPHICS_DISABLED
292 }
293}
294
295/**********************************************************************
296 * filter_noise_blobs
297 *
298 * Move small blobs to a separate list.
299 **********************************************************************/
300
// Redistributes the blobs of `src_list` between the four lists by size and
// returns a size estimate (initial_x, raised to max_height if that is larger).
//
// Visible steps:
//  1. A first pass moves some blobs from src to noise or small.
//     NOTE(review): the conditions (listing lines 323, 326) are missing here.
//  2. initial_x = the textord_initialx_ile fractile of the remaining heights;
//     min_y = floor(initial_x / 2), max_x = ceil(initial_x * textord_width_limit).
//     NOTE(review): the max_y expression (listing lines 334-337) is truncated.
//  3. Small blobs taller than max_y move to large; those with height >= min_y
//     move back to src.
//  4. Src blobs shorter than min_y move to small; those taller than max_y or
//     wider than max_x move to large; the remaining heights are histogrammed.
//  5. max_height = the textord_initialasc_ile fractile of that histogram.
//     NOTE(review): listing line 363 (just before the final comparison) is
//     missing, so the exact final adjustment cannot be confirmed from here.
301float Textord::filter_noise_blobs(
302 BLOBNBOX_LIST *src_list, // original list
303 BLOBNBOX_LIST *noise_list, // noise list
304 BLOBNBOX_LIST *small_list, // small blobs
305 BLOBNBOX_LIST *large_list) { // large blobs
306 int16_t height; // height of blob
307 int16_t width; // width of blob
308 BLOBNBOX *blob; // current blob
309 float initial_x; // first guess
310 BLOBNBOX_IT src_it = src_list; // iterators
311 BLOBNBOX_IT noise_it = noise_list;
312 BLOBNBOX_IT small_it = small_list;
313 BLOBNBOX_IT large_it = large_list;
314 STATS size_stats (0, MAX_NEAREST_DIST);
315 // blob heights
316 float min_y; // size limits
317 float max_y;
318 float max_x;
319 float max_height; // of good blobs
320
321 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
322 blob = src_it.data();
324 noise_it.add_after_then_move(src_it.extract());
325 else if (blob->enclosed_area() >= blob->bounding_box().height()
327 small_it.add_after_then_move(src_it.extract());
328 }
// Histogram the heights of the blobs still in src.
329 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
330 size_stats.add(src_it.data()->bounding_box().height(), 1);
331 }
332 initial_x = size_stats.ile(textord_initialx_ile);
333 max_y = ceil(initial_x *
338 min_y = floor (initial_x / 2);
339 max_x = ceil (initial_x * textord_width_limit);
// Re-examine the small list against the new limits.
340 small_it.move_to_first ();
341 for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
342 small_it.forward ()) {
343 height = small_it.data()->bounding_box().height();
344 if (height > max_y)
345 large_it.add_after_then_move(small_it.extract ());
346 else if (height >= min_y)
347 src_it.add_after_then_move(small_it.extract ());
348 }
// Final pass over src: only blobs within the limits stay and are histogrammed.
349 size_stats.clear ();
350 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
351 height = src_it.data ()->bounding_box ().height ();
352 width = src_it.data ()->bounding_box ().width ();
353 if (height < min_y)
354 small_it.add_after_then_move (src_it.extract ());
355 else if (height > max_y || width > max_x)
356 large_it.add_after_then_move (src_it.extract ());
357 else
358 size_stats.add (height, 1);
359 }
360 max_height = size_stats.ile (textord_initialasc_ile);
361 // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
362 // max_y,min_y,initial_x,max_height);
364 if (max_height > initial_x)
365 initial_x = max_height;
366 // tprintf(" ret=%g\n",initial_x);
367 return initial_x;
368}
369
370// Fixes the block so it obeys all the rules:
371// Must have at least one ROW.
372// Must have at least one WERD.
373// WERDs contain a fake blob.
374void Textord::cleanup_nontext_block(BLOCK* block) {
375 // Non-text blocks must contain at least one row.
376 ROW_IT row_it(block->row_list());
377 if (row_it.empty()) {
378 const TBOX& box = block->pdblk.bounding_box();
379 float height = box.height();
380 int32_t xstarts[2] = {box.left(), box.right()};
381 double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
382 ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
383 height / 4.0f, 0, 1);
384 row_it.add_after_then_move(row);
385 }
386 // Each row must contain at least one word.
387 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
388 ROW* row = row_it.data();
389 WERD_IT w_it(row->word_list());
390 if (w_it.empty()) {
391 // Make a fake blob to put in the word.
392 TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
393 : row->bounding_box();
394 C_BLOB* blob = C_BLOB::FakeBlob(box);
395 C_BLOB_LIST blobs;
396 C_BLOB_IT blob_it(&blobs);
397 blob_it.add_after_then_move(blob);
398 WERD* word = new WERD(&blobs, 0, nullptr);
399 w_it.add_after_then_move(word);
400 }
401 // Each word must contain a fake blob.
402 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
403 WERD* word = w_it.data();
404 // Just assert that this is true, as it would be useful to find
405 // out why it isn't.
406 ASSERT_HOST(!word->cblob_list()->empty());
407 }
408 row->recalc_bounding_box();
409 }
410}
411
412/**********************************************************************
413 * cleanup_blocks
414 *
415 * Delete empty blocks, rows from the page.
416 **********************************************************************/
417
// Walks all blocks: non-text blocks are repaired with cleanup_nontext_block;
// text blocks optionally (clean_noise) have noise removed from their rows, and
// rows / blocks that end up empty are deleted.
// NOTE(review): listing lines 446, 448-450, 461 and 464 are missing from this
// extract. The two tprintf calls are presumably guarded by a debug flag on the
// missing lines 461/464 (note the unmatched closing brace after the first) -
// confirm against the real source.
418void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
419 BLOCK_IT block_it = blocks; // iterator
420 ROW_IT row_it; // row iterator
421
// num_rows / num_rows_all are reset per text block; the block counters
// accumulate over all text blocks.
422 int num_rows = 0;
423 int num_rows_all = 0;
424 int num_blocks = 0;
425 int num_blocks_all = 0;
426 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
427 block_it.forward()) {
428 BLOCK* block = block_it.data();
// Non-text blocks are fixed up and excluded from the counts below.
429 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
430 cleanup_nontext_block(block);
431 continue;
432 }
433 num_rows = 0;
434 num_rows_all = 0;
435 if (clean_noise) {
436 row_it.set_to_list(block->row_list());
437 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
438 ROW* row = row_it.data();
439 ++num_rows_all;
440 clean_small_noise_from_words(row);
// Delete the row if it is judged to be noise (only checked when
// textord_noise_rejrows is set and the row has words) or if it has no words.
441 if ((textord_noise_rejrows && !row->word_list()->empty() &&
442 clean_noise_from_row(row)) ||
443 row->word_list()->empty()) {
444 delete row_it.extract(); // lose empty row.
445 } else {
447 clean_noise_from_words(row_it.data());
451 ++num_rows;
452 }
453 }
454 }
455 if (block->row_list()->empty()) {
456 delete block_it.extract(); // Lose empty text blocks.
457 } else {
458 ++num_blocks;
459 }
460 ++num_blocks_all;
462 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
463 }
465 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
466}
467
468
469/**********************************************************************
470 * clean_noise_from_row
471 *
472 * Examine the blobs of the words in a row and return true if the row
 * looks like garbage (noise).
473 **********************************************************************/
474
// Classifies a row as noise or not from counts gathered over its blobs:
//   dot_count        - outlines smaller than textord_noise_sizelimit * x-height,
//                      plus 2 for each very tall blob that is not the first
//                      blob of the first word;
//   norm_count       - mid-sized blobs whose transition count is below
//                      textord_noise_translimit;
//   super_norm_count - outlines with children whose width and height are
//                      close to the x-height, plus one per blob of a word
//                      flagged W_DONT_CHOP.
// Returns true when super_norm_count < textord_noise_sncount and
// dot_count > norm_count * textord_noise_rowratio and dot_count > 2.
// NOTE(review): listing lines 495 and 557 are missing from this extract; 495
// is part of the testing_on expression and 557 presumably opens the block
// that holds the two summary tprintf calls - confirm against the real source.
475bool Textord::clean_noise_from_row( // returns true if the row looks like noise
476 ROW* row // row to examine
477) {
478 bool testing_on;
479 TBOX blob_box; // bounding box
480 C_BLOB *blob; // current blob
481 C_OUTLINE *outline; // current outline
482 WERD *word; // current word
483 int32_t blob_size; // biggest size
484 int32_t trans_count = 0; // no of transitions
485 int32_t trans_threshold; // noise tolerance
486 int32_t dot_count; // small objects
487 int32_t norm_count; // normal objects
488 int32_t super_norm_count; // real char-like
489 // words of row
490 WERD_IT word_it = row->word_list ();
491 C_BLOB_IT blob_it; // blob iterator
492 C_OUTLINE_IT out_it; // outline iterator
493
// Per-blob debug output is enabled when the test point lies between the
// row's baseline and baseline + x-height (one condition is missing, see above).
494 testing_on = textord_test_y > row->base_line (textord_test_x)
496 && textord_test_y < row->base_line (textord_test_x) + row->x_height ();
497 dot_count = 0;
498 norm_count = 0;
499 super_norm_count = 0;
500 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
501 word = word_it.data (); // current word
502 // blobs in word
503 blob_it.set_to_list (word->cblob_list ());
504 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
505 blob_it.forward ()) {
506 blob = blob_it.data ();
507 if (!word->flag (W_DONT_CHOP)) {
508 // get outlines
509 out_it.set_to_list (blob->out_list ());
510 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
511 out_it.forward ()) {
512 outline = out_it.data ();
513 blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
514 blob_size =
515 blob_box.width () >
516 blob_box.height ()? blob_box.width () : blob_box.
517 height();
518 if (blob_size < textord_noise_sizelimit * row->x_height ())
519 dot_count++; // count small outlines
520 if (!outline->child ()->empty ()
521 && blob_box.height () <
522 (1 + textord_noise_syfract) * row->x_height ()
523 && blob_box.height () >
524 (1 - textord_noise_syfract) * row->x_height ()
525 && blob_box.width () <
526 (1 + textord_noise_sxfract) * row->x_height ()
527 && blob_box.width () >
528 (1 - textord_noise_sxfract) * row->x_height ())
529 super_norm_count++; // count x-height-sized outlines that have children
530 }
531 }
532 else
533 super_norm_count++;
// Now judge the blob as a whole by its larger dimension.
534 blob_box = blob->bounding_box ();
535 blob_size =
536 blob_box.width () >
537 blob_box.height ()? blob_box.width () : blob_box.height ();
538 if (blob_size >= textord_noise_sizelimit * row->x_height ()
539 && blob_size < row->x_height () * 2) {
540 trans_threshold = blob_size / textord_noise_sizefraction;
541 trans_count = blob->count_transitions (trans_threshold);
542 if (trans_count < textord_noise_translimit)
543 norm_count++;
544 }
545 else if (blob_box.height () > row->x_height () * 2
546 && (!word_it.at_first () || !blob_it.at_first ()))
547 dot_count += 2;
548 if (testing_on) {
549 tprintf
550 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
551 blob_box.left (), blob_box.bottom (), blob_box.right (),
552 blob_box.top (), blob->out_list ()->length (), trans_count,
553 blob_box.bottom () - row->base_line (blob_box.left ()));
554 }
555 }
556 }
558 tprintf ("Row ending at (%d,%g):",
559 blob_box.right (), row->base_line (blob_box.right ()));
560 tprintf (" R=%g, dc=%d, nc=%d, %s\n",
561 norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999,
562 dot_count, norm_count,
563 dot_count > norm_count * textord_noise_normratio
564 && dot_count > 2 ? "REJECTED" : "ACCEPTED");
565 }
566 return super_norm_count < textord_noise_sncount
567 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
568}
569
570/**********************************************************************
571 * clean_noise_from_words
572 *
573 * Move blobs of words from rows of garbage into the reject blobs list.
574 **********************************************************************/
575
// Grades every word of the row as 0 (ok), 1 (suspect) or 2 (dud) from its
// dot_count / norm_count, then revisits the words graded 2, and the words
// graded 1 when duds outnumber the rest.
// Does nothing if the row has no words or textord_no_rejects is set.
// Words flagged W_REP_CHAR are always graded 0.
// NOTE(review): listing line 680, which holds the action taken on each
// selected word in the second loop, is missing from this extract; per the
// surviving comment it throws small blobs into the reject list - confirm.
576void Textord::clean_noise_from_words( // clean noisy words of a row
577 ROW *row // row to clean
578 ) {
579 TBOX blob_box; // bounding box
580 C_BLOB *blob; // current blob
581 C_OUTLINE *outline; // current outline
582 WERD *word; // current word
583 int32_t blob_size; // biggest size
584 int32_t trans_count; // no of transitions
585 int32_t trans_threshold; // noise tolerance
586 int32_t dot_count; // small objects
587 int32_t norm_count; // normal objects
588 int32_t dud_words; // number discarded
589 int32_t ok_words; // number remaining
590 int32_t word_index; // current word
591 // words of row
592 WERD_IT word_it = row->word_list ();
593 C_BLOB_IT blob_it; // blob iterator
594 C_OUTLINE_IT out_it; // outline iterator
595
// ok_words temporarily holds the word count, used to size word_dud.
596 ok_words = word_it.length ();
597 if (ok_words == 0 || textord_no_rejects)
598 return;
599 // per-word grade: 0 = ok, 1 = suspect, 2 = dud
600 std::vector<int8_t> word_dud(ok_words);
601 dud_words = 0;
602 ok_words = 0;
603 word_index = 0;
604 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
605 word = word_it.data (); // current word
606 dot_count = 0;
607 norm_count = 0;
608 // blobs in word
609 blob_it.set_to_list (word->cblob_list ());
610 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
611 blob_it.forward ()) {
612 blob = blob_it.data ();
613 if (!word->flag (W_DONT_CHOP)) {
614 // get outlines
615 out_it.set_to_list (blob->out_list ());
616 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
617 out_it.forward ()) {
618 outline = out_it.data ();
619 blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
620 blob_size =
621 blob_box.width () >
622 blob_box.height ()? blob_box.width () : blob_box.
623 height();
624 if (blob_size < textord_noise_sizelimit * row->x_height ())
625 dot_count++; // count small outlines
626 if (!outline->child ()->empty ()
627 && blob_box.height () <
628 (1 + textord_noise_syfract) * row->x_height ()
629 && blob_box.height () >
630 (1 - textord_noise_syfract) * row->x_height ()
631 && blob_box.width () <
632 (1 + textord_noise_sxfract) * row->x_height ()
633 && blob_box.width () >
634 (1 - textord_noise_sxfract) * row->x_height ())
635 norm_count++; // count x-height-sized outlines that have children
636 }
637 }
638 else
639 norm_count++;
// Now judge the blob as a whole by its larger dimension.
640 blob_box = blob->bounding_box ();
641 blob_size =
642 blob_box.width () >
643 blob_box.height ()? blob_box.width () : blob_box.height ();
644 if (blob_size >= textord_noise_sizelimit * row->x_height ()
645 && blob_size < row->x_height () * 2) {
646 trans_threshold = blob_size / textord_noise_sizefraction;
647 trans_count = blob->count_transitions (trans_threshold);
648 if (trans_count < textord_noise_translimit)
649 norm_count++;
650 }
651 else if (blob_box.height () > row->x_height () * 2
652 && (!word_it.at_first () || !blob_it.at_first ()))
653 dot_count += 2;
654 }
// Grade the word: more than 2 dots and a dot/normal ratio above
// textord_noise_normratio is suspect (1); above twice that is a dud (2).
655 if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
656 if (dot_count > norm_count * textord_noise_normratio * 2)
657 word_dud[word_index] = 2;
658 else if (dot_count > norm_count * textord_noise_normratio)
659 word_dud[word_index] = 1;
660 else
661 word_dud[word_index] = 0;
662 } else {
663 word_dud[word_index] = 0;
664 }
665 if (word_dud[word_index] == 2)
666 dud_words++;
667 else
668 ok_words++;
669 word_index++;
670 }
671
// Second pass: act on duds, and on suspects when duds are in the majority.
672 word_index = 0;
673 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
674 if (word_dud[word_index] == 2
675 || (word_dud[word_index] == 1 && dud_words > ok_words)) {
676 word = word_it.data(); // Current word.
677 // Previously we threw away the entire word.
678 // Now just aggressively throw all small blobs into the reject list, where
679 // the classifier can decide whether they are actually needed.
681 }
682 word_index++;
683 }
684}
685
686// Remove outlines that are a tiny fraction in either width or height
687// of the word height.
688void Textord::clean_small_noise_from_words(ROW *row) {
689 WERD_IT word_it(row->word_list());
690 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
691 WERD* word = word_it.data();
692 int min_size = static_cast<int>(
693 textord_noise_hfract * word->bounding_box().height() + 0.5);
694 C_BLOB_IT blob_it(word->cblob_list());
695 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
696 C_BLOB* blob = blob_it.data();
697 C_OUTLINE_IT out_it(blob->out_list());
698 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
699 C_OUTLINE* outline = out_it.data();
700 outline->RemoveSmallRecursive(min_size, &out_it);
701 }
702 if (blob->out_list()->empty()) {
703 delete blob_it.extract();
704 }
705 }
706 if (word->cblob_list()->empty()) {
707 if (!word_it.at_last()) {
708 // The next word is no longer a fuzzy non space if it was before,
709 // since the word before is about to be deleted.
710 WERD* next_word = word_it.data_relative(1);
711 if (next_word->flag(W_FUZZY_NON)) {
712 next_word->set_flag(W_FUZZY_NON, false);
713 }
714 }
715 delete word_it.extract();
716 }
717 }
718}
719
720// Local struct to hold a group of blocks.
// NOTE(review): the struct header (listing line 721) and the declarations of
// the members bounding_box, rotation, min_xheight and blocks (listing lines
// 731, 733, 737, 739) are missing from this extract; only their comments and
// the `angle` member survive. Member names are taken from the initialiser
// lists below.
// Default group: rotation (1, 0), angle 0, min_xheight 1, no blocks.
722 BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
// Starts a group from a single block: takes the block's bounding box,
// re_rotation (and its angle) and x_height, and records the block pointer.
723 explicit BlockGroup(BLOCK* block)
724 : bounding_box(block->pdblk.bounding_box()),
725 rotation(block->re_rotation()),
726 angle(block->re_rotation().angle()),
727 min_xheight(block->x_height()) {
728 blocks.push_back(block);
729 }
730 // Union of block bounding boxes.
732 // Common rotation of the blocks.
734 // Angle of rotation.
735 float angle;
736 // Min xheight of the blocks.
738 // Collection of borrowed pointers to the blocks in the group.
740};
741
742// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
743// TransferDiacriticsToWords to copy the diacritic blobs to the most
744// appropriate words in the group of blocks. Source blobs are not touched.
// Parameters:
//   diacritic_blobs - blobs to be offered to the words; passed through
//                     unchanged to TransferDiacriticsToWords for each group.
//   blocks          - page blocks; those with a poly_block that is not text
//                     are ignored.
// NOTE(review): listing line 750, which presumably declares `groups` (used
// below with size(), operator[] and push_back of `new BlockGroup`), is missing
// from this extract - confirm its type and ownership against the real source.
745void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
746 BLOCK_LIST* blocks) {
747 // Angle difference larger than this is too much to consider equal.
748 // They should only be in multiples of M_PI/2 anyway.
749 const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
751 BLOCK_IT bk_it(blocks);
752 for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
753 BLOCK* block = bk_it.data();
754 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
755 continue;
756 }
757 // Linear search of the groups to find a matching rotation.
758 float block_angle = block->re_rotation().angle();
759 int best_g = 0;
760 float best_angle_diff = FLT_MAX;
761 for (int g = 0; g < groups.size(); ++g) {
762 double angle_diff = fabs(block_angle - groups[g]->angle);
// Differences above pi are measured the other way round the circle.
763 if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
764 if (angle_diff < best_angle_diff) {
765 best_angle_diff = angle_diff;
766 best_g = g;
767 }
768 }
// No group is close enough (or there are none yet): start a new group.
// Otherwise merge the block into the closest group.
769 if (best_angle_diff > kMaxAngleDiff) {
770 groups.push_back(new BlockGroup(block));
771 } else {
772 groups[best_g]->blocks.push_back(block);
773 groups[best_g]->bounding_box += block->pdblk.bounding_box();
774 float x_height = block->x_height();
775 if (x_height < groups[best_g]->min_xheight)
776 groups[best_g]->min_xheight = x_height;
777 }
778 }
779 // Now process each group of blocks.
// word_ptrs is declared outside the loop, so the WordWithBox objects created
// for every group stay alive until this function returns.
780 PointerVector<WordWithBox> word_ptrs;
781 for (int g = 0; g < groups.size(); ++g) {
782 const BlockGroup* group = groups[g];
783 if (group->bounding_box.null_box()) continue;
// The grid covers the group's bounding box and is built from min_xheight.
784 WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
785 group->bounding_box.topright());
786 for (int b = 0; b < group->blocks.size(); ++b) {
787 ROW_IT row_it(group->blocks[b]->row_list());
788 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
789 ROW* row = row_it.data();
790 // Put the words of the row into the grid.
791 WERD_IT w_it(row->word_list());
792 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
793 WERD* word = w_it.data();
794 auto* box_word = new WordWithBox(word);
795 word_grid.InsertBBox(true, true, box_word);
796 // Save the pointer where it will be auto-deleted.
797 word_ptrs.push_back(box_word);
798 }
799 }
800 }
801 FCOORD rotation = group->rotation;
802 // Make it a forward rotation that will transform blob coords to block.
803 rotation.set_y(-rotation.y());
804 TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
805 }
806}
807
808// Places a copy of blobs that are near a word (after applying rotation to the
809// blob) in the most appropriate word, unless there is doubt, in which case a
810// blob can end up in two words. Source blobs are not touched.
811void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
812 const FCOORD& rotation,
813 WordGrid* word_grid) {
814 WordSearch ws(word_grid);
815 BLOBNBOX_IT b_it(diacritic_blobs);
816 // Apply rotation to each blob before finding the nearest words. The rotation
817 // allows us to only consider above/below placement and not left/right on
818 // vertical text, because all text is horizontal here.
819 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
820 BLOBNBOX* blobnbox = b_it.data();
821 TBOX blob_box = blobnbox->bounding_box();
822 blob_box.rotate(rotation);
823 ws.StartRectSearch(blob_box);
824 // Above/below refer to word position relative to diacritic. Since some
825 // scripts eg Kannada/Telugu habitually put diacritics below words, and
826 // others eg Thai/Vietnamese/Latin put most diacritics above words, try
827 // for both if there isn't much in it.
828 WordWithBox* best_above_word = nullptr;
829 WordWithBox* best_below_word = nullptr;
830 int best_above_distance = 0;
831 int best_below_distance = 0;
832 for (WordWithBox* word = ws.NextRectSearch(); word != nullptr;
833 word = ws.NextRectSearch()) {
834 if (word->word()->flag(W_REP_CHAR)) continue;
835 TBOX word_box = word->true_bounding_box();
836 int x_distance = blob_box.x_gap(word_box);
837 int y_distance = blob_box.y_gap(word_box);
838 if (x_distance > 0) {
839 // Arbitrarily divide x-distance by 2 if there is a major y overlap,
840 // and the word is to the left of the diacritic. If the
841 // diacritic is a dropped broken character between two words, this will
842 // help send all the pieces to a single word, instead of splitting them
843 // over the 2 words.
844 if (word_box.major_y_overlap(blob_box) &&
845 blob_box.left() > word_box.right()) {
846 x_distance /= 2;
847 }
848 y_distance += x_distance;
849 }
850 if (word_box.y_middle() > blob_box.y_middle() &&
851 (best_above_word == nullptr || y_distance < best_above_distance)) {
852 best_above_word = word;
853 best_above_distance = y_distance;
854 }
855 if (word_box.y_middle() <= blob_box.y_middle() &&
856 (best_below_word == nullptr || y_distance < best_below_distance)) {
857 best_below_word = word;
858 best_below_distance = y_distance;
859 }
860 }
861 bool above_good =
862 best_above_word != nullptr &&
863 (best_below_word == nullptr ||
864 best_above_distance < best_below_distance + blob_box.height());
865 bool below_good =
866 best_below_word != nullptr && best_below_word != best_above_word &&
867 (best_above_word == nullptr ||
868 best_below_distance < best_above_distance + blob_box.height());
869 if (below_good) {
870 C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
871 copied_blob->rotate(rotation);
872 // Put the blob into the word's reject blobs list.
873 C_BLOB_IT blob_it(best_below_word->RejBlobs());
874 blob_it.add_to_end(copied_blob);
875 }
876 if (above_good) {
877 C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
878 copied_blob->rotate(rotation);
879 // Put the blob into the word's reject blobs list.
880 C_BLOB_IT blob_it(best_above_word->RejBlobs());
881 blob_it.add_to_end(copied_blob);
882 }
883 }
884}
885
886} // tesseract
887
888/**********************************************************************
889 * tweak_row_baseline
890 *
891 * Shift baseline to fit the blobs more accurately where they are
892 * close enough.
893 **********************************************************************/
894
896 double blshift_maxshift,
897 double blshift_xfraction) {
898 TBOX blob_box; //bounding box
899 C_BLOB *blob; //current blob
900 WERD *word; //current word
901 int32_t blob_count; //no of blobs
902 int32_t src_index; //source segment
903 int32_t dest_index; //destination segment
904 float ydiff; //baseline error
905 float x_centre; //centre of blob
906 //words of row
907 WERD_IT word_it = row->word_list ();
908 C_BLOB_IT blob_it; //blob iterator
909
910 blob_count = 0;
911 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
912 word = word_it.data (); //current word
913 //get total blobs
914 blob_count += word->cblob_list ()->length ();
915 }
916 if (blob_count == 0)
917 return;
918 // spline segments
919 std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1);
920 // spline coeffs
921 std::vector<double> coeffs((blob_count + row->baseline.segments) * 3);
922
923 src_index = 0;
924 dest_index = 0;
925 xstarts[0] = row->baseline.xcoords[0];
926 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
927 word = word_it.data (); //current word
928 //blobs in word
929 blob_it.set_to_list (word->cblob_list ());
930 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
931 blob_it.forward ()) {
932 blob = blob_it.data ();
933 blob_box = blob->bounding_box ();
934 x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
935 ydiff = blob_box.bottom () - row->base_line (x_centre);
936 if (ydiff < 0)
937 ydiff = -ydiff / row->x_height ();
938 else
939 ydiff = ydiff / row->x_height ();
940 if (ydiff < blshift_maxshift
941 && blob_box.height () / row->x_height () > blshift_xfraction) {
942 if (xstarts[dest_index] >= x_centre)
943 xstarts[dest_index] = blob_box.left ();
944 coeffs[dest_index * 3] = 0;
945 coeffs[dest_index * 3 + 1] = 0;
946 coeffs[dest_index * 3 + 2] = blob_box.bottom ();
947 //shift it
948 dest_index++;
949 xstarts[dest_index] = blob_box.right () + 1;
950 }
951 else {
952 if (xstarts[dest_index] <= x_centre) {
953 while (row->baseline.xcoords[src_index + 1] <= x_centre
954 && src_index < row->baseline.segments - 1) {
955 if (row->baseline.xcoords[src_index + 1] >
956 xstarts[dest_index]) {
957 coeffs[dest_index * 3] =
958 row->baseline.quadratics[src_index].a;
959 coeffs[dest_index * 3 + 1] =
960 row->baseline.quadratics[src_index].b;
961 coeffs[dest_index * 3 + 2] =
962 row->baseline.quadratics[src_index].c;
963 dest_index++;
964 xstarts[dest_index] =
965 row->baseline.xcoords[src_index + 1];
966 }
967 src_index++;
968 }
969 coeffs[dest_index * 3] =
970 row->baseline.quadratics[src_index].a;
971 coeffs[dest_index * 3 + 1] =
972 row->baseline.quadratics[src_index].b;
973 coeffs[dest_index * 3 + 2] =
974 row->baseline.quadratics[src_index].c;
975 dest_index++;
976 xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
977 }
978 }
979 }
980 }
981 while (src_index < row->baseline.segments
982 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
983 src_index++;
984 while (src_index < row->baseline.segments) {
985 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
986 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
987 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
988 dest_index++;
989 src_index++;
990 xstarts[dest_index] = row->baseline.xcoords[src_index];
991 }
992 //turn to spline
993 row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]);
994}
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_REP_CHAR
repeated character
Definition: werd.h:38
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:37
#define CLISTIZE(CLASSNAME)
Definition: clst.h:891
#define LOC_EDGE_PROG
Definition: errcode.h:43
#define ASSERT_HOST(x)
Definition: errcode.h:88
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ baseline
Definition: mfoutline.h:63
ScrollView * to_win
Definition: drawtord.cpp:35
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:67
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:44
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:329
int textord_test_x
Definition: makerow.cpp:60
bool textord_test_landscape
Definition: makerow.cpp:48
double textord_excess_blobsize
Definition: makerow.cpp:83
double textord_width_limit
Definition: makerow.cpp:75
int textord_test_y
Definition: makerow.cpp:61
double textord_min_linesize
Definition: makerow.cpp:81
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:58
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:895
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:65
void SetBlobStrokeWidth(Pix *pix, BLOBNBOX *blob)
Definition: tordmain.cpp:69
void assign_blobs_to_blocks2(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:168
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:66
int push_back(T object)
int size() const
Definition: genericvector.h:72
const TBOX & bounding_box() const
Definition: blobbox.h:230
C_BLOB * cblob() const
Definition: blobbox.h:268
int32_t enclosed_area() const
Definition: blobbox.h:253
BLOBNBOX_LIST blobs
Definition: blobbox.h:772
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:774
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:776
float line_size
Definition: blobbox.h:785
float max_blob_size
Definition: blobbox.h:786
float line_spacing
Definition: blobbox.h:779
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:775
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1071
static const double kXHeightCapRatio
Definition: ccstruct.h:37
static const double kXHeightFraction
Definition: ccstruct.h:34
static const double kDescenderFraction
Definition: ccstruct.h:33
static const double kAscenderFraction
Definition: ccstruct.h:35
C_OUTLINE_LIST * child()
Definition: coutln.h:108
const TBOX & bounding_box() const
Definition: coutln.h:113
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it)
Definition: coutln.cpp:627
Definition: ocrblock.h:31
FCOORD re_rotation() const
Definition: ocrblock.h:134
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:116
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:128
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
int32_t x_height() const
return xheight
Definition: ocrblock.h:106
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:131
Definition: ocrrow.h:37
WERD_LIST * word_list()
Definition: ocrrow.h:55
float base_line(float xpos) const
Definition: ocrrow.h:59
TBOX bounding_box() const
Definition: ocrrow.h:88
float x_height() const
Definition: ocrrow.h:64
void recalc_bounding_box()
Definition: ocrrow.cpp:100
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
integer coordinate
Definition: points.h:32
Definition: points.h:189
float angle() const
find angle
Definition: points.h:247
float y() const
Definition: points.h:210
void set_y(float yin)
rewrite function
Definition: points.h:218
bool IsText() const
Definition: polyblk.h:49
float b
Definition: quadratc.h:58
float c
Definition: quadratc.h:59
double a
Definition: quadratc.h:57
Definition: rect.h:34
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
int16_t left() const
Definition: rect.h:72
int y_gap(const TBOX &box) const
Definition: rect.h:233
int y_middle() const
Definition: rect.h:88
int16_t bottom() const
Definition: rect.h:65
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
int x_gap(const TBOX &box) const
Definition: rect.h:225
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
int32_t get_total() const
Definition: statistc.h:84
double ile(double frac) const
Definition: statistc.cpp:166
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241
int32_t count_transitions(int32_t threshold)
Definition: stepblob.cpp:333
TBOX bounding_box() const
Definition: stepblob.cpp:253
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
void rotate(const FCOORD &rotation)
Definition: stepblob.cpp:391
Definition: werd.h:56
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void CleanNoise(float size_threshold)
Definition: werd.cpp:482
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
TBOX bounding_box() const
Definition: werd.cpp:148
TBOX true_bounding_box() const
Definition: werd.cpp:169
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
bool textord_no_rejects
Definition: textord.h:373
bool textord_noise_rejrows
Definition: textord.h:387
int textord_noise_translimit
Definition: textord.h:384
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
Definition: tordmain.cpp:250
double textord_noise_sxfract
Definition: textord.h:390
bool textord_show_boxes
Definition: textord.h:375
double textord_noise_syfract
Definition: textord.h:388
double textord_noise_sizelimit
Definition: textord.h:383
double textord_initialasc_ile
Definition: textord.h:381
bool textord_noise_debug
Definition: textord.h:395
double textord_blshift_maxshift
Definition: textord.h:396
int textord_noise_sizefraction
Definition: textord.h:382
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:219
double textord_noise_normratio
Definition: textord.h:385
double textord_noise_area_ratio
Definition: textord.h:379
double textord_blshift_xfraction
Definition: textord.h:397
bool textord_show_blobs
Definition: textord.h:374
double textord_noise_rowratio
Definition: textord.h:394
double textord_noise_hfract
Definition: textord.h:392
double textord_initialx_ile
Definition: textord.h:380
int textord_noise_sncount
Definition: textord.h:393
bool textord_noise_rejwords
Definition: textord.h:386
int textord_max_noise_size
Definition: textord.h:376
GenericVector< BLOCK * > blocks
Definition: tordmain.cpp:739
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:723
void Clear()
Definition: scrollview.cpp:589