tesseract 4.1.1
Loading...
Searching...
No Matches
makerow.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: makerow.cpp (Formerly makerows.c)
3 * Description: Code to arrange blobs into rows of text.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include <vector> // for std::vector
20#include "blobbox.h"
21#include "ccstruct.h"
22#include "detlinefit.h"
23#include "statistc.h"
24#include "drawtord.h"
25#include "blkocc.h"
26#include "sortflts.h"
27#include "oldbasel.h"
28#include "textord.h"
29#include "tordmain.h"
30#include "underlin.h"
31#include "makerow.h"
32#include "tprintf.h"
33#include "tovars.h"
34
35// Include automatically generated configuration file if running autoconf.
36#ifdef HAVE_CONFIG_H
37#include "config_auto.h"
38#endif
39
40#include <algorithm>
41
// Tunable parameters used while arranging blobs into rows of text.
// Each declaration hands (name, default value, description string) to one of
// the BOOL_VAR / INT_VAR / double_VAR macros.
// NOTE(review): this listing has dropped some lines; where only a description
// string followed by ");" remains, the opening line of that declaration
// (macro, name and default) is not visible here and must be restored from the
// real source file.
42BOOL_VAR(textord_heavy_nr, false, "Vigorously remove noise");
43BOOL_VAR(textord_show_initial_rows, false, "Display row accumulation");
44BOOL_VAR(textord_show_parallel_rows, false, "Display page correlated rows");
45BOOL_VAR(textord_show_expanded_rows, false, "Display rows after expanding");
46BOOL_VAR(textord_show_final_rows, false, "Display rows after final fitting");
47BOOL_VAR(textord_show_final_blobs, false, "Display blob bounds after pre-ass");
48BOOL_VAR(textord_test_landscape, false, "Tests refer to land/port");
49BOOL_VAR(textord_parallel_baselines, true, "Force parallel baselines");
50BOOL_VAR(textord_straight_baselines, false, "Force straight baselines");
51BOOL_VAR(textord_old_baselines, true, "Use old baseline algorithm");
52BOOL_VAR(textord_old_xheight, false, "Use old xheight algorithm");
53BOOL_VAR(textord_fix_xheight_bug, true, "Use spline baseline");
54BOOL_VAR(textord_fix_makerow_bug, true, "Prevent multiple baselines");
55BOOL_VAR(textord_debug_xheights, false, "Test xheight algorithms");
56static BOOL_VAR(textord_biased_skewcalc, true, "Bias skew estimates with line length");
57static BOOL_VAR(textord_interpolating_skew, true, "Interpolate across gaps");
58static INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor");
59static INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
60INT_VAR(textord_test_x, -INT32_MAX, "coord of test pt");
61INT_VAR(textord_test_y, -INT32_MAX, "coord of test pt");
62INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
63INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
64INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
65static INT_VAR(textord_max_blob_overlaps, 4,
66 "Max number of blobs a big blob can overlap");
67INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
// NOTE(review): opening line of this declaration (listing line 68) is missing.
69 "Fraction of line spacing for quad");
// NOTE(review): opening line of this declaration (listing line 70) is missing.
71 "Fraction of line spacing for outlier");
72double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
73double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation");
74double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
75double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
76double_VAR(textord_chop_width, 1.5, "Max width before chopping");
77static double_VAR(textord_expansion_factor, 1.0,
78 "Factor to expand rows by in expand_rows");
79static double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap");
80double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
81double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
// NOTE(review): opening line of this declaration (listing line 82) is missing.
83 "New row made if blob makes row this big");
84double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
85double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
// NOTE(review): opening lines of the next three declarations (listing lines
// 86, 88 and 90) are missing.
87 "Min blob height/top to include blob top into xheight stats");
89 "Min pile height to make xheight");
91 "Min pile height to make ascheight");
92static double_VAR(textord_descheight_mode_fraction, 0.08,
93 "Min pile height to make descheight");
94double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight");
95double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
96double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
97double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
98double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
99INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do");
100BOOL_VAR(textord_new_initial_xheight, true, "Use test xheight mechanism");
101BOOL_VAR(textord_debug_blob, false, "Print test blob information");
102
// Not referenced in the part of the file visible here; presumably the number
// of height modes considered by the xheight code -- TODO confirm.
103#define MAX_HEIGHT_MODES 12
104
// Not referenced in the part of the file visible here.
105const int kMinLeaderCount = 5;
106
107// Factored-out helper to build a single row from a list of blobs.
108// Returns the mean blob size.
109static float MakeRowFromBlobs(float line_size,
110 BLOBNBOX_IT* blob_it, TO_ROW_IT* row_it) {
111 blob_it->sort(blob_x_order);
112 blob_it->move_to_first();
113 TO_ROW* row = nullptr;
114 float total_size = 0.0f;
115 int blob_count = 0;
116 // Add all the blobs to a single TO_ROW.
117 for (; !blob_it->empty(); blob_it->forward()) {
118 BLOBNBOX* blob = blob_it->extract();
119 int top = blob->bounding_box().top();
120 int bottom = blob->bounding_box().bottom();
121 if (row == nullptr) {
122 row = new TO_ROW(blob, top, bottom, line_size);
123 row_it->add_before_then_move(row);
124 } else {
125 row->add_blob(blob, top, bottom, line_size);
126 }
127 total_size += top - bottom;
128 ++blob_count;
129 }
130 return blob_count > 0 ? total_size / blob_count : total_size;
131}
132
133// Helper to make a row using the children of a single blob.
134// Returns the mean size of the blobs created.
// Only the children of the outline that ol_it initially points at in
// blob->out_list() are used (presumably the first outline -- TODO confirm).
// Returns 0 without creating a row when that outline has no children.
// Each child outline is deep-copied into a new C_BLOB, wrapped in a new
// BLOBNBOX and appended to block->small_blobs; MakeRowFromBlobs then builds
// the row from that list using block->line_size.
135static float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob,
136 TO_ROW_IT* row_it) {
137 // The blobs made from the children will go in the small_blobs list.
138 BLOBNBOX_IT bb_it(&block->small_blobs);
139 C_OUTLINE_IT ol_it(blob->out_list());
140 // Get the children.
141 ol_it.set_to_list(ol_it.data()->child());
142 if (ol_it.empty())
143 return 0.0f;
144 for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
145 // Deep copy the child outline and use that to make a blob.
// The `blob` parameter is reused here as the loop variable for each new blob.
146 blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data()));
147 // Correct direction as needed.
// NOTE(review): this listing jumps from line 147 to 149; the statement that
// the comment above describes is not visible here and must be restored from
// the real source file.
149 auto* bbox = new BLOBNBOX(blob);
150 bb_it.add_after_then_move(bbox);
151 }
152 // Now we can make a row from the blobs.
153 return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
154}
155
// Puts all the blobs of a block into one row and returns the skew gradient
// that compute_page_skew derives from `blocks` afterwards.
//   page_tr          not used by the code visible here
//   allow_sub_blobs  if the block holds exactly one blob, additionally build a
//                    row from that blob's child outlines
//   block            block whose blob lists are merged and turned into a row
//   blocks           list handed to compute_page_skew
// The small, noise and large blob lists are first appended to block->blobs.
163float make_single_row(ICOORD page_tr, bool allow_sub_blobs,
164 TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
165 BLOBNBOX_IT blob_it = &block->blobs;
166 TO_ROW_IT row_it = block->get_rows();
167
168 // Include all the small blobs and large blobs.
169 blob_it.add_list_after(&block->small_blobs);
170 blob_it.add_list_after(&block->noise_blobs);
171 blob_it.add_list_after(&block->large_blobs);
172 if (block->blobs.singleton() && allow_sub_blobs) {
173 blob_it.move_to_first();
174 float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
// line_size is only ever raised by the sub-blob result, never lowered.
175 if (size > block->line_size)
176 block->line_size = size;
177 } else if (block->blobs.empty()) {
178 // Make a fake blob.
// NOTE(review): this listing jumps from line 178 to 180; the statement that
// declares `blob` (used two lines below) is not visible here and must be
// restored from the real source file.
180 // The blobnbox owns the blob.
181 auto* bblob = new BLOBNBOX(blob);
182 blob_it.add_after_then_move(bblob);
183 }
// The mean size returned by MakeRowFromBlobs is ignored here.
184 MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
185 // Fit an LMS line to the rows.
186 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
187 fit_lms_line(row_it.data());
188 float gradient;
189 float fit_error;
190 // Compute the skew based on the fitted line.
191 compute_page_skew(blocks, gradient, fit_error);
192 return gradient;
193}
194
// Finds the text rows in every block of port_blocks and returns the skew
// gradient computed for the whole list.
// Pass 1 calls make_initial_textrows on each block; compute_page_skew then
// produces one gradient (port_m) and one error (port_err, not used further
// here); pass 2 calls cleanup_rows_making on each block with that gradient.
// Both passes use the fixed rotation FCOORD(1.0f, 0.0f).
200float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {
201 float port_m; // global skew
202 float port_err; // global noise
203 TO_BLOCK_IT block_it; // iterator
204
205 block_it.set_to_list(port_blocks);
206 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
207 block_it.forward())
208 make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f),
// NOTE(review): this listing jumps from line 208 to 210; the final argument
// and closing of the call above are not visible here.
210 // compute globally
211 compute_page_skew(port_blocks, port_m, port_err);
212 block_it.set_to_list(port_blocks);
213 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
214 cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
215 block_it.data()->block->pdblk.bounding_box().left(),
// NOTE(review): this listing jumps from line 215 to 217; the final argument
// and closing of the call above are not visible here.
217 }
218 return port_m; // global skew
219}
220
226void make_initial_textrows( //find lines
227 ICOORD page_tr,
228 TO_BLOCK* block, //block to do
229 FCOORD rotation, //for drawing
230 bool testing_on //correct orientation
231) {
232 TO_ROW_IT row_it = block->get_rows ();
233
234#ifndef GRAPHICS_DISABLED
235 ScrollView::Color colour; //of row
236
237 if (textord_show_initial_rows && testing_on) {
238 if (to_win == nullptr)
239 create_to_win(page_tr);
240 }
241#endif
242 //guess skew
243 assign_blobs_to_rows (block, nullptr, 0, true, true, textord_show_initial_rows && testing_on);
244 row_it.move_to_first ();
245 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
246 fit_lms_line (row_it.data ());
247#ifndef GRAPHICS_DISABLED
248 if (textord_show_initial_rows && testing_on) {
249 colour = ScrollView::RED;
250 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
251 plot_to_row (row_it.data (), colour, rotation);
252 colour = static_cast<ScrollView::Color>(colour + 1);
253 if (colour > ScrollView::MAGENTA)
254 colour = ScrollView::RED;
255 }
256 }
257#endif
258}
259
260
267 float m, c; // fitted line
269 BLOBNBOX_IT blob_it = row->blob_list();
270
271 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
272 const TBOX& box = blob_it.data()->bounding_box();
273 lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
274 }
275 double error = lms.Fit(&m, &c);
276 row->set_line(m, c, error);
277}
278
279
286void compute_page_skew( //get average gradient
287 TO_BLOCK_LIST *blocks, //list of blocks
288 float &page_m, //average gradient
289 float &page_err //average error
290 ) {
291 int32_t row_count; //total rows
292 int32_t blob_count; //total_blobs
293 int32_t row_err; //integer error
294 int32_t row_index; //of total
295 TO_ROW *row; //current row
296 TO_BLOCK_IT block_it = blocks; //iterator
297
298 row_count = 0;
299 blob_count = 0;
300 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
301 block_it.forward ()) {
302 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
303 if (pb != nullptr && !pb->IsText())
304 continue; // Pretend non-text blocks don't exist.
305 row_count += block_it.data ()->get_rows ()->length ();
306 //count up rows
307 TO_ROW_IT row_it(block_it.data()->get_rows());
308 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
309 blob_count += row_it.data ()->blob_list ()->length ();
310 }
311 if (row_count == 0) {
312 page_m = 0.0f;
313 page_err = 0.0f;
314 return;
315 }
316 // of rows
317 std::vector<float> gradients(blob_count);
318 // of rows
319 std::vector<float> errors(blob_count);
320
321 row_index = 0;
322 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
323 block_it.forward ()) {
324 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
325 if (pb != nullptr && !pb->IsText())
326 continue; // Pretend non-text blocks don't exist.
327 TO_ROW_IT row_it(block_it.data ()->get_rows());
328 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
329 row = row_it.data ();
330 blob_count = row->blob_list ()->length ();
331 row_err = static_cast<int32_t>(ceil (row->line_error ()));
332 if (row_err <= 0)
333 row_err = 1;
334 if (textord_biased_skewcalc) {
335 blob_count /= row_err;
336 for (blob_count /= row_err; blob_count > 0; blob_count--) {
337 gradients[row_index] = row->line_m ();
338 errors[row_index] = row->line_error ();
339 row_index++;
340 }
341 }
342 else if (blob_count >= textord_min_blobs_in_row) {
343 //get gradient
344 gradients[row_index] = row->line_m ();
345 errors[row_index] = row->line_error ();
346 row_index++;
347 }
348 }
349 }
350 if (row_index == 0) {
351 //desperate
352 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
353 block_it.forward ()) {
354 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
355 if (pb != nullptr && !pb->IsText())
356 continue; // Pretend non-text blocks don't exist.
357 TO_ROW_IT row_it(block_it.data()->get_rows());
358 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
359 row_it.forward ()) {
360 row = row_it.data ();
361 gradients[row_index] = row->line_m ();
362 errors[row_index] = row->line_error ();
363 row_index++;
364 }
365 }
366 }
367 row_count = row_index;
368 row_index = choose_nth_item (static_cast<int32_t>(row_count * textord_skew_ile),
369 &gradients[0], row_count);
370 page_m = gradients[row_index];
371 row_index = choose_nth_item (static_cast<int32_t>(row_count * textord_skew_ile),
372 &errors[0], row_count);
373 page_err = errors[row_index];
374}
375
376const double kNoiseSize = 0.5; // Fraction of xheight.
377const int kMinSize = 8; // Min pixels to be xheight.
378
383static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
384 const TBOX& ibox = i->bounding_box();
385 const TBOX& dotbox = dot->bounding_box();
386
387 // Must overlap horizontally by enough and be high enough.
388 int overlap = std::min(dotbox.right(), ibox.right()) -
389 std::max(dotbox.left(), ibox.left());
390 if (ibox.height() <= 2 * dotbox.height() ||
391 (overlap * 2 < ibox.width() && overlap < dotbox.width()))
392 return false;
393
394 // If the i is tall and thin then it is good.
395 if (ibox.height() > ibox.width() * 2)
396 return true; // The i or ! must be tall and thin.
397
398 // It might still be tall and thin, but it might be joined to something.
399 // So search the outline for a piece of large height close to the edges
400 // of the dot.
401 const double kHeightFraction = 0.6;
402 double target_height = std::min(dotbox.bottom(), ibox.top());
403 target_height -= row->line_m()*dotbox.left() + row->line_c();
404 target_height *= kHeightFraction;
405 int left_min = dotbox.left() - dotbox.width();
406 int middle = (dotbox.left() + dotbox.right())/2;
407 int right_max = dotbox.right() + dotbox.width();
408 int left_miny = 0;
409 int left_maxy = 0;
410 int right_miny = 0;
411 int right_maxy = 0;
412 bool found_left = false;
413 bool found_right = false;
414 bool in_left = false;
415 bool in_right = false;
416 C_BLOB* blob = i->cblob();
417 C_OUTLINE_IT o_it = blob->out_list();
418 for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
419 C_OUTLINE* outline = o_it.data();
420 int length = outline->pathlength();
421 ICOORD pos = outline->start_pos();
422 for (int step = 0; step < length; pos += outline->step(step++)) {
423 int x = pos.x();
424 int y = pos.y();
425 if (x >= left_min && x < middle && !found_left) {
426 // We are in the left part so find min and max y.
427 if (in_left) {
428 if (y > left_maxy) left_maxy = y;
429 if (y < left_miny) left_miny = y;
430 } else {
431 left_maxy = left_miny = y;
432 in_left = true;
433 }
434 } else if (in_left) {
435 // We just left the left so look for size.
436 if (left_maxy - left_miny > target_height) {
437 if (found_right)
438 return true;
439 found_left = true;
440 }
441 in_left = false;
442 }
443 if (x <= right_max && x > middle && !found_right) {
444 // We are in the right part so find min and max y.
445 if (in_right) {
446 if (y > right_maxy) right_maxy = y;
447 if (y < right_miny) right_miny = y;
448 } else {
449 right_maxy = right_miny = y;
450 in_right = true;
451 }
452 } else if (in_right) {
453 // We just left the right so look for size.
454 if (right_maxy - right_miny > target_height) {
455 if (found_left)
456 return true;
457 found_right = true;
458 }
459 in_right = false;
460 }
461 }
462 }
463 return false;
464}
465
467 TO_ROW_IT row_it = block->get_rows ();
468 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
469 TO_ROW* row = row_it.data();
470 BLOBNBOX_IT b_it = row->blob_list();
471 // Estimate the xheight on the row.
472 int max_height = 0;
473 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
474 BLOBNBOX* blob = b_it.data();
475 if (blob->bounding_box().height() > max_height)
476 max_height = blob->bounding_box().height();
477 }
478 STATS hstats(0, max_height + 1);
479 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
480 BLOBNBOX* blob = b_it.data();
481 int height = blob->bounding_box().height();
482 if (height >= kMinSize)
483 hstats.add(blob->bounding_box().height(), 1);
484 }
485 float xheight = hstats.median();
486 // Delete small objects.
487 BLOBNBOX* prev = nullptr;
488 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
489 BLOBNBOX* blob = b_it.data();
490 const TBOX& box = blob->bounding_box();
491 if (box.height() < kNoiseSize * xheight) {
492 // Small so delete unless it looks like an i dot.
493 if (prev != nullptr) {
494 if (dot_of_i(blob, prev, row))
495 continue; // Looks OK.
496 }
497 if (!b_it.at_last()) {
498 BLOBNBOX* next = b_it.data_relative(1);
499 if (dot_of_i(blob, next, row))
500 continue; // Looks OK.
501 }
502 // It might be noise so get rid of it.
503 delete blob->cblob();
504 delete b_it.extract();
505 } else {
506 prev = blob;
507 }
508 }
509 }
510}
511
// Refines the rows of a block once the page gradient is known.
// In order: fit_parallel_rows, a second call whose first line is missing from
// this listing, expand_rows, then all row blobs are handed back to
// block->blobs and assign_blobs_to_rows is run three times (passes 1, 2, 3),
// adding the large blobs before pass 2 and the noise and small blobs before
// pass 3.
517void cleanup_rows_making( //find lines
518 ICOORD page_tr, //top right
519 TO_BLOCK* block, //block to do
520 float gradient, //gradient to fit
521 FCOORD rotation, //for drawing
522 int32_t block_edge, //edge of block
523 bool testing_on //correct orientation
524) {
525 //iterators
526 BLOBNBOX_IT blob_it = &block->blobs;
527 TO_ROW_IT row_it = block->get_rows ();
528
529#ifndef GRAPHICS_DISABLED
530 if (textord_show_parallel_rows && testing_on) {
531 if (to_win == nullptr)
532 create_to_win(page_tr);
533 }
534#endif
535 //get row coords
536 fit_parallel_rows(block,
537 gradient,
538 rotation,
539 block_edge,
540 textord_show_parallel_rows && testing_on);
// NOTE(review): this listing jumps from line 540 to 542; the line that opens
// the call taking the four arguments below is not visible here and must be
// restored from the real source file.
542 gradient,
543 rotation,
544 block_edge,
545 textord_show_parallel_rows && testing_on);
546 expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
547 blob_it.set_to_list (&block->blobs);
548 row_it.set_to_list (block->get_rows ());
// Move every row's blob list back into block->blobs.
549 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
550 blob_it.add_list_after (row_it.data ()->blob_list ());
551 //give blobs back
552 assign_blobs_to_rows (block, &gradient, 1, false, false, false);
553 //now new rows must be genuine
554 blob_it.set_to_list (&block->blobs);
555 blob_it.add_list_after (&block->large_blobs);
556 assign_blobs_to_rows (block, &gradient, 2, true, true, false);
557 //safe to use big ones now
558 blob_it.set_to_list (&block->blobs);
559 //throw all blobs in
560 blob_it.add_list_after (&block->noise_blobs);
561 blob_it.add_list_after (&block->small_blobs);
562 assign_blobs_to_rows (block, &gradient, 3, false, false, false);
563}
564
// Deletes the rows that find_best_dropout_row rejects and returns the blobs
// of every row (deleted or kept) to block->blobs.
// The block is deskewed with `gradient`, a per-scan-line occupation profile
// is computed for it, thresholds and then dropout distances are derived from
// that profile, and each row is tested using the distance stored at the scan
// line of its intercept.
// Does nothing if the block has no rows or the scan-line range is empty.
// rotation and block_edge are only used for the debug plot.
570void delete_non_dropout_rows( //find lines
571 TO_BLOCK* block, //block to do
572 float gradient, //global skew
573 FCOORD rotation, //deskew vector
574 int32_t block_edge, //left edge
575 bool testing_on //correct orientation
576) {
577 TBOX block_box; //deskewed block
578 int32_t max_y; //in block
579 int32_t min_y;
580 int32_t line_index; //of scan line
581 int32_t line_count; //no of scan lines
582 int32_t distance; //to drop-out
583 int32_t xleft; //of block
584 int32_t ybottom; //of block
585 TO_ROW *row; //current row
586 TO_ROW_IT row_it = block->get_rows ();
587 BLOBNBOX_IT blob_it = &block->blobs;
588
589 if (row_it.length () == 0)
590 return; //empty block
591 block_box = deskew_block_coords (block, gradient);
592 xleft = block->block->pdblk.bounding_box ().left ();
593 ybottom = block->block->pdblk.bounding_box ().bottom ();
594 min_y = block_box.bottom () - 1;
595 max_y = block_box.top () + 1;
// Widen [min_y, max_y] so that every row intercept lies strictly inside it.
596 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
597 line_index = static_cast<int32_t>(floor (row_it.data ()->intercept ()));
598 if (line_index <= min_y)
599 min_y = line_index - 1;
600 if (line_index >= max_y)
601 max_y = line_index + 1;
602 }
603 line_count = max_y - min_y + 1;
604 if (line_count <= 0)
605 return; //empty block
606 // change in occupation
607 std::vector<int32_t> deltas(line_count);
608 // of pixel coords
609 std::vector<int32_t> occupation(line_count);
610
611 compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]);
// NOTE(review): this listing drops lines 613-614 and 616-617, so the two
// window-size arguments of the call below are incomplete here and must be
// restored from the real source file.
612 compute_occupation_threshold (static_cast<int32_t>(ceil (block->line_spacing *
615 static_cast<int32_t>(ceil (block->line_spacing *
618 max_y - min_y + 1, &occupation[0], &deltas[0]);
619#ifndef GRAPHICS_DISABLED
620 if (testing_on) {
621 draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
622 }
623#endif
// From here on deltas holds the dropout distances written by
// compute_dropout_distances.
624 compute_dropout_distances(&occupation[0], &deltas[0], line_count);
625 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
626 row = row_it.data ();
627 line_index = static_cast<int32_t>(floor (row->intercept ()));
628 distance = deltas[line_index - min_y];
629 if (find_best_dropout_row (row, distance, block->line_spacing / 2,
630 line_index, &row_it, testing_on)) {
631#ifndef GRAPHICS_DISABLED
632 if (testing_on)
633 plot_parallel_row(row, gradient, block_edge,
634 ScrollView::WHITE, rotation);
635#endif
// Keep the blobs of the rejected row, then delete the row itself.
636 blob_it.add_list_after (row_it.data ()->blob_list ());
637 delete row_it.extract (); //too far away
638 }
639 }
// The surviving rows hand their blobs back as well.
640 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
641 blob_it.add_list_after (row_it.data ()->blob_list ());
642 }
643}
644
645
652bool find_best_dropout_row( //find neighbours
653 TO_ROW* row, //row to test
654 int32_t distance, //dropout dist
655 float dist_limit, //threshold distance
656 int32_t line_index, //index of row
657 TO_ROW_IT* row_it, //current position
658 bool testing_on //correct orientation
659) {
660 int32_t next_index; // of neighbouring row
661 int32_t row_offset; //from current row
662 int32_t abs_dist; //absolute distance
663 int8_t row_inc; //increment to row_index
664 TO_ROW *next_row; //nextious row
665
666 if (testing_on)
667 tprintf ("Row at %g(%g), dropout dist=%d,",
668 row->intercept (), row->parallel_c (), distance);
669 if (distance < 0) {
670 row_inc = 1;
671 abs_dist = -distance;
672 }
673 else {
674 row_inc = -1;
675 abs_dist = distance;
676 }
677 if (abs_dist > dist_limit) {
678 if (testing_on) {
679 tprintf (" too far - deleting\n");
680 }
681 return true;
682 }
683 if ((distance < 0 && !row_it->at_last ())
684 || (distance >= 0 && !row_it->at_first ())) {
685 row_offset = row_inc;
686 do {
687 next_row = row_it->data_relative (row_offset);
688 next_index = static_cast<int32_t>(floor (next_row->intercept ()));
689 if ((distance < 0
690 && next_index < line_index
691 && next_index > line_index + distance + distance)
692 || (distance >= 0
693 && next_index > line_index
694 && next_index < line_index + distance + distance)) {
695 if (testing_on) {
696 tprintf (" nearer neighbour (%d) at %g\n",
697 line_index + distance - next_index,
698 next_row->intercept ());
699 }
700 return true; //other is nearer
701 }
702 else if (next_index == line_index
703 || next_index == line_index + distance + distance) {
704 if (row->believability () <= next_row->believability ()) {
705 if (testing_on) {
706 tprintf (" equal but more believable at %g (%g/%g)\n",
707 next_row->intercept (),
708 row->believability (),
709 next_row->believability ());
710 }
711 return true; //other is more believable
712 }
713 }
714 row_offset += row_inc;
715 }
716 while ((next_index == line_index
717 || next_index == line_index + distance + distance)
718 && row_offset < row_it->length ());
719 if (testing_on)
720 tprintf (" keeping\n");
721 }
722 return false;
723}
724
725
733 TO_BLOCK *block, //block to do
734 float gradient //global skew
735 ) {
736 TBOX result; //block bounds
737 TBOX blob_box; //of block
738 FCOORD rotation; //deskew vector
739 float length; //of gradient vector
740 TO_ROW_IT row_it = block->get_rows ();
741 TO_ROW *row; //current row
742 BLOBNBOX *blob; //current blob
743 BLOBNBOX_IT blob_it; //iterator
744
745 length = sqrt (gradient * gradient + 1);
746 rotation = FCOORD (1 / length, -gradient / length);
747 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
748 row = row_it.data ();
749 blob_it.set_to_list (row->blob_list ());
750 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
751 blob_it.forward ()) {
752 blob = blob_it.data ();
753 blob_box = blob->bounding_box ();
754 blob_box.rotate (rotation);//de-skew it
755 result += blob_box;
756 }
757 }
758 return result;
759}
760
761
768void compute_line_occupation( //project blobs
769 TO_BLOCK *block, //block to do
770 float gradient, //global skew
771 int32_t min_y, //min coord in block
772 int32_t max_y, //in block
773 int32_t *occupation, //output projection
774 int32_t *deltas //derivative
775 ) {
776 int32_t line_count; //maxy-miny+1
777 int32_t line_index; //of scan line
778 int index; //array index for daft compilers
779 TO_ROW *row; //current row
780 TO_ROW_IT row_it = block->get_rows ();
781 BLOBNBOX *blob; //current blob
782 BLOBNBOX_IT blob_it; //iterator
783 float length; //of skew vector
784 TBOX blob_box; //bounding box
785 FCOORD rotation; //inverse of skew
786
787 line_count = max_y - min_y + 1;
788 length = sqrt (gradient * gradient + 1);
789 rotation = FCOORD (1 / length, -gradient / length);
790 for (line_index = 0; line_index < line_count; line_index++)
791 deltas[line_index] = 0;
792 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
793 row = row_it.data ();
794 blob_it.set_to_list (row->blob_list ());
795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
796 blob_it.forward ()) {
797 blob = blob_it.data ();
798 blob_box = blob->bounding_box ();
799 blob_box.rotate (rotation);//de-skew it
800 int32_t width = blob_box.right() - blob_box.left();
801 index = blob_box.bottom() - min_y;
802 ASSERT_HOST(index >= 0 && index < line_count);
803 // count transitions
804 deltas[index] += width;
805 index = blob_box.top() - min_y;
806 ASSERT_HOST(index >= 0 && index < line_count);
807 deltas[index] -= width;
808 }
809 }
810 occupation[0] = deltas[0];
811 for (line_index = 1; line_index < line_count; line_index++)
812 occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
813}
814
815
821void compute_occupation_threshold( //project blobs
822 int32_t low_window, //below result point
823 int32_t high_window, //above result point
824 int32_t line_count, //array sizes
825 int32_t *occupation, //input projection
826 int32_t *thresholds //output thresholds
827 ) {
828 int32_t line_index; //of thresholds line
829 int32_t low_index; //in occupation
830 int32_t high_index; //in occupation
831 int32_t sum; //current average
832 int32_t divisor; //to get thresholds
833 int32_t min_index; //of min occ
834 int32_t min_occ; //min in locality
835 int32_t test_index; //for finding min
836
837 divisor =
838 static_cast<int32_t>(ceil ((low_window + high_window) / textord_occupancy_threshold));
839 if (low_window + high_window < line_count) {
840 for (sum = 0, high_index = 0; high_index < low_window; high_index++)
841 sum += occupation[high_index];
842 for (low_index = 0; low_index < high_window; low_index++, high_index++)
843 sum += occupation[high_index];
844 min_occ = occupation[0];
845 min_index = 0;
846 for (test_index = 1; test_index < high_index; test_index++) {
847 if (occupation[test_index] <= min_occ) {
848 min_occ = occupation[test_index];
849 min_index = test_index; //find min in region
850 }
851 }
852 for (line_index = 0; line_index < low_window; line_index++)
853 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
854 //same out to end
855 for (low_index = 0; high_index < line_count; low_index++, high_index++) {
856 sum -= occupation[low_index];
857 sum += occupation[high_index];
858 if (occupation[high_index] <= min_occ) {
859 //find min in region
860 min_occ = occupation[high_index];
861 min_index = high_index;
862 }
863 //lost min from region
864 if (min_index <= low_index) {
865 min_occ = occupation[low_index + 1];
866 min_index = low_index + 1;
867 for (test_index = low_index + 2; test_index <= high_index;
868 test_index++) {
869 if (occupation[test_index] <= min_occ) {
870 min_occ = occupation[test_index];
871 //find min in region
872 min_index = test_index;
873 }
874 }
875 }
876 thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
877 }
878 }
879 else {
880 min_occ = occupation[0];
881 min_index = 0;
882 for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
883 if (occupation[low_index] < min_occ) {
884 min_occ = occupation[low_index];
885 min_index = low_index;
886 }
887 sum += occupation[low_index];
888 }
889 line_index = 0;
890 }
891 for (; line_index < line_count; line_index++)
892 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
893 //same out to end
894}
895
896
// Replaces each entry of `thresholds` by a signed distance, in scan lines, to
// a neighbouring crossing of the occupation profile.
// A crossing is a line whose occupation is at or above its threshold while
// the line before it was below its own (original) threshold.
// Lines are first numbered with decreasing negative values counted from the
// previous crossing (the count starts at -line_count - 1 before the first
// crossing).  When the next crossing is found, the lines just before it are
// overwritten backwards with 1, 2, 3, ... for as long as that positive
// distance is smaller than the magnitude of the negative one it replaces.
//   occupation  input occupation profile, line_count entries
//   thresholds  in: threshold per line; out: signed distance per line
//   line_count  number of entries in both arrays; nothing is done if <= 0
void compute_dropout_distances(int32_t *occupation,
                               int32_t *thresholds,
                               int32_t line_count) {
  // The scan below always touches entry 0, so empty input must be refused.
  if (line_count <= 0)
    return;

  int32_t distance = -line_count;
  int32_t line_index = 0;
  do {
    int32_t prev_threshold;  // threshold of the previous line before overwrite
    do {
      distance--;
      prev_threshold = thresholds[line_index];
      thresholds[line_index] = distance;  // distance from previous crossing
      line_index++;
    } while (line_index < line_count
             && (occupation[line_index] < thresholds[line_index]
                 || occupation[line_index - 1] >= prev_threshold));
    if (line_index < line_count) {
      // Found a crossing: back-fill the nearer half with positive distances.
      int32_t back_index = line_index - 1;
      int32_t next_dist = 1;
      while (next_dist < -distance && back_index >= 0) {
        thresholds[back_index] = next_dist;
        back_index--;
        next_dist++;
        distance++;
      }
      distance = 1;
    }
  } while (line_index < line_count);
}
940
941
// expand_rows
//
// Widens the vertical limits (min_y/max_y) of every row in the block and
// merges a neighbouring row into the current one when that neighbour lies
// wholly inside the widened limits.
//
// Sequence visible below:
//   1. adjust_row_limits(), assign_blobs_to_rows(), fit_parallel_rows() and
//      compute_row_stats() prepare the rows (returning early if the block has
//      no rows).
//   2. Rows are visited from the last list entry backwards. For each row an
//      allowed bottom/top limit is derived from the row intercept,
//      block->line_size and textord_expansion_factor, and the row limits are
//      pushed out to it unless a neighbouring row is in the way.
//
// page_tr is only used to create the debug window; rotation and block_edge
// are forwarded to fit_parallel_rows()/plot_parallel_row(); testing_on
// (together with textord_show_expanded_rows) gates debug output.
//
// NOTE(review): this copy of the listing has dropped several lines (the
// listing numbers skip); each gap is flagged below and must be restored from
// the upstream file before this function can compile.
949void expand_rows( //find lines
950 ICOORD page_tr, //top right
951 TO_BLOCK* block, //block to do
952 float gradient, //gradient to fit
953 FCOORD rotation, //for drawing
954 int32_t block_edge, //edge of block
955 bool testing_on //correct orientation
956) {
957 bool swallowed_row; //eaten a neighbour
958 float y_max, y_min; //new row limits
959 float y_bottom, y_top; //allowed limits
960 TO_ROW *test_row; //next row
961 TO_ROW *row; //current row
962 //iterators
963 BLOBNBOX_IT blob_it = &block->blobs;
964 TO_ROW_IT row_it = block->get_rows ();
965
966#ifndef GRAPHICS_DISABLED
967 if (textord_show_expanded_rows && testing_on) {
968 if (to_win == nullptr)
969 create_to_win(page_tr);
970 }
971#endif
972
973 adjust_row_limits(block); //shift min,max.
// NOTE(review): listing line 974 is absent; the closing brace at 978 implies
// it opened a conditional block around the next three statements.
975 if (block->get_rows ()->length () == 0)
976 return;
977 compute_row_stats(block, textord_show_expanded_rows && testing_on);
978 }
979 assign_blobs_to_rows (block, &gradient, 4, true, false, false);
980 //get real membership
981 if (block->get_rows ()->length () == 0)
982 return;
983 fit_parallel_rows(block,
984 gradient,
985 rotation,
986 block_edge,
987 textord_show_expanded_rows && testing_on);
// NOTE(review): listing line 988 is absent; presumably a condition guarding
// the compute_row_stats() call below - confirm against upstream.
989 compute_row_stats(block, textord_show_expanded_rows && testing_on);
// Walk the rows starting from the last list entry and stepping backwards
// until the iterator wraps round to the last entry again.
990 row_it.move_to_last ();
991 do {
992 row = row_it.data ();
993 y_max = row->max_y (); //get current limits
994 y_min = row->min_y ();
// NOTE(review): the right-hand sides of y_bottom and y_top are truncated
// here (listing lines 996 and 998-999 are absent); both expressions end in a
// dangling '*'.
995 y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
997 y_top = row->intercept () + block->line_size * textord_expansion_factor *
// --- Downward expansion: only when the row's bottom is above the allowed
// bottom limit.
1000 if (y_min > y_bottom) { //expansion allowed
1001 if (textord_show_expanded_rows && testing_on)
1002 tprintf("Expanding bottom of row at %f from %f to %f\n",
1003 row->intercept(), y_min, y_bottom);
1004 //expandable
// Keep looking at the following list entry for as long as the previous
// pass swallowed a row.
1005 swallowed_row = true;
1006 while (swallowed_row && !row_it.at_last ()) {
1007 swallowed_row = false;
1008 //get next one
1009 test_row = row_it.data_relative (1);
1010 //overlaps space
1011 if (test_row->max_y () > y_bottom) {
// Neighbour lies entirely above the allowed bottom: absorb its blobs into
// this row and delete it.
1012 if (test_row->min_y () > y_bottom) {
1013 if (textord_show_expanded_rows && testing_on)
1014 tprintf("Eating row below at %f\n", test_row->intercept());
1015 row_it.forward ();
1016#ifndef GRAPHICS_DISABLED
1017 if (textord_show_expanded_rows && testing_on)
1018 plot_parallel_row(test_row,
1019 gradient,
1020 block_edge,
// NOTE(review): listing line 1021 (one call argument) is absent.
1022 rotation);
1023#endif
1024 blob_it.set_to_list (row->blob_list ());
1025 blob_it.add_list_after (test_row->blob_list ());
1026 //swallow complete row
1027 delete row_it.extract ();
// Step back so the iterator is on the current row again.
1028 row_it.backward ();
1029 swallowed_row = true;
1030 }
// Neighbour's top is below this row's current bottom: expand only as far
// as the neighbour's top.
1031 else if (test_row->max_y () < y_min) {
1032 //shorter limit
1033 y_bottom = test_row->max_y ();
1034 if (textord_show_expanded_rows && testing_on)
1035 tprintf("Truncating limit to %f due to touching row at %f\n",
1036 y_bottom, test_row->intercept());
1037 }
// Otherwise the neighbour already reaches this row: leave the bottom
// where it is.
1038 else {
1039 y_bottom = y_min; //can't expand it
1040 if (textord_show_expanded_rows && testing_on)
1041 tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1042 y_bottom, test_row->intercept());
1043 }
1044 }
1045 }
1046 y_min = y_bottom; //expand it
1047 }
// --- Upward expansion: mirror image of the block above, looking at the
// preceding list entry.
1048 if (y_max < y_top) { //expansion allowed
1049 if (textord_show_expanded_rows && testing_on)
1050 tprintf("Expanding top of row at %f from %f to %f\n",
1051 row->intercept(), y_max, y_top);
1052 swallowed_row = true;
1053 while (swallowed_row && !row_it.at_first ()) {
1054 swallowed_row = false;
1055 //get one above
1056 test_row = row_it.data_relative (-1);
1057 if (test_row->min_y () < y_top) {
// Neighbour lies entirely below the allowed top: absorb and delete it.
1058 if (test_row->max_y () < y_top) {
1059 if (textord_show_expanded_rows && testing_on)
1060 tprintf("Eating row above at %f\n", test_row->intercept());
1061 row_it.backward ();
1062 blob_it.set_to_list (row->blob_list ());
1063#ifndef GRAPHICS_DISABLED
1064 if (textord_show_expanded_rows && testing_on)
1065 plot_parallel_row(test_row,
1066 gradient,
1067 block_edge,
// NOTE(review): listing line 1068 (one call argument) is absent.
1069 rotation);
1070#endif
1071 blob_it.add_list_after (test_row->blob_list ());
1072 //swallow complete row
1073 delete row_it.extract ();
// Step forward so the iterator is on the current row again.
1074 row_it.forward ();
1075 swallowed_row = true;
1076 }
1077 else if (test_row->min_y () < y_max) {
1078 //shorter limit
1079 y_top = test_row->min_y ();
1080 if (textord_show_expanded_rows && testing_on)
1081 tprintf("Truncating limit to %f due to touching row at %f\n",
1082 y_top, test_row->intercept());
1083 }
1084 else {
1085 y_top = y_max; //can't expand it
1086 if (textord_show_expanded_rows && testing_on)
1087 tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1088 y_top, test_row->intercept());
1089 }
1090 }
1091 }
1092 y_max = y_top;
1093 }
1094 //new limits
1095 row->set_limits (y_min, y_max);
1096 row_it.backward ();
1097 }
1098 while (!row_it.at_last ());
1099}
1100
1101
// adjust_row_limits
//
// Visits every row of the block, measures its current size (max_y - min_y),
// then resets the row limits to intercept + ymin .. intercept + ymax and
// clears the row's `merged` flag.
//
// NOTE(review): listing lines 1116, 1123 and 1126-1131 are absent from this
// copy. ymin and ymax are read at 1132 but never assigned in the visible
// code, so their computation must be in the missing lines; the two tprintf
// calls were presumably guarded by a debug condition - confirm against
// upstream.
1107void adjust_row_limits( //tidy limits
1108 TO_BLOCK *block //block to do
1109 ) {
1110 TO_ROW *row; //current row
1111 float size; //size of row
1112 float ymax; //top of row
1113 float ymin; //bottom of row
1114 TO_ROW_IT row_it = block->get_rows ();
1115
// NOTE(review): listing line 1116 is absent here.
1117 tprintf("Adjusting row limits for block(%d,%d)\n",
1118 block->block->pdblk.bounding_box().left(),
1119 block->block->pdblk.bounding_box().top());
1120 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1121 row = row_it.data ();
1122 size = row->max_y () - row->min_y ();
// NOTE(review): listing line 1123 is absent here.
1124 tprintf("Row at %f has min %f, max %f, size %f\n",
1125 row->intercept(), row->min_y(), row->max_y(), size);
// NOTE(review): listing lines 1126-1131 are absent here (ymin/ymax are set
// somewhere in that gap).
1132 row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
1133 row->merged = false;
1134 }
1135}
1136
1137
// compute_row_stats
//
// Derives line-spacing statistics for the block from the intercepts of its
// rows and stores them in the block:
//   - each row except the first one visited gets `spacing` = difference
//     between the intercept of the next visited row and its own;
//   - block->key_row and block->baseline_offset are set from the last row
//     visited, then overridden by the median-spacing row when at least one
//     spacing was recorded;
//   - block->line_size, block->line_spacing and block->max_blob_size may be
//     updated when there are more than two spacings and their inter-quartile
//     range is small relative to the median spacing.
// testing_on enables the tprintf trace.
//
// NOTE(review): listing lines 1192 and 1213 are absent from this copy; see
// the markers below.
1143void compute_row_stats( //find lines
1144 TO_BLOCK* block, //block to do
1145 bool testing_on //correct orientation
1146) {
1147 int32_t row_index; //of median
1148 TO_ROW *row; //current row
1149 TO_ROW *prev_row; //previous row
1150 float iqr; //inter quartile range
1151 TO_ROW_IT row_it = block->get_rows ();
1152 //number of rows
1153 int16_t rowcount = row_it.length ();
1154 // for choose nth
1155 std::vector<TO_ROW*> rows(rowcount);
// rowcount is reused from here on as the number of entries stored in rows[].
1156 rowcount = 0;
1157 prev_row = nullptr;
1158 row_it.move_to_last (); //start at bottom
1159 do {
1160 row = row_it.data ();
1161 if (prev_row != nullptr) {
1162 rows[rowcount++] = prev_row;
1163 prev_row->spacing = row->intercept () - prev_row->intercept ();
1164 if (testing_on)
1165 tprintf ("Row at %g yields spacing of %g\n",
1166 row->intercept (), prev_row->spacing);
1167 }
1168 prev_row = row;
1169 row_it.backward ();
1170 }
1171 while (!row_it.at_last ());
// Defaults taken from the last row visited; replaced below if any spacing
// was recorded.
1172 block->key_row = prev_row;
1173 block->baseline_offset =
1174 fmod (prev_row->parallel_c (), block->line_spacing);
1175 if (testing_on)
1176 tprintf ("Blob based spacing=(%g,%g), offset=%g",
1177 block->line_size, block->line_spacing, block->baseline_offset);
1178 if (rowcount > 0) {
// Spacing at the 3/4 and 1/4 positions gives the inter-quartile range; the
// 1/2 position gives the median row, which becomes the key row.
1179 row_index = choose_nth_item(rowcount * 3 / 4, &rows[0], rowcount,
1180 sizeof (TO_ROW *), row_spacing_order);
1181 iqr = rows[row_index]->spacing;
1182 row_index = choose_nth_item(rowcount / 4, &rows[0], rowcount,
1183 sizeof (TO_ROW *), row_spacing_order);
1184 iqr -= rows[row_index]->spacing;
1185 row_index = choose_nth_item(rowcount / 2, &rows[0], rowcount,
1186 sizeof (TO_ROW *), row_spacing_order);
1187 block->key_row = rows[row_index];
1188 if (testing_on)
1189 tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
1190 if (rowcount > 2
1191 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
// NOTE(review): listing line 1192 is absent; the closing brace at 1201 and
// the else at 1202 imply it opened a conditional whose two arms choose
// between the two line_size update rules below.
1193 if (rows[row_index]->spacing < block->line_spacing
1194 && rows[row_index]->spacing > block->line_size)
1195 //within range
1196 block->line_size = rows[row_index]->spacing;
1197 //spacing=size
1198 else if (rows[row_index]->spacing > block->line_spacing)
1199 block->line_size = block->line_spacing;
1200 //too big so use max
1201 }
1202 else {
1203 if (rows[row_index]->spacing < block->line_spacing)
1204 block->line_size = rows[row_index]->spacing;
1205 else
1206 block->line_size = block->line_spacing;
1207 //too big so use max
1208 }
// Never let the line size drop below the configured minimum x-height.
1209 if (block->line_size < textord_min_xheight)
1210 block->line_size = (float) textord_min_xheight;
1211 block->line_spacing = rows[row_index]->spacing;
1212 block->max_blob_size =
// NOTE(review): listing line 1213 (the right-hand side of the assignment
// above) is absent.
1214 }
1215 block->baseline_offset = fmod (rows[row_index]->intercept (),
1216 block->line_spacing);
1217 }
1218 if (testing_on)
1219 tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
1220 block->line_size, block->line_spacing, block->baseline_offset);
1221}
1222
1223
1253namespace tesseract {
// Textord::compute_block_xheight
//
// Estimates the x-height, ascender rise and descender drop for the whole
// block, stores the x-height in block->xheight, and then calls
// correct_row_xheight() on every row with the block-level values.
//
// Each row is sorted by get_row_category() into one of three evidence pools:
// rows with ascenders, rows with descenders, and ROW_UNKNOWN rows (whose raw
// blob heights are gathered with fill_heights()). The block estimate comes
// from the first non-empty pool in that order, falling back to
// line_size * CCStruct::kXHeightFraction when all are empty.
//
// NOTE(review): listing lines 1257, 1259, 1281, 1325 and 1348 are absent
// from this copy; see the markers below.
1254void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
1255 TO_ROW *row; // current row
// NOTE(review): the divisors of the two ratios below are on absent listing
// lines (1257 and 1259).
1256 float asc_frac_xheight = CCStruct::kAscenderFraction /
1258 float desc_frac_xheight = CCStruct::kDescenderFraction /
1260 int32_t min_height, max_height; // limits on xheight
1261 TO_ROW_IT row_it = block->get_rows();
1262 if (row_it.empty()) return; // no rows
1263
1264 // Compute the best guess of xheight of each row individually.
1265 // Use xheight and ascrise values of the rows where ascenders were found.
1266 get_min_max_xheight(block->line_size, &min_height, &max_height);
1267 STATS row_asc_xheights(min_height, max_height + 1);
1268 STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
1269 static_cast<int>(max_height * asc_frac_xheight) + 1);
1270 int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
1271 int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
1272 STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
1273 STATS row_desc_xheights(min_height, max_height + 1);
1274 STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
1275 STATS row_cap_xheights(min_height, max_height + 1);
1276 STATS row_cap_floating_xheights(min_height, max_height + 1);
1277 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1278 row = row_it.data();
1279 // Compute the xheight of this row if it has not been computed before.
1280 if (row->xheight <= 0.0) {
// NOTE(review): listing line 1281 (the start of the call whose trailing
// arguments appear on the next line) is absent.
1282 gradient, block->line_size);
1283 }
// Each sample is weighted by the row's xheight_evidence. descdrop is
// negated before being added to its histogram and negated again when the
// median is read back below.
1284 ROW_CATEGORY row_category = get_row_category(row);
1285 if (row_category == ROW_ASCENDERS_FOUND) {
1286 row_asc_xheights.add(static_cast<int32_t>(row->xheight),
1287 row->xheight_evidence);
1288 row_asc_ascrise.add(static_cast<int32_t>(row->ascrise),
1289 row->xheight_evidence);
1290 row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop),
1291 row->xheight_evidence);
1292 } else if (row_category == ROW_DESCENDERS_FOUND) {
1293 row_desc_xheights.add(static_cast<int32_t>(row->xheight),
1294 row->xheight_evidence);
1295 row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop),
1296 row->xheight_evidence);
1297 } else if (row_category == ROW_UNKNOWN) {
1298 fill_heights(row, gradient, min_height, max_height,
1299 &row_cap_xheights, &row_cap_floating_xheights);
1300 }
1301 }
1302
1303 float xheight = 0.0;
1304 float ascrise = 0.0;
1305 float descdrop = 0.0;
1306 // Compute our best guess of xheight of this block.
1307 if (row_asc_xheights.get_total() > 0) {
1308 // Determine xheight from rows where ascenders were found.
1309 xheight = row_asc_xheights.median();
1310 ascrise = row_asc_ascrise.median();
1311 descdrop = -row_asc_descdrop.median();
1312 } else if (row_desc_xheights.get_total() > 0) {
1313 // Determine xheight from rows where descenders were found.
1314 xheight = row_desc_xheights.median();
1315 descdrop = -row_desc_descdrop.median();
1316 } else if (row_cap_xheights.get_total() > 0) {
1317 // All the rows in the block were (a/de)scenderless.
1318 // Try to search for two modes in row_cap_heights that could
1319 // be the xheight and the capheight (e.g. some of the rows
1320 // were lowercase, but did not have enough (a/de)scenders.
1321 // If such two modes can not be found, this block is most
1322 // likely all caps (or all small caps, in which case the code
1323 // still works as intended).
1324 compute_xheight_from_modes(&row_cap_xheights, &row_cap_floating_xheights,
// NOTE(review): listing line 1325 (part of the argument list) is absent.
1326 block->block->classify_rotation().y() == 0.0,
1327 min_height, max_height, &(xheight), &(ascrise));
1328 if (ascrise == 0) { // assume only caps in the whole block
1329 xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
1330 }
1331 } else { // default block sizes
1332 xheight = block->line_size * CCStruct::kXHeightFraction;
1333 }
1334 // Correct xheight, ascrise and descdrop if necessary.
// If the x-height had to be clamped to the minimum, ascrise and descdrop are
// always re-derived from it; otherwise only when they are still unset
// (ascrise <= 0, descdrop >= 0).
1335 bool corrected_xheight = false;
1336 if (xheight < textord_min_xheight) {
1337 xheight = static_cast<float>(textord_min_xheight);
1338 corrected_xheight = true;
1339 }
1340 if (corrected_xheight || ascrise <= 0.0) {
1341 ascrise = xheight * asc_frac_xheight;
1342 }
1343 if (corrected_xheight || descdrop >= 0.0) {
1344 descdrop = -(xheight * desc_frac_xheight);
1345 }
1346 block->xheight = xheight;
1347
// NOTE(review): listing line 1348 is absent; the closing brace at 1351
// implies it opened a conditional around this debug print.
1349 tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
1350 xheight, ascrise, descdrop);
1351 }
1352 // Correct xheight, ascrise, descdrop of rows based on block averages.
1353 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1354 correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
1355 }
1356}
1357
// Per-row x-height estimation.
//
// NOTE(review): the first line of this function's signature (listing line
// 1366, carrying the return type, the function name and the `row` parameter)
// is absent from this copy; presumably this is Textord::compute_row_xheight
// - confirm against upstream. Listing lines 1374 and 1387 are also absent.
//
// What the visible body does: builds two height histograms for the row with
// fill_heights(), resets row->xheight and row->ascrise to 0, lets
// compute_xheight_from_modes() fill them in and stores its return value in
// row->xheight_evidence, then sets row->descdrop from compute_row_descdrop()
// when an x-height was found (0 otherwise).
1367 const FCOORD& rotation,
1368 float gradient, // global skew
1369 int block_line_size) {
1370 // Find blobs representing repeated characters in rows and mark them.
1371 // This information is used for computing row xheight and at a later
1372 // stage when words are formed by make_words.
1373 if (!row->rep_chars_marked()) {
// NOTE(review): listing line 1374 (the body of this if) is absent.
1375 }
1376
1377 int min_height, max_height;
1378 get_min_max_xheight(block_line_size, &min_height, &max_height);
1379 STATS heights(min_height, max_height + 1);
1380 STATS floating_heights(min_height, max_height + 1);
1381 fill_heights(row, gradient, min_height, max_height,
1382 &heights, &floating_heights);
1383 row->ascrise = 0.0f;
1384 row->xheight = 0.0f;
1385 row->xheight_evidence =
1386 compute_xheight_from_modes(&heights, &floating_heights,
// NOTE(review): listing line 1387 (part of the argument list) is absent.
1388 rotation.y() == 0.0,
1389 min_height, max_height,
1390 &(row->xheight), &(row->ascrise));
1391 row->descdrop = 0.0f;
1392 if (row->xheight > 0.0) {
1393 row->descdrop = static_cast<float>(
1394 compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
1395 }
1396}
1397
1398} // namespace tesseract.
1399
// fill_heights
//
// Adds one sample per blob of the row to `heights`: the blob's top measured
// relative to the row baseline, rounded to the nearest integer, provided it
// lies within [min_height, max_height]. When the blob's own height is less
// than textord_min_blob_height_fraction of that value, the same sample is
// also added to `floating_heights`.
//
// Blobs flagged joined_to_prev() contribute no sample. When the row has
// marked repeated-character sets, only the first blob of each set is sampled
// and the remaining blobs of that set are skipped.
//
// NOTE(review): listing lines 1424 and 1442 are absent from this copy; see
// the markers below.
1406void fill_heights(TO_ROW *row, float gradient, int min_height,
1407 int max_height, STATS *heights, STATS *floating_heights) {
1408 float xcentre; // centre of blob
1409 float top; // top y coord of blob
1410 float height; // height of blob
1411 BLOBNBOX *blob; // current blob
1412 int repeated_set;
1413 BLOBNBOX_IT blob_it = row->blob_list();
1414 if (blob_it.empty()) return; // no blobs in this row
1415 bool has_rep_chars =
1416 row->rep_chars_marked() && row->num_repeated_sets() > 0;
1417 do {
1418 blob = blob_it.data();
1419 if (!blob->joined_to_prev()) {
1420 xcentre = (blob->bounding_box().left() +
1421 blob->bounding_box().right()) / 2.0f;
1422 top = blob->bounding_box().top();
1423 height = blob->bounding_box().height();
// NOTE(review): listing line 1424 is absent; the else at 1426 implies it was
// an if choosing between the spline baseline (next line) and the straight
// parallel line gradient * x + parallel_c.
1425 top -= row->baseline.y(xcentre);
1426 else
1427 top -= gradient * xcentre + row->parallel_c();
1428 if (top >= min_height && top <= max_height) {
1429 heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1430 if (height / top < textord_min_blob_height_fraction) {
1431 floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1432 }
1433 }
1434 }
1435 // Skip repeated chars, since they are likely to skew the height stats.
1436 if (has_rep_chars && blob->repeated_set() != 0) {
1437 repeated_set = blob->repeated_set();
1438 blob_it.forward();
// Advance past the rest of this repeated set, stopping if the iterator
// wraps back to the first blob.
1439 while (!blob_it.at_first() &&
1440 blob_it.data()->repeated_set() == repeated_set) {
1441 blob_it.forward();
// NOTE(review): listing line 1442 is absent; presumably a debug condition
// guarding the print below - confirm against upstream.
1443 tprintf("Skipping repeated char when computing xheight\n");
1444 }
1445 } else {
1446 blob_it.forward();
1447 }
1448 } while (!blob_it.at_first());
1449}
1450
// compute_xheight_from_modes
//
// NOTE(review): the first line of the signature (listing line 1467, with the
// return type and function name) is absent from this copy; the name used
// here is taken from the call sites elsewhere in this file, whose arguments
// match this parameter list. Listing lines 1472, 1489, 1513 and 1543 are
// also absent (they look like debug guards - confirm against upstream).
//
// Picks an x-height (and, when possible, an ascender rise) from the height
// histogram:
//   - Returns 0 immediately if the histogram's mode pile is empty.
//   - Collects up to MAX_HEIGHT_MODES candidate heights with
//     compute_height_modes(); only the first is kept when cap_only is set.
//   - Looks for a pair (x, asc) of candidates where the non-floating count
//     at x is a large enough fraction of the mode count, modes[asc]/modes[x]
//     lies strictly between textord_ascx_ratio_min and
//     textord_ascx_ratio_max, and the pile at asc is large enough. On a
//     match *xheight = modes[x] and *ascrise = modes[asc] - modes[x].
//   - If no pair set *xheight, falls back to the single mode of the
//     histogram with floating blobs temporarily removed, and *ascrise = 0.
// Returns best_count: the non-floating pile count of the chosen x (pair
// case) or the pile count of the chosen mode (single-mode case).
//
// *xheight is tested against 0 after the pair search, so callers are
// expected to pass it in as 0 (both visible callers do).
1468 STATS *heights, STATS *floating_heights, bool cap_only, int min_height,
1469 int max_height, float *xheight, float *ascrise) {
1470 int blob_index = heights->mode(); // find mode
1471 int blob_count = heights->pile_count(blob_index); // get count of mode
// NOTE(review): listing line 1472 is absent; the closing brace at 1478
// implies it opened a conditional around this debug dump.
1473 tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
1474 min_height, max_height, blob_index, blob_count,
1475 heights->get_total());
1476 heights->print();
1477 floating_heights->print();
1478 }
1479 if (blob_count == 0) return 0;
1480 int modes[MAX_HEIGHT_MODES]; // biggest piles
1481 bool in_best_pile = false;
1482 int prev_size = -INT32_MAX;
1483 int best_count = 0;
1484 int mode_count = compute_height_modes(heights, min_height, max_height,
1485 modes, MAX_HEIGHT_MODES);
1486 if (cap_only && mode_count > 1)
1487 mode_count = 1;
1488 int x;
// NOTE(review): listing line 1489 is absent; the closing brace at 1493
// implies it opened a conditional around this debug dump.
1490 tprintf("found %d modes: ", mode_count);
1491 for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
1492 tprintf("\n");
1493 }
1494
1495 for (x = 0; x < mode_count - 1; x++) {
1496 if (modes[x] != prev_size + 1)
1497 in_best_pile = false; // had empty height
// Count at this height with the floating blobs taken out.
1498 int modes_x_count = heights->pile_count(modes[x]) -
1499 floating_heights->pile_count(modes[x]);
1500 if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
1501 (in_best_pile || modes_x_count > best_count)) {
1502 for (int asc = x + 1; asc < mode_count; asc++) {
1503 float ratio =
1504 static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
1505 if (textord_ascx_ratio_min < ratio &&
1506 ratio < textord_ascx_ratio_max &&
1507 (heights->pile_count(modes[asc]) >=
1508 blob_count * textord_ascheight_mode_fraction)) {
1509 if (modes_x_count > best_count) {
1510 in_best_pile = true;
1511 best_count = modes_x_count;
1512 }
// NOTE(review): listing line 1513 is absent; the closing brace at 1516
// implies it opened a conditional around this debug print.
1514 tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
1515 modes[x], modes[asc]-modes[x], modes_x_count, ratio);
1516 }
1517 prev_size = modes[x];
1518 *xheight = static_cast<float>(modes[x]);
1519 *ascrise = static_cast<float>(modes[asc] - modes[x]);
1520 }
1521 }
1522 }
1523 }
1524 if (*xheight == 0) { // single mode
1525 // Remove counts of the "floating" blobs (the one whose height is too
1526 // small in relation to it's top end of the bounding box) from heights
1527 // before computing the single-mode xheight.
1528 // Restore the counts in heights after the mode is found, since
1529 // floating blobs might be useful for determining potential ascenders
1530 // in compute_row_descdrop().
1531 if (floating_heights->get_total() > 0) {
// Both loops below run over [min_height, max_height) - max_height itself is
// not adjusted.
1532 for (x = min_height; x < max_height; ++x) {
1533 heights->add(x, -(floating_heights->pile_count(x)));
1534 }
1535 blob_index = heights->mode(); // find the modified mode
1536 for (x = min_height; x < max_height; ++x) {
1537 heights->add(x, floating_heights->pile_count(x));
1538 }
1539 }
1540 *xheight = static_cast<float>(blob_index);
1541 *ascrise = 0.0f;
1542 best_count = heights->pile_count(blob_index);
// NOTE(review): listing line 1543 is absent; presumably a debug condition
// guarding the print below - confirm against upstream.
1544 tprintf("Single mode xheight set to %g\n", *xheight);
1545 } else if (textord_debug_xheights) {
1546 tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1547 }
1548 return best_count;
1549}
1550
// compute_row_descdrop
//
// Estimates how far descenders drop below the baseline of the row.
//
// 1. Counts "potential ascenders": the total of asc_heights piles between
//    i_min and i_max, where the bucket range of asc_heights is narrowed to
//    row->xheight * textord_ascx_ratio_min .. row->xheight *
//    textord_ascx_ratio_max when it extends beyond those ratios.
// 2. Builds a histogram of (parallel baseline y at the blob centre - blob
//    bottom) for every blob not joined to its predecessor, keeping values
//    within row->xheight * textord_descx_ratio_min .. textord_descx_ratio_max.
// 3. Takes the mode of that histogram. If the mode count plus the potential
//    ascender count is below xheight_blob_count *
//    (textord_descheight_mode_fraction + textord_ascheight_mode_fraction),
//    the evidence is rejected.
//
// Returns minus the mode (a non-positive drop) when accepted, 0 otherwise.
// Divides by row->xheight, so the caller must pass a row with a non-zero
// xheight (the visible caller checks xheight > 0 first).
//
// NOTE(review): listing line 1608 is absent from this copy; see the marker
// below.
1563int32_t compute_row_descdrop(TO_ROW *row, float gradient,
1564 int xheight_blob_count, STATS *asc_heights) {
1565 // Count how many potential ascenders are in this row.
1566 int i_min = asc_heights->min_bucket();
1567 if ((i_min / row->xheight) < textord_ascx_ratio_min) {
1568 i_min = static_cast<int>(
1569 floor(row->xheight * textord_ascx_ratio_min + 0.5));
1570 }
1571 int i_max = asc_heights->max_bucket();
1572 if ((i_max / row->xheight) > textord_ascx_ratio_max) {
1573 i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
1574 }
1575 int num_potential_asc = 0;
1576 for (int i = i_min; i <= i_max; ++i) {
1577 num_potential_asc += asc_heights->pile_count(i);
1578 }
// Range of descender drops that will be accepted into the histogram.
1579 auto min_height =
1580 static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5));
1581 auto max_height =
1582 static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max));
1583 float xcentre; // centre of blob
1584 float height; // height of blob
1585 BLOBNBOX_IT blob_it = row->blob_list();
1586 BLOBNBOX *blob; // current blob
1587 STATS heights (min_height, max_height + 1);
1588 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1589 blob = blob_it.data();
1590 if (!blob->joined_to_prev()) {
1591 xcentre = (blob->bounding_box().left() +
1592 blob->bounding_box().right()) / 2.0f;
// Here `height` is the distance from the fitted parallel line down to the
// bottom of the blob.
1593 height = (gradient * xcentre + row->parallel_c() -
1594 blob->bounding_box().bottom());
1595 if (height >= min_height && height <= max_height)
1596 heights.add(static_cast<int>(floor(height + 0.5)), 1);
1597 }
1598 }
1599 int blob_index = heights.mode(); // find mode
1600 int blob_count = heights.pile_count(blob_index); // get count of mode
1601 float total_fraction =
1602 (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
1603 if (static_cast<float>(blob_count + num_potential_asc) <
1604 xheight_blob_count * total_fraction) {
1605 blob_count = 0;
1606 }
1607 int descdrop = blob_count > 0 ? -blob_index : 0;
// NOTE(review): listing line 1608 is absent; the closing brace at 1612
// implies it opened a conditional around this debug dump.
1609 tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
1610 descdrop, num_potential_asc, blob_count);
1611 heights.print();
1612 }
1613 return descdrop;
1614}
1615
1616
1623int32_t compute_height_modes(STATS *heights, // stats to search
1624 int32_t min_height, // bottom of range
1625 int32_t max_height, // top of range
1626 int32_t *modes, // output array
1627 int32_t maxmodes) { // size of modes
1628 int32_t pile_count; // no in source pile
1629 int32_t src_count; // no of source entries
1630 int32_t src_index; // current entry
1631 int32_t least_count; // height of smalllest
1632 int32_t least_index; // index of least
1633 int32_t dest_count; // index in modes
1634
1635 src_count = max_height + 1 - min_height;
1636 dest_count = 0;
1637 least_count = INT32_MAX;
1638 least_index = -1;
1639 for (src_index = 0; src_index < src_count; src_index++) {
1640 pile_count = heights->pile_count(min_height + src_index);
1641 if (pile_count > 0) {
1642 if (dest_count < maxmodes) {
1643 if (pile_count < least_count) {
1644 // find smallest in array
1645 least_count = pile_count;
1646 least_index = dest_count;
1647 }
1648 modes[dest_count++] = min_height + src_index;
1649 } else if (pile_count >= least_count) {
1650 while (least_index < maxmodes - 1) {
1651 modes[least_index] = modes[least_index + 1];
1652 // shuffle up
1653 least_index++;
1654 }
1655 // new one on end
1656 modes[maxmodes - 1] = min_height + src_index;
1657 if (pile_count == least_count) {
1658 // new smallest
1659 least_index = maxmodes - 1;
1660 } else {
1661 least_count = heights->pile_count(modes[0]);
1662 least_index = 0;
1663 for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1664 pile_count = heights->pile_count(modes[dest_count]);
1665 if (pile_count < least_count) {
1666 // find smallest
1667 least_count = pile_count;
1668 least_index = dest_count;
1669 }
1670 }
1671 }
1672 }
1673 }
1674 }
1675 return dest_count;
1676}
1677
1678
// correct_row_xheight
//
// Reconciles the row's xheight/ascrise/descdrop with the block-level values
// passed in (xheight, ascrise, descdrop), depending on the row's category:
//   - ROW_ASCENDERS_FOUND: the row's own values are kept; only a missing
//     descdrop (>= 0) is filled in, scaled from the block ratio.
//   - ROW_INVALID, or ROW_DESCENDERS_FOUND with an x-height close to the
//     block x-height or cap height, or ROW_UNKNOWN with an x-height close to
//     the block x-height: the block values replace the row values.
//   - remaining ROW_DESCENDERS_FOUND: the row x-height is kept and ascrise is
//     scaled from the block ratio.
//   - remaining ROW_UNKNOWN: the row is flagged all_caps; it either takes
//     the block values (height matches cap height) or has its measured
//     height split into x-height and ascrise using the block proportions.
//
// NOTE(review): listing lines 1688, 1694, 1697, 1734 and 1746 are absent
// from this copy; in particular the initialisers of normal_xheight and
// cap_xheight are incomplete. See the markers below.
1685void correct_row_xheight(TO_ROW *row, float xheight,
1686 float ascrise, float descdrop) {
1687 ROW_CATEGORY row_category = get_row_category(row);
// NOTE(review): listing line 1688 is absent; the closing brace at 1692
// implies it opened a conditional around this debug print.
1689 tprintf("correcting row xheight: row->xheight %.4f"
1690 ", row->acrise %.4f row->descdrop %.4f\n",
1691 row->xheight, row->ascrise, row->descdrop);
1692 }
1693 bool normal_xheight =
// NOTE(review): listing line 1694 (the initialiser of normal_xheight) is
// absent.
1695 bool cap_xheight =
1696 within_error_margin(row->xheight, xheight + ascrise,
// NOTE(review): listing line 1697 (the last argument of the call above) is
// absent.
1698 // Use the average xheight/ascrise for the following cases:
1699 // -- the xheight of the row could not be determined at all
1700 // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
1701 // and its xheight is close to either cap height or average xheight
1702 // -- the row does not have ascenders or descenders, but its xheight
1703 // is close to the average block xheight (e.g. row with "www.mmm.com")
1704 if (row_category == ROW_ASCENDERS_FOUND) {
1705 if (row->descdrop >= 0.0) {
1706 row->descdrop = row->xheight * (descdrop / xheight);
1707 }
1708 } else if (row_category == ROW_INVALID ||
1709 (row_category == ROW_DESCENDERS_FOUND &&
1710 (normal_xheight || cap_xheight)) ||
1711 (row_category == ROW_UNKNOWN && normal_xheight)) {
1712 if (textord_debug_xheights) tprintf("using average xheight\n");
1713 row->xheight = xheight;
1714 row->ascrise = ascrise;
1715 row->descdrop = descdrop;
1716 } else if (row_category == ROW_DESCENDERS_FOUND) {
1717 // Assume this is a row with mostly lowercase letters and it's xheight
1718 // is computed correctly (unfortunately there is no way to distinguish
1719 // this from the case when descenders are found, but the most common
1720 // height is capheight).
1721 if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
1722 row->ascrise = row->xheight * (ascrise / xheight);
1723 } else if (row_category == ROW_UNKNOWN) {
1724 // Otherwise assume this row is an all-caps or small-caps row
1725 // and adjust xheight and ascrise of the row.
1726
1727 row->all_caps = true;
1728 if (cap_xheight) { // regular all caps
1729 if (textord_debug_xheights) tprintf("all caps\n");
1730 row->xheight = xheight;
1731 row->ascrise = ascrise;
1732 row->descdrop = descdrop;
1733 } else { // small caps or caps with an odd xheight
// NOTE(review): listing line 1734 is absent; the closing brace at 1740
// implies it opened a conditional around the two debug prints below.
1735 if (row->xheight < xheight + ascrise && row->xheight > xheight) {
1736 tprintf("small caps\n");
1737 } else {
1738 tprintf("all caps with irregular xheight\n");
1739 }
1740 }
// The measured height is treated as a cap height: split it into ascrise and
// x-height in the block's proportions, then scale descdrop to match.
1741 row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
1742 row->xheight -= row->ascrise;
1743 row->descdrop = row->xheight * (descdrop / xheight);
1744 }
1745 }
// NOTE(review): listing line 1746 is absent; the closing brace at 1749
// implies it opened a conditional around this debug print.
1747 tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1748 " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
1749 }
1750}
1751
1752static int CountOverlaps(const TBOX& box, int min_height,
1753 BLOBNBOX_LIST* blobs) {
1754 int overlaps = 0;
1755 BLOBNBOX_IT blob_it(blobs);
1756 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1757 BLOBNBOX* blob = blob_it.data();
1758 const TBOX &blob_box = blob->bounding_box();
1759 if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
1760 ++overlaps;
1761 }
1762 }
1763 return overlaps;
1764}
1765
// separate_underlines
//
// Examines every blob in every row that is wider than
// block->line_size * textord_underline_width. Each such blob is rotated
// (undoing `rotation` and the skew given by `gradient`) and then:
//   - moved to block->underlines if test_underline() accepts it, or
//   - moved to block->large_blobs if it overlaps more than
//     textord_max_blob_overlaps blobs of its row (counted by CountOverlaps
//     with a minimum height of textord_min_blob_height_fraction * line_size,
//     rounded),
//   - otherwise left in the row.
// The temporary rotated blob is deleted in all three cases.
// testing_on together with textord_show_final_rows enables debug printing.
//
// NOTE(review): listing lines 1813-1814 are absent from this copy; see the
// marker below.
1772void separate_underlines(TO_BLOCK* block, // block to do
1773 float gradient, // skew angle
1774 FCOORD rotation, // inverse landscape
1775 bool testing_on) { // correct orientation
1776 BLOBNBOX *blob; // current blob
1777 C_BLOB *rotated_blob; // rotated blob
1778 TO_ROW *row; // current row
1779 float length; // of g_vec
1780 TBOX blob_box;
1781 FCOORD blob_rotation; // inverse of rotation
1782 FCOORD g_vec; // skew rotation
1783 BLOBNBOX_IT blob_it; // iterator
1784 // iterator
1785 BLOBNBOX_IT under_it = &block->underlines;
1786 BLOBNBOX_IT large_it = &block->large_blobs;
1787 TO_ROW_IT row_it = block->get_rows();
1788 int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
1789 block->line_size + 0.5);
1790
1791 // length of vector
// g_vec is the unit vector (1, -gradient) / sqrt(1 + gradient^2).
1792 length = sqrt(1 + gradient * gradient);
1793 g_vec = FCOORD(1 / length, -gradient / length);
1794 blob_rotation = FCOORD(rotation.x(), -rotation.y());
1795 blob_rotation.rotate(g_vec); // undoing everything
1796 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1797 row = row_it.data();
1798 // get blobs
1799 blob_it.set_to_list(row->blob_list());
1800 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
1801 blob_it.forward()) {
1802 blob = blob_it.data();
1803 blob_box = blob->bounding_box();
1804 if (blob_box.width() > block->line_size * textord_underline_width) {
1805 ASSERT_HOST(blob->cblob() != nullptr);
1806 rotated_blob = crotate_cblob (blob->cblob(),
1807 blob_rotation);
1808 if (test_underline(
1809 testing_on && textord_show_final_rows,
1810 rotated_blob, static_cast<int16_t>(row->intercept()),
1811 static_cast<int16_t>(
1812 block->line_size *
// NOTE(review): listing lines 1813-1814 (the rest of this argument and the
// end of the if condition) are absent.
1815 under_it.add_after_then_move(blob_it.extract());
1816 if (testing_on && textord_show_final_rows) {
1817 tprintf("Underlined blob at:");
1818 rotated_blob->bounding_box().print();
1819 tprintf("Was:");
1820 blob_box.print();
1821 }
1822 } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
1823 row->blob_list()) >
1824 textord_max_blob_overlaps) {
1825 large_it.add_after_then_move(blob_it.extract());
1826 if (testing_on && textord_show_final_rows) {
// NOTE(review): this recount runs after the blob was extracted from the row
// list, so the printed number may differ from the count tested above -
// confirm.
1827 tprintf("Large blob overlaps %d blobs at:",
1828 CountOverlaps(blob_box, min_blob_height,
1829 row->blob_list()));
1830 blob_box.print();
1831 }
1832 }
1833 delete rotated_blob;
1834 }
1835 }
1836 }
1837}
1838
1839
// pre_associate_blobs
//
// For every row of the block, walks the row's blob list and, starting from
// each blob, keeps merging the following blob into it for as long as the
// (growing) bounding box has a major x-overlap with that following blob.
// blob->chop() is then called with iterators marking the first and last blob
// of the merged run.
//
// When graphics are enabled and both testing_on and textord_show_final_blobs
// are set, the rotated bounding boxes of the row's blobs that are not joined
// to their predecessor are drawn, cycling the pen colour from RED to MAGENTA
// once per row. page_tr is only used to create the debug window.
//
// NOTE(review): listing lines 1898-1899 are absent from this copy; see the
// marker below.
1845void pre_associate_blobs( //make rough chars
1846 ICOORD page_tr, //top right
1847 TO_BLOCK* block, //block to do
1848 FCOORD rotation, //inverse landscape
1849 bool testing_on //correct orientation
1850) {
1851#ifndef GRAPHICS_DISABLED
1852 ScrollView::Color colour; //of boxes
1853#endif
1854 BLOBNBOX *blob; //current blob
1855 BLOBNBOX *nextblob; //next in list
1856 TBOX blob_box;
1857 FCOORD blob_rotation; //inverse of rotation
1858 BLOBNBOX_IT blob_it; //iterator
1859 BLOBNBOX_IT start_it; //iterator
1860 TO_ROW_IT row_it = block->get_rows ();
1861
1862#ifndef GRAPHICS_DISABLED
1863 colour = ScrollView::RED;
1864#endif
1865
1866 blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1867 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1868 //get blobs
1869 blob_it.set_to_list (row_it.data ()->blob_list ());
1870 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1871 blob_it.forward ()) {
1872 blob = blob_it.data ();
1873 blob_box = blob->bounding_box ();
1874 start_it = blob_it; //save start point
1875 // if (testing_on && textord_show_final_blobs)
1876 // {
1877 // tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
1878 // blob_box.left(),blob_box.bottom(),
1879 // blob_box.right(),blob_box.top(),
1880 // (void*)blob,blob_it.length());
1881 // }
// Absorb following blobs while they overlap; blob_it ends on the last blob
// merged into the run.
1882 bool overlap;
1883 do {
1884 overlap = false;
1885 if (!blob_it.at_last ()) {
1886 nextblob = blob_it.data_relative(1);
1887 overlap = blob_box.major_x_overlap(nextblob->bounding_box());
1888 if (overlap) {
1889 blob->merge(nextblob); // merge new blob
1890 blob_box = blob->bounding_box(); // get bigger box
1891 blob_it.forward();
1892 }
1893 }
1894 }
1895 while (overlap);
1896 blob->chop (&start_it, &blob_it,
1897 blob_rotation,
// NOTE(review): listing lines 1898-1899 (the remaining arguments of chop and
// the end of the call) are absent.
1900 //attempt chop
1901 }
1902#ifndef GRAPHICS_DISABLED
1903 if (testing_on && textord_show_final_blobs) {
1904 if (to_win == nullptr)
1905 create_to_win(page_tr);
1906 to_win->Pen(colour);
1907 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1908 blob_it.forward ()) {
1909 blob = blob_it.data ();
1910 blob_box = blob->bounding_box ();
1911 blob_box.rotate (rotation);
1912 if (!blob->joined_to_prev ()) {
1913 to_win->Rectangle (blob_box.left (), blob_box.bottom (),
1914 blob_box.right (), blob_box.top ());
1915 }
1916 }
// Next row gets the next colour, wrapping after MAGENTA.
1917 colour = static_cast<ScrollView::Color>(colour + 1);
1918 if (colour > ScrollView::MAGENTA)
1919 colour = ScrollView::RED;
1920 }
1921#endif
1922 }
1923}
1924
1925
1931void fit_parallel_rows( //find lines
1932 TO_BLOCK* block, //block to do
1933 float gradient, //gradient to fit
1934 FCOORD rotation, //for drawing
1935 int32_t block_edge, //edge of block
1936 bool testing_on //correct orientation
1937) {
1938#ifndef GRAPHICS_DISABLED
1939 ScrollView::Color colour; //of row
1940#endif
1941 TO_ROW_IT row_it = block->get_rows ();
1942
1943 row_it.move_to_first ();
1944 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1945 if (row_it.data ()->blob_list ()->empty ())
1946 delete row_it.extract (); //nothing in it
1947 else
1948 fit_parallel_lms (gradient, row_it.data ());
1949 }
1950#ifndef GRAPHICS_DISABLED
1951 if (testing_on) {
1952 colour = ScrollView::RED;
1953 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1954 plot_parallel_row (row_it.data (), gradient,
1955 block_edge, colour, rotation);
1956 colour = static_cast<ScrollView::Color>(colour + 1);
1957 if (colour > ScrollView::MAGENTA)
1958 colour = ScrollView::RED;
1959 }
1960 }
1961#endif
1962 row_it.sort (row_y_order); //may have gone out of order
1963}
1964
1965
// fit_parallel_lms
//
// Fits a line of the given gradient to the row: one point per blob that is
// not joined to its predecessor, taken at the horizontal centre and bottom
// of the blob's bounding box. The constrained fit (fixed gradient) is stored
// with row->set_parallel_line(); row->set_line() is then called with the
// values in hand, which a further unconstrained lms.Fit() may have replaced.
//
// NOTE(review): listing lines 1976 and 1989 are absent from this copy. The
// fitter object `lms` is used but not declared in the visible code, and the
// closing brace at 1991 implies 1989 opened a conditional around the
// unconstrained fit.
1973void fit_parallel_lms(float gradient, TO_ROW *row) {
1974 float c; // fitted line
1975 int blobcount; // no of blobs
// NOTE(review): listing line 1976 is absent; presumably the declaration of
// `lms` - confirm against upstream.
1977 BLOBNBOX_IT blob_it = row->blob_list();
1978
1979 blobcount = 0;
1980 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1981 if (!blob_it.data()->joined_to_prev()) {
1982 const TBOX& box = blob_it.data()->bounding_box();
1983 lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
1984 blobcount++;
1985 }
1986 }
1987 double error = lms.ConstrainedFit(gradient, &c);
1988 row->set_parallel_line(gradient, c, error);
// NOTE(review): listing line 1989 is absent here. blobcount is not read
// anywhere in the visible code, so it is presumably used in the missing
// condition - confirm.
1990 error = lms.Fit(&gradient, &c);
1991 }
1992 //set the other too
1993 row->set_line(gradient, c, error);
1994}
1995
1996
2002namespace tesseract {
// Textord::make_spline_rows
//
// Deletes every row of the block that has no blobs and calls
// make_baseline_spline() on each remaining row. make_old_baselines() is then
// called (inside a conditional whose opening line is missing, see below).
// When graphics are enabled and testing_on is set, the row baselines are
// plotted to to_win, cycling the colour from RED to MAGENTA.
//
// NOTE(review): listing line 2018 is absent from this copy; the closing
// brace at 2032 implies it opened a conditional block that contains the
// first plotting pass and the make_old_baselines() call.
2003void Textord::make_spline_rows(TO_BLOCK* block, // block to do
2004 float gradient, // gradient to fit
2005 bool testing_on) {
2006#ifndef GRAPHICS_DISABLED
2007 ScrollView::Color colour; //of row
2008#endif
2009 TO_ROW_IT row_it = block->get_rows ();
2010
2011 row_it.move_to_first ();
2012 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2013 if (row_it.data ()->blob_list ()->empty ())
2014 delete row_it.extract (); //nothing in it
2015 else
2016 make_baseline_spline (row_it.data (), block);
2017 }
// NOTE(review): listing line 2018 is absent here.
2019#ifndef GRAPHICS_DISABLED
2020 if (testing_on) {
2021 colour = ScrollView::RED;
2022 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
2023 row_it.forward ()) {
2024 row_it.data ()->baseline.plot (to_win, colour);
2025 colour = static_cast<ScrollView::Color>(colour + 1);
2026 if (colour > ScrollView::MAGENTA)
2027 colour = ScrollView::RED;
2028 }
2029 }
2030#endif
2031 make_old_baselines(block, testing_on, gradient);
2032 }
// Second plotting pass, run unconditionally with respect to the block above.
2033#ifndef GRAPHICS_DISABLED
2034 if (testing_on) {
2035 colour = ScrollView::RED;
2036 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
2037 row_it.data ()->baseline.plot (to_win, colour);
2038 colour = static_cast<ScrollView::Color>(colour + 1);
2039 if (colour > ScrollView::MAGENTA)
2040 colour = ScrollView::RED;
2041 }
2042 }
2043#endif
2044}
2045
2046} // namespace tesseract.
2047
2048
2056void make_baseline_spline(TO_ROW *row, //row to fit
2057 TO_BLOCK *block) {
2058 double *coeffs; // quadratic coeffs
2059 int32_t segments; // no of segments
2060
2061 // spline boundaries
2062 auto *xstarts = new int32_t[row->blob_list()->length() + 1];
2063 if (segment_baseline(row, block, segments, xstarts)
2065 coeffs = linear_spline_baseline(row, block, segments, xstarts);
2066 } else {
2067 xstarts[1] = xstarts[segments];
2068 segments = 1;
2069 coeffs = new double[3];
2070 coeffs[0] = 0;
2071 coeffs[1] = row->line_m ();
2072 coeffs[2] = row->line_c ();
2073 }
2074 row->baseline = QSPLINE (segments, xstarts, coeffs);
2075 delete[] coeffs;
2076 delete[] xstarts;
2077}
2078
2079
2087bool
2088segment_baseline( //split baseline
2089 TO_ROW* row, //row to fit
2090 TO_BLOCK* block, //block it came from
2091 int32_t& segments, //no fo segments
2092 int32_t* xstarts //coords of segments
2093) {
2094 bool needs_curve; //needs curved line
2095 int blobcount; //no of blobs
2096 int blobindex; //current blob
2097 int last_state; //above, on , below
2098 int state; //of current blob
2099 float yshift; //from baseline
2100 TBOX box; //blob box
2101 TBOX new_box; //new_it box
2102 float middle; //xcentre of blob
2103 //blobs
2104 BLOBNBOX_IT blob_it = row->blob_list ();
2105 BLOBNBOX_IT new_it = blob_it; //front end
2106 SORTED_FLOATS yshifts; //shifts from baseline
2107
2108 needs_curve = false;
2109 box = box_next_pre_chopped (&blob_it);
2110 xstarts[0] = box.left ();
2111 segments = 1;
2112 blobcount = row->blob_list ()->length ();
2114 tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
2115 blobcount, box.left (), box.bottom ());
2116 if (blobcount <= textord_spline_medianwin
2117 || blobcount < textord_spline_minblobs) {
2118 blob_it.move_to_last ();
2119 box = blob_it.data ()->bounding_box ();
2120 xstarts[1] = box.right ();
2121 return false;
2122 }
2123 last_state = 0;
2124 new_it.mark_cycle_pt ();
2125 for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2126 new_box = box_next_pre_chopped (&new_it);
2127 middle = (new_box.left () + new_box.right ()) / 2.0;
2128 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2129 //record shift
2130 yshifts.add (yshift, blobindex);
2131 if (new_it.cycled_list ()) {
2132 xstarts[1] = new_box.right ();
2133 return false;
2134 }
2135 }
2136 for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
2137 box = box_next_pre_chopped (&blob_it);
2138 do {
2139 new_box = box_next_pre_chopped (&new_it);
2140 //get middle one
2141 yshift = yshifts[textord_spline_medianwin / 2];
2142 if (yshift > textord_spline_shift_fraction * block->line_size)
2143 state = 1;
2144 else if (-yshift > textord_spline_shift_fraction * block->line_size)
2145 state = -1;
2146 else
2147 state = 0;
2148 if (state != 0)
2149 needs_curve = true;
2150 // tprintf("State=%d, prev=%d, shift=%g\n",
2151 // state,last_state,yshift);
2152 if (state != last_state && blobcount > textord_spline_minblobs) {
2153 xstarts[segments++] = box.left ();
2154 blobcount = 0;
2155 }
2156 last_state = state;
2157 yshifts.remove (blobindex - textord_spline_medianwin);
2158 box = box_next_pre_chopped (&blob_it);
2159 middle = (new_box.left () + new_box.right ()) / 2.0;
2160 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2161 yshifts.add (yshift, blobindex);
2162 blobindex++;
2163 blobcount++;
2164 }
2165 while (!new_it.cycled_list ());
2166 if (blobcount > textord_spline_minblobs || segments == 1) {
2167 xstarts[segments] = new_box.right ();
2168 }
2169 else {
2170 xstarts[--segments] = new_box.right ();
2171 }
2173 tprintf ("Made %d segments on row at (%d,%d)\n",
2174 segments, box.right (), box.bottom ());
2175 return needs_curve;
2176}
2177
2178
2186double *
2187linear_spline_baseline ( //split baseline
2188TO_ROW * row, //row to fit
2189TO_BLOCK * block, //block it came from
2190int32_t & segments, //no fo segments
2191int32_t xstarts[] //coords of segments
2192) {
2193 int blobcount; //no of blobs
2194 int blobindex; //current blob
2195 int index1, index2; //blob numbers
2196 int blobs_per_segment; //blobs in each
2197 TBOX box; //blob box
2198 TBOX new_box; //new_it box
2199 //blobs
2200 BLOBNBOX_IT blob_it = row->blob_list ();
2201 BLOBNBOX_IT new_it = blob_it; //front end
2202 float b, c; //fitted curve
2204 int32_t segment; //current segment
2205
2206 box = box_next_pre_chopped (&blob_it);
2207 xstarts[0] = box.left ();
2208 blobcount = 1;
2209 while (!blob_it.at_first ()) {
2210 blobcount++;
2211 box = box_next_pre_chopped (&blob_it);
2212 }
2213 segments = blobcount / textord_spline_medianwin;
2214 if (segments < 1)
2215 segments = 1;
2216 blobs_per_segment = blobcount / segments;
2217 // quadratic coeffs
2218 auto *coeffs = new double[segments * 3];
2220 tprintf
2221 ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2222 blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
2223 segment = 1;
2224 for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2225 box_next_pre_chopped(&new_it);
2226 index1 = 0;
2227 blobindex = index2;
2228 do {
2229 blobindex += blobs_per_segment;
2230 lms.Clear();
2231 while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2232 box = box_next_pre_chopped (&blob_it);
2233 int middle = (box.left() + box.right()) / 2;
2234 lms.Add(ICOORD(middle, box.bottom()));
2235 index1++;
2236 if (index1 == blobindex - blobs_per_segment / 2
2237 || index1 == blobcount - 1) {
2238 xstarts[segment] = box.left ();
2239 }
2240 }
2241 lms.Fit(&b, &c);
2242 coeffs[segment * 3 - 3] = 0;
2243 coeffs[segment * 3 - 2] = b;
2244 coeffs[segment * 3 - 1] = c;
2245 segment++;
2246 if (segment > segments)
2247 break;
2248
2249 blobindex += blobs_per_segment;
2250 lms.Clear();
2251 while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2252 new_box = box_next_pre_chopped (&new_it);
2253 int middle = (new_box.left() + new_box.right()) / 2;
2254 lms.Add(ICOORD (middle, new_box.bottom()));
2255 index2++;
2256 if (index2 == blobindex - blobs_per_segment / 2
2257 || index2 == blobcount - 1) {
2258 xstarts[segment] = new_box.left ();
2259 }
2260 }
2261 lms.Fit(&b, &c);
2262 coeffs[segment * 3 - 3] = 0;
2263 coeffs[segment * 3 - 2] = b;
2264 coeffs[segment * 3 - 1] = c;
2265 segment++;
2266 }
2267 while (segment <= segments);
2268 return coeffs;
2269}
2270
2271
2278void assign_blobs_to_rows( //find lines
2279 TO_BLOCK* block, //block to do
2280 float* gradient, //block skew
2281 int pass, //identification
2282 bool reject_misses, //chuck big ones out
2283 bool make_new_rows, //add rows for unmatched
2284 bool drawing_skew //draw smoothed skew
2285) {
2286 OVERLAP_STATE overlap_result; //what to do with it
2287 float ycoord; //current y
2288 float top, bottom; //of blob
2289 float g_length = 1.0f; //from gradient
2290 int16_t row_count; //no of rows
2291 int16_t left_x; //left edge
2292 int16_t last_x; //previous edge
2293 float block_skew; //y delta
2294 float smooth_factor; //for new coords
2295 float near_dist; //dist to nearest row
2296 ICOORD testpt; //testing only
2297 BLOBNBOX *blob; //current blob
2298 TO_ROW *row; //current row
2299 TO_ROW *dest_row = nullptr; //row to put blob in
2300 //iterators
2301 BLOBNBOX_IT blob_it = &block->blobs;
2302 TO_ROW_IT row_it = block->get_rows ();
2303
2304 ycoord =
2305 (block->block->pdblk.bounding_box ().bottom () +
2306 block->block->pdblk.bounding_box ().top ()) / 2.0f;
2307 if (gradient != nullptr)
2308 g_length = sqrt (1 + *gradient * *gradient);
2309#ifndef GRAPHICS_DISABLED
2310 if (drawing_skew)
2311 to_win->SetCursor(block->block->pdblk.bounding_box ().left (), ycoord);
2312#endif
2314 blob_it.sort (blob_x_order);
2315 smooth_factor = 1.0;
2316 block_skew = 0.0f;
2317 row_count = row_it.length (); //might have rows
2318 if (!blob_it.empty ()) {
2319 left_x = blob_it.data ()->bounding_box ().left ();
2320 }
2321 else {
2322 left_x = block->block->pdblk.bounding_box ().left ();
2323 }
2324 last_x = left_x;
2325 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2326 blob = blob_it.data ();
2327 if (gradient != nullptr) {
2328 block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
2329 + *gradient / g_length * blob->bounding_box ().left ();
2330 }
2331 else if (blob->bounding_box ().left () - last_x > block->line_size / 2
2332 && last_x - left_x > block->line_size * 2
2333 && textord_interpolating_skew) {
2334 // tprintf("Interpolating skew from %g",block_skew);
2335 block_skew *= static_cast<float>(blob->bounding_box ().left () - left_x)
2336 / (last_x - left_x);
2337 // tprintf("to %g\n",block_skew);
2338 }
2339 last_x = blob->bounding_box ().left ();
2340 top = blob->bounding_box ().top () - block_skew;
2341 bottom = blob->bounding_box ().bottom () - block_skew;
2342#ifndef GRAPHICS_DISABLED
2343 if (drawing_skew)
2344 to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
2345#endif
2346 if (!row_it.empty ()) {
2347 for (row_it.move_to_first ();
2348 !row_it.at_last () && row_it.data ()->min_y () > top;
2349 row_it.forward ());
2350 row = row_it.data ();
2351 if (row->min_y () <= top && row->max_y () >= bottom) {
2352 //any overlap
2353 dest_row = row;
2354 overlap_result = most_overlapping_row (&row_it, dest_row,
2355 top, bottom,
2356 block->line_size,
2357 blob->bounding_box ().
2358 contains (testpt));
2359 if (overlap_result == NEW_ROW && !reject_misses)
2360 overlap_result = ASSIGN;
2361 }
2362 else {
2363 overlap_result = NEW_ROW;
2364 if (!make_new_rows) {
2365 near_dist = row_it.data_relative (-1)->min_y () - top;
2366 //below bottom
2367 if (bottom < row->min_y ()) {
2368 if (row->min_y () - bottom <=
2369 (block->line_spacing -
2371 //done it
2372 overlap_result = ASSIGN;
2373 dest_row = row;
2374 }
2375 }
2376 else if (near_dist > 0
2377 && near_dist < bottom - row->max_y ()) {
2378 row_it.backward ();
2379 dest_row = row_it.data ();
2380 if (dest_row->min_y () - bottom <=
2381 (block->line_spacing -
2383 //done it
2384 overlap_result = ASSIGN;
2385 }
2386 }
2387 else {
2388 if (top - row->max_y () <=
2389 (block->line_spacing -
2390 block->line_size) * (textord_overlap_x +
2392 //done it
2393 overlap_result = ASSIGN;
2394 dest_row = row;
2395 }
2396 }
2397 }
2398 }
2399 if (overlap_result == ASSIGN)
2400 dest_row->add_blob (blob_it.extract (), top, bottom,
2401 block->line_size);
2402 if (overlap_result == NEW_ROW) {
2403 if (make_new_rows && top - bottom < block->max_blob_size) {
2404 dest_row =
2405 new TO_ROW (blob_it.extract (), top, bottom,
2406 block->line_size);
2407 row_count++;
2408 if (bottom > row_it.data ()->min_y ())
2409 row_it.add_before_then_move (dest_row);
2410 //insert in right place
2411 else
2412 row_it.add_after_then_move (dest_row);
2413 smooth_factor =
2414 1.0 / (row_count * textord_skew_lag +
2415 textord_skewsmooth_offset);
2416 }
2417 else
2418 overlap_result = REJECT;
2419 }
2420 }
2421 else if (make_new_rows && top - bottom < block->max_blob_size) {
2422 overlap_result = NEW_ROW;
2423 dest_row =
2424 new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
2425 row_count++;
2426 row_it.add_after_then_move(dest_row);
2427 smooth_factor = 1.0 / (row_count * textord_skew_lag +
2428 textord_skewsmooth_offset2);
2429 }
2430 else
2431 overlap_result = REJECT;
2432 if (blob->bounding_box ().contains(testpt) && textord_debug_blob) {
2433 if (overlap_result != REJECT) {
2434 tprintf("Test blob assigned to row at (%g,%g) on pass %d\n",
2435 dest_row->min_y(), dest_row->max_y(), pass);
2436 }
2437 else {
2438 tprintf("Test blob assigned to no row on pass %d\n", pass);
2439 }
2440 }
2441 if (overlap_result != REJECT) {
2442 while (!row_it.at_first() &&
2443 row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
2444 row = row_it.extract();
2445 row_it.backward();
2446 row_it.add_before_then_move(row);
2447 }
2448 while (!row_it.at_last() &&
2449 row_it.data ()->min_y() < row_it.data_relative (1)->min_y()) {
2450 row = row_it.extract();
2451 row_it.forward();
2452 // Keep rows in order.
2453 row_it.add_after_then_move(row);
2454 }
2455 BLOBNBOX_IT added_blob_it(dest_row->blob_list());
2456 added_blob_it.move_to_last();
2457 TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
2458 if (dest_row->blob_list()->singleton() ||
2459 !prev_box.major_x_overlap(blob->bounding_box())) {
2460 block_skew = (1 - smooth_factor) * block_skew
2461 + smooth_factor * (blob->bounding_box().bottom() -
2462 dest_row->initial_min_y());
2463 }
2464 }
2465 }
2466 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2467 if (row_it.data()->blob_list()->empty())
2468 delete row_it.extract(); // Discard empty rows.
2469 }
2470}
2471
2472
2479 TO_ROW_IT* row_it, //iterator
2480 TO_ROW*& best_row, //output row
2481 float top, //top of blob
2482 float bottom, //bottom of blob
2483 float rowsize, //max row size
2484 bool testing_blob //test stuff
2485) {
2486 OVERLAP_STATE result; //result of tests
2487 float overlap; //of blob & row
2488 float bestover; //nearest row
2489 float merge_top, merge_bottom; //size of merged row
2490 ICOORD testpt; //testing only
2491 TO_ROW *row; //current row
2492 TO_ROW *test_row; //for multiple overlaps
2493 BLOBNBOX_IT blob_it; //for merging rows
2494
2495 result = ASSIGN;
2496 row = row_it->data ();
2497 bestover = top - bottom;
2498 if (top > row->max_y ())
2499 bestover -= top - row->max_y ();
2500 if (bottom < row->min_y ())
2501 //compute overlap
2502 bestover -= row->min_y () - bottom;
2503 if (testing_blob && textord_debug_blob) {
2504 tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n",
2505 bottom, top, row->min_y(), row->max_y(), rowsize, bestover);
2506 }
2507 test_row = row;
2508 do {
2509 if (!row_it->at_last ()) {
2510 row_it->forward ();
2511 test_row = row_it->data ();
2512 if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
2513 merge_top =
2514 test_row->max_y () >
2515 row->max_y ()? test_row->max_y () : row->max_y ();
2516 merge_bottom =
2517 test_row->min_y () <
2518 row->min_y ()? test_row->min_y () : row->min_y ();
2519 if (merge_top - merge_bottom <= rowsize) {
2520 if (testing_blob && textord_debug_blob) {
2521 tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
2522 row->min_y (), row->max_y (),
2523 test_row->min_y (), test_row->max_y ());
2524 }
2525 test_row->set_limits (merge_bottom, merge_top);
2526 blob_it.set_to_list (test_row->blob_list ());
2527 blob_it.add_list_after (row->blob_list ());
2528 blob_it.sort (blob_x_order);
2529 row_it->backward ();
2530 delete row_it->extract ();
2531 row_it->forward ();
2532 bestover = -1.0f; //force replacement
2533 }
2534 overlap = top - bottom;
2535 if (top > test_row->max_y ())
2536 overlap -= top - test_row->max_y ();
2537 if (bottom < test_row->min_y ())
2538 overlap -= test_row->min_y () - bottom;
2539 if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2540 result = REJECT;
2541 }
2542 if (overlap > bestover) {
2543 bestover = overlap; //find biggest overlap
2544 row = test_row;
2545 }
2546 if (testing_blob && textord_debug_blob) {
2547 tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n",
2548 bottom, top, test_row->min_y(), test_row->max_y(),
2549 rowsize, overlap, bestover);
2550 }
2551 }
2552 }
2553 }
2554 while (!row_it->at_last ()
2555 && test_row->min_y () <= top && test_row->max_y () >= bottom);
2556 while (row_it->data () != row)
2557 row_it->backward (); //make it point to row
2558 //doesn't overlap much
2559 if (top - bottom - bestover > rowsize * textord_overlap_x &&
2560 (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
2561 && result == ASSIGN)
2562 result = NEW_ROW; //doesn't overlap enough
2563 best_row = row;
2564 return result;
2565}
2566
2567
2573int blob_x_order( //sort function
2574 const void *item1, //items to compare
2575 const void *item2) {
2576 //converted ptr
2577 const BLOBNBOX *blob1 = *reinterpret_cast<const BLOBNBOX* const*>(item1);
2578 //converted ptr
2579 const BLOBNBOX *blob2 = *reinterpret_cast<const BLOBNBOX* const*>(item2);
2580
2581 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
2582 return -1;
2583 else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
2584 return 1;
2585 else
2586 return 0;
2587}
2588
2589
2595int row_y_order( //sort function
2596 const void *item1, //items to compare
2597 const void *item2) {
2598 //converted ptr
2599 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2600 //converted ptr
2601 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2602
2603 if (row1->parallel_c () > row2->parallel_c ())
2604 return -1;
2605 else if (row1->parallel_c () < row2->parallel_c ())
2606 return 1;
2607 else
2608 return 0;
2609}
2610
2611
2617int row_spacing_order( //sort function
2618 const void *item1, //items to compare
2619 const void *item2) {
2620 //converted ptr
2621 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2622 //converted ptr
2623 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2624
2625 if (row1->spacing < row2->spacing)
2626 return -1;
2627 else if (row1->spacing > row2->spacing)
2628 return 1;
2629 else
2630 return 0;
2631}
2632
2640 BLOBNBOX_IT box_it(row->blob_list()); // Iterator.
2641 int num_repeated_sets = 0;
2642 if (!box_it.empty()) {
2643 do {
2644 BLOBNBOX* bblob = box_it.data();
2645 int repeat_length = 1;
2646 if (bblob->flow() == BTFT_LEADER &&
2647 !bblob->joined_to_prev() && bblob->cblob() != nullptr) {
2648 BLOBNBOX_IT test_it(box_it);
2649 for (test_it.forward(); !test_it.at_first();) {
2650 bblob = test_it.data();
2651 if (bblob->flow() != BTFT_LEADER)
2652 break;
2653 test_it.forward();
2654 bblob = test_it.data();
2655 if (bblob->joined_to_prev() || bblob->cblob() == nullptr) {
2656 repeat_length = 0;
2657 break;
2658 }
2659 ++repeat_length;
2660 }
2661 }
2662 if (repeat_length >= kMinLeaderCount) {
2663 num_repeated_sets++;
2664 for (; repeat_length > 0; box_it.forward(), --repeat_length) {
2665 bblob = box_it.data();
2666 bblob->set_repeated_set(num_repeated_sets);
2667 }
2668 } else {
2669 bblob->set_repeated_set(0);
2670 box_it.forward();
2671 }
2672 } while (!box_it.at_first()); // until all done
2673 }
2674 row->set_num_repeated_sets(num_repeated_sets);
2675}
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:665
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:611
@ BTFT_LEADER
Definition: blobbox.h:121
int32_t choose_nth_item(int32_t index, float *array, int32_t count)
Definition: statistc.cpp:630
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
#define INT_VAR(name, val, comment)
Definition: params.h:303
#define double_VAR(name, val, comment)
Definition: params.h:312
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool test_underline(bool testing_on, C_BLOB *blob, int16_t baseline, int16_t xheight)
Definition: blkocc.cpp:48
ScrollView * to_win
Definition: drawtord.cpp:35
void draw_occupation(int32_t xleft, int32_t ybottom, int32_t min_y, int32_t max_y, int32_t occupation[], int32_t thresholds[])
Definition: drawtord.cpp:163
void plot_parallel_row(TO_ROW *row, float gradient, int32_t left, ScrollView::Color colour, FCOORD rotation)
Definition: drawtord.cpp:122
void plot_to_row(TO_ROW *row, ScrollView::Color colour, FCOORD rotation)
Definition: drawtord.cpp:88
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:44
int textord_spline_minblobs
Definition: makerow.cpp:63
void compute_dropout_distances(int32_t *occupation, int32_t *thresholds, int32_t line_count)
Definition: makerow.cpp:902
bool find_best_dropout_row(TO_ROW *row, int32_t distance, float dist_limit, int32_t line_index, TO_ROW_IT *row_it, bool testing_on)
Definition: makerow.cpp:652
double textord_chop_width
Definition: makerow.cpp:76
double textord_xheight_mode_fraction
Definition: makerow.cpp:89
void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, STATS *floating_heights)
Definition: makerow.cpp:1406
double textord_skew_ile
Definition: makerow.cpp:72
void cleanup_rows_making(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:517
double textord_min_blob_height_fraction
Definition: makerow.cpp:87
bool segment_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t *xstarts)
Definition: makerow.cpp:2088
void delete_non_dropout_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:570
void pre_associate_blobs(ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
Definition: makerow.cpp:1845
const int kMinSize
Definition: makerow.cpp:377
double textord_ascx_ratio_max
Definition: makerow.cpp:95
int32_t compute_height_modes(STATS *heights, int32_t min_height, int32_t max_height, int32_t *modes, int32_t maxmodes)
Definition: makerow.cpp:1623
void mark_repeated_chars(TO_ROW *row)
Definition: makerow.cpp:2639
int textord_test_x
Definition: makerow.cpp:60
void make_initial_textrows(ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
Definition: makerow.cpp:226
void fit_parallel_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:1931
bool textord_debug_blob
Definition: makerow.cpp:101
int textord_min_blobs_in_row
Definition: makerow.cpp:62
bool textord_show_final_blobs
Definition: makerow.cpp:47
int textord_lms_line_trials
Definition: makerow.cpp:99
double textord_linespace_iqrlimit
Definition: makerow.cpp:74
bool textord_show_parallel_rows
Definition: makerow.cpp:44
int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only, int min_height, int max_height, float *xheight, float *ascrise)
Definition: makerow.cpp:1467
double textord_spline_outlier_fraction
Definition: makerow.cpp:71
void expand_rows(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:949
bool textord_test_landscape
Definition: makerow.cpp:48
double textord_descx_ratio_max
Definition: makerow.cpp:97
double textord_minxh
Definition: makerow.cpp:80
int blob_x_order(const void *item1, const void *item2)
Definition: makerow.cpp:2573
bool textord_show_final_rows
Definition: makerow.cpp:46
bool textord_old_xheight
Definition: makerow.cpp:52
void fit_parallel_lms(float gradient, TO_ROW *row)
Definition: makerow.cpp:1973
void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop)
Definition: makerow.cpp:1685
void compute_occupation_threshold(int32_t low_window, int32_t high_window, int32_t line_count, int32_t *occupation, int32_t *thresholds)
Definition: makerow.cpp:821
float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks)
Definition: makerow.cpp:163
void separate_underlines(TO_BLOCK *block, float gradient, FCOORD rotation, bool testing_on)
Definition: makerow.cpp:1772
double textord_ascx_ratio_min
Definition: makerow.cpp:94
void compute_row_stats(TO_BLOCK *block, bool testing_on)
Definition: makerow.cpp:1143
OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, TO_ROW *&best_row, float top, float bottom, float rowsize, bool testing_blob)
Definition: makerow.cpp:2478
int textord_min_xheight
Definition: makerow.cpp:67
double textord_skew_lag
Definition: makerow.cpp:73
bool textord_show_initial_rows
Definition: makerow.cpp:43
double * linear_spline_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t xstarts[])
Definition: makerow.cpp:2187
double textord_excess_blobsize
Definition: makerow.cpp:83
double textord_descx_ratio_min
Definition: makerow.cpp:96
int row_spacing_order(const void *item1, const void *item2)
Definition: makerow.cpp:2617
int textord_spline_medianwin
Definition: makerow.cpp:64
double textord_xheight_error_margin
Definition: makerow.cpp:98
bool textord_fix_xheight_bug
Definition: makerow.cpp:53
int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count, STATS *asc_heights)
Definition: makerow.cpp:1563
bool textord_debug_xheights
Definition: makerow.cpp:55
bool textord_heavy_nr
Definition: makerow.cpp:42
double textord_width_limit
Definition: makerow.cpp:75
bool textord_fix_makerow_bug
Definition: makerow.cpp:54
void assign_blobs_to_rows(TO_BLOCK *block, float *gradient, int pass, bool reject_misses, bool make_new_rows, bool drawing_skew)
Definition: makerow.cpp:2278
int row_y_order(const void *item1, const void *item2)
Definition: makerow.cpp:2595
void fit_lms_line(TO_ROW *row)
Definition: makerow.cpp:266
double textord_spline_shift_fraction
Definition: makerow.cpp:69
void compute_line_occupation(TO_BLOCK *block, float gradient, int32_t min_y, int32_t max_y, int32_t *occupation, int32_t *deltas)
Definition: makerow.cpp:768
void compute_page_skew(TO_BLOCK_LIST *blocks, float &page_m, float &page_err)
Definition: makerow.cpp:286
bool textord_show_expanded_rows
Definition: makerow.cpp:45
bool textord_parallel_baselines
Definition: makerow.cpp:49
double textord_underline_width
Definition: makerow.cpp:85
int textord_test_y
Definition: makerow.cpp:61
double textord_min_linesize
Definition: makerow.cpp:81
bool textord_new_initial_xheight
Definition: makerow.cpp:100
const int kMinLeaderCount
Definition: makerow.cpp:105
bool textord_old_baselines
Definition: makerow.cpp:51
float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: makerow.cpp:200
void vigorous_noise_removal(TO_BLOCK *block)
Definition: makerow.cpp:466
double textord_ascheight_mode_fraction
Definition: makerow.cpp:91
double textord_occupancy_threshold
Definition: makerow.cpp:84
#define MAX_HEIGHT_MODES
Definition: makerow.cpp:103
void adjust_row_limits(TO_BLOCK *block)
Definition: makerow.cpp:1107
const double kNoiseSize
Definition: makerow.cpp:376
bool textord_straight_baselines
Definition: makerow.cpp:50
TBOX deskew_block_coords(TO_BLOCK *block, float gradient)
Definition: makerow.cpp:732
void make_baseline_spline(TO_ROW *row, TO_BLOCK *block)
Definition: makerow.cpp:2056
bool within_error_margin(float test, float num, float margin)
Definition: makerow.h:128
void get_min_max_xheight(int block_linesize, int *min_height, int *max_height)
Definition: makerow.h:115
OVERLAP_STATE
Definition: makerow.h:29
@ ASSIGN
Definition: makerow.h:30
@ REJECT
Definition: makerow.h:31
@ NEW_ROW
Definition: makerow.h:32
ROW_CATEGORY get_row_category(const TO_ROW *row)
Definition: makerow.h:122
ROW_CATEGORY
Definition: makerow.h:35
@ ROW_DESCENDERS_FOUND
Definition: makerow.h:37
@ ROW_UNKNOWN
Definition: makerow.h:38
@ ROW_ASCENDERS_FOUND
Definition: makerow.h:36
@ ROW_INVALID
Definition: makerow.h:39
bool textord_oldbl_debug
Definition: oldbasel.cpp:39
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:120
void set_repeated_set(int set_id)
Definition: blobbox.h:265
int repeated_set() const
Definition: blobbox.h:262
const TBOX & bounding_box() const
Definition: blobbox.h:230
C_BLOB * cblob() const
Definition: blobbox.h:268
bool joined_to_prev() const
Definition: blobbox.h:256
BlobTextFlowType flow() const
Definition: blobbox.h:295
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:92
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:733
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:604
bool all_caps
Definition: blobbox.h:646
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:640
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:612
float intercept() const
Definition: blobbox.h:589
float line_c() const
Definition: blobbox.h:574
float spacing
Definition: blobbox.h:656
float max_y() const
Definition: blobbox.h:559
void set_limits(float new_min, float new_max)
Definition: blobbox.h:622
QSPLINE baseline
Definition: blobbox.h:670
float initial_min_y() const
Definition: blobbox.h:568
bool merged
Definition: blobbox.h:645
int xheight_evidence
Definition: blobbox.h:658
float xheight
Definition: blobbox.h:657
bool rep_chars_marked() const
Definition: blobbox.h:631
float parallel_c() const
Definition: blobbox.h:580
int num_repeated_sets() const
Definition: blobbox.h:637
float min_y() const
Definition: blobbox.h:562
float believability() const
Definition: blobbox.h:586
float descdrop
Definition: blobbox.h:660
float ascrise
Definition: blobbox.h:659
float line_m() const
Definition: blobbox.h:571
float line_error() const
Definition: blobbox.h:577
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:600
BLOCK * block
Definition: blobbox.h:777
float xheight
Definition: blobbox.h:788
BLOBNBOX_LIST blobs
Definition: blobbox.h:772
TO_ROW * key_row
Definition: blobbox.h:798
TO_ROW_LIST * get_rows()
Definition: blobbox.h:704
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:774
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:776
float line_size
Definition: blobbox.h:785
float baseline_offset
Definition: blobbox.h:787
float max_blob_size
Definition: blobbox.h:786
BLOBNBOX_LIST underlines
Definition: blobbox.h:773
float line_spacing
Definition: blobbox.h:779
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:775
static const double kXHeightCapRatio
Definition: ccstruct.h:37
static const double kXHeightFraction
Definition: ccstruct.h:34
static const double kDescenderFraction
Definition: ccstruct.h:33
static const double kAscenderFraction
Definition: ccstruct.h:35
ICOORD step(int index) const
Definition: coutln.h:144
static C_OUTLINE * deep_copy(const C_OUTLINE *src)
Definition: coutln.h:261
const ICOORD & start_pos() const
Definition: coutln.h:148
int32_t pathlength() const
Definition: coutln.h:135
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
Definition: detlinefit.cpp:130
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
FCOORD classify_rotation() const
Definition: ocrblock.h:140
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
integer coordinate
Definition: points.h:32
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
Definition: points.h:189
void rotate(const FCOORD vec)
Definition: points.h:763
float y() const
Definition: points.h:210
float x() const
Definition: points.h:207
bool IsText() const
Definition: polyblk.h:49
double y(double x) const
Definition: quspline.cpp:209
Definition: rect.h:34
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
int32_t max_bucket() const
Definition: statistc.cpp:213
int32_t pile_count(int32_t value) const
Definition: statistc.h:76
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
int32_t get_total() const
Definition: statistc.h:84
void print() const
Definition: statistc.cpp:526
int32_t min_bucket() const
Definition: statistc.cpp:198
int32_t mode() const
Definition: statistc.cpp:107
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241
TBOX bounding_box() const
Definition: stepblob.cpp:253
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
void CheckInverseFlagAndDirection()
Definition: stepblob.cpp:224
void remove(int32_t key)
Definition: sortflts.cpp:52
void add(float value, int32_t key)
Definition: sortflts.cpp:27
void compute_row_xheight(TO_ROW *row, const FCOORD &rotation, float gradient, int block_line_size)
Definition: makerow.cpp:1366
bool textord_single_height_mode
Definition: textord.h:261
void make_spline_rows(TO_BLOCK *block, float gradient, bool testing_on)
Definition: makerow.cpp:2003
void compute_block_xheight(TO_BLOCK *block, float gradient)
Definition: makerow.cpp:1254
void DrawTo(int x, int y)
Definition: scrollview.cpp:525
void SetCursor(int x, int y)
Definition: scrollview.cpp:519
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600