tesseract 4.1.1
Loading...
Searching...
No Matches
makerow.cpp File Reference
#include <vector>
#include "blobbox.h"
#include "ccstruct.h"
#include "detlinefit.h"
#include "statistc.h"
#include "drawtord.h"
#include "blkocc.h"
#include "sortflts.h"
#include "oldbasel.h"
#include "textord.h"
#include "tordmain.h"
#include "underlin.h"
#include "makerow.h"
#include "tprintf.h"
#include "tovars.h"
#include <algorithm>

Go to the source code of this file.

Namespaces

namespace  tesseract
 

Macros

#define MAX_HEIGHT_MODES   12
 

Functions

make_single_row

Arrange the blobs into a single row. If there is only a single blob, two rows are made instead, in case the top-level blob is a container of the real blobs to recognize.

float make_single_row (ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks)
 
make_rows

Arrange the blobs into rows.

float make_rows (ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
 
make_initial_textrows

Arrange the good blobs into rows of text.

void make_initial_textrows (ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
 
fit_lms_line

Fit an LMS line to a row.

void fit_lms_line (TO_ROW *row)
 
find_best_dropout_row

Decide whether this row should be deleted because it has a neighbour with better dropout characteristics. Returns true if the row should be deleted.

bool find_best_dropout_row (TO_ROW *row, int32_t distance, float dist_limit, int32_t line_index, TO_ROW_IT *row_it, bool testing_on)
 
deskew_block_coords

Compute the bounding box of all the blobs in the block if they were deskewed without actually doing it.

TBOX deskew_block_coords (TO_BLOCK *block, float gradient)
 
compute_line_occupation

Compute the pixel projection back on the y axis given the global skew. Also compute the 1st derivative.

void compute_line_occupation (TO_BLOCK *block, float gradient, int32_t min_y, int32_t max_y, int32_t *occupation, int32_t *deltas)
 
void compute_occupation_threshold (int32_t low_window, int32_t high_window, int32_t line_count, int32_t *occupation, int32_t *thresholds)
 
compute_dropout_distances

Compute the distance from each coordinate to the nearest dropout.

void compute_dropout_distances (int32_t *occupation, int32_t *thresholds, int32_t line_count)
 
expand_rows

Expand each row to the least of its allowed size and touching its neighbours. If the expansion would entirely swallow a neighbouring row then do so.

void expand_rows (ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
 
void adjust_row_limits (TO_BLOCK *block)
 
compute_row_stats

Compute the linespacing and offset.

void compute_row_stats (TO_BLOCK *block, bool testing_on)
 
fill_heights

Fill the given heights with heights of the blobs that are legal candidates for estimating xheight.

void fill_heights (TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, STATS *floating_heights)
 
compute_xheight_from_modes

Given a STATS object heights, looks for the two most frequently occurring heights that look like xheight and xheight + ascrise. If found, sets the values of *xheight and *ascrise accordingly; otherwise sets *xheight to any most frequently occurring height and sets *ascrise to 0. Returns the number of times xheight occurred in heights. For each mode that is considered for being an xheight, the count of floating blobs (stored in floating_heights) is subtracted from the total count of the blobs of this height. This is done because blobs that sit far above the baseline could represent valid ascenders, but it is highly unlikely that such a character's height will be an xheight (e.g. -, ', =, ^, ‘, ", ’, etc.). If cap_only, then force finding of only the top mode.

int compute_xheight_from_modes (STATS *heights, STATS *floating_heights, bool cap_only, int min_height, int max_height, float *xheight, float *ascrise)
 
compute_row_descdrop

Estimates the descdrop of this row. This function looks for "significant" descenders of lowercase letters (those that could not just be the small descenders of upper case letters like Q,J). The function also takes into account how many potential ascenders this row might contain. If the number of potential ascenders along with descenders is close to the expected fraction of the total number of blobs in the row, the function returns the descender height, returns 0 otherwise.

int32_t compute_row_descdrop (TO_ROW *row, float gradient, int xheight_blob_count, STATS *asc_heights)
 
compute_height_modes

Find the top maxmodes values in the input array and put their indices in the output in the order in which they occurred.

int32_t compute_height_modes (STATS *heights, int32_t min_height, int32_t max_height, int32_t *modes, int32_t maxmodes)
 
correct_row_xheight

Adjust the xheight etc of this row if not within reasonable limits of the average for the block.

void correct_row_xheight (TO_ROW *row, float xheight, float ascrise, float descdrop)
 
separate_underlines

Test wide objects for being potential underlines. If they are then put them in a separate list in the block.

void separate_underlines (TO_BLOCK *block, float gradient, FCOORD rotation, bool testing_on)
 
pre_associate_blobs

Associate overlapping blobs and fake chop wide blobs.

void pre_associate_blobs (ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
 
fit_parallel_rows

Re-fit the rows in the block to the given gradient.

void fit_parallel_rows (TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
 
fit_parallel_lms

Fit an LMS line to a row. Make the fit parallel to the given gradient and set the row accordingly.

void fit_parallel_lms (float gradient, TO_ROW *row)
 
make_baseline_spline

Fit a spline to the baseline of a row and set the row accordingly. (Unlike fit_parallel_lms, this function takes no gradient argument.)

void make_baseline_spline (TO_ROW *row, TO_BLOCK *block)
 
segment_baseline

Divide the baseline up into segments which require a different quadratic fitted to them. Return true if enough blobs were far enough away to need a quadratic.

bool segment_baseline (TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t *xstarts)
 
linear_spline_baseline

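Divide the baseline up into segments and fit a straight line to each one (a linear spline), in contrast to segment_baseline, which reports whether a quadratic is needed.

Returns
a pointer to the array of fitted segment coefficients; the segment count and the segment x-boundaries are passed back through segments and xstarts.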
double * linear_spline_baseline (TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t xstarts[])
 
assign_blobs_to_rows

Make enough rows to allocate all the given blobs to one. If a block skew is given, use that, else attempt to track it.

void assign_blobs_to_rows (TO_BLOCK *block, float *gradient, int pass, bool reject_misses, bool make_new_rows, bool drawing_skew)
 
most_overlapping_row

Return the row which most overlaps the blob.

OVERLAP_STATE most_overlapping_row (TO_ROW_IT *row_it, TO_ROW *&best_row, float top, float bottom, float rowsize, bool testing_blob)
 
blob_x_order

Sort function to sort blobs in x from page left.

int blob_x_order (const void *item1, const void *item2)
 
row_y_order

Sort function to sort rows in y from page top.

int row_y_order (const void *item1, const void *item2)
 
row_spacing_order

Qsort style function to compare 2 TO_ROWS based on their spacing value.

int row_spacing_order (const void *item1, const void *item2)
 
mark_repeated_chars

Mark blobs marked with BTFT_LEADER in repeated sets using the repeated_set member of BLOBNBOX.

void mark_repeated_chars (TO_ROW *row)
 

Variables

bool textord_heavy_nr = false
 
bool textord_show_initial_rows = false
 
bool textord_show_parallel_rows = false
 
bool textord_show_expanded_rows = false
 
bool textord_show_final_rows = false
 
bool textord_show_final_blobs = false
 
bool textord_test_landscape = false
 
bool textord_parallel_baselines = true
 
bool textord_straight_baselines = false
 
bool textord_old_baselines = true
 
bool textord_old_xheight = false
 
bool textord_fix_xheight_bug = true
 
bool textord_fix_makerow_bug = true
 
bool textord_debug_xheights = false
 
int textord_test_x = -INT32_MAX
 
int textord_test_y = -INT32_MAX
 
int textord_min_blobs_in_row = 4
 
int textord_spline_minblobs = 8
 
int textord_spline_medianwin = 6
 
int textord_min_xheight = 10
 
double textord_spline_shift_fraction = 0.02
 
double textord_spline_outlier_fraction = 0.1
 
double textord_skew_ile = 0.5
 
double textord_skew_lag = 0.02
 
double textord_linespace_iqrlimit = 0.2
 
double textord_width_limit = 8
 
double textord_chop_width = 1.5
 
double textord_minxh = 0.25
 
double textord_min_linesize = 1.25
 
double textord_excess_blobsize = 1.3
 
double textord_occupancy_threshold = 0.4
 
double textord_underline_width = 2.0
 
double textord_min_blob_height_fraction = 0.75
 
double textord_xheight_mode_fraction = 0.4
 
double textord_ascheight_mode_fraction = 0.08
 
double textord_ascx_ratio_min = 1.25
 
double textord_ascx_ratio_max = 1.8
 
double textord_descx_ratio_min = 0.25
 
double textord_descx_ratio_max = 0.6
 
double textord_xheight_error_margin = 0.1
 
int textord_lms_line_trials = 12
 
bool textord_new_initial_xheight = true
 
bool textord_debug_blob = false
 
const int kMinLeaderCount = 5
 

compute_page_skew

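Compute the skew over a full page by selecting the textord_skew_ile fractile (the median with the default of 0.5) of the row gradients across all text blocks. The page error is the same fractile of the row line errors, selected independently of the gradient.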

const double kNoiseSize = 0.5
 
const int kMinSize = 8
 
void compute_page_skew (TO_BLOCK_LIST *blocks, float &page_m, float &page_err)
 
void vigorous_noise_removal (TO_BLOCK *block)
 
void cleanup_rows_making (ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
 
void delete_non_dropout_rows (TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
 

Macro Definition Documentation

◆ MAX_HEIGHT_MODES

#define MAX_HEIGHT_MODES   12

Definition at line 103 of file makerow.cpp.

Function Documentation

◆ adjust_row_limits()

void adjust_row_limits ( TO_BLOCK block)

adjust_row_limits

Change the limits of rows to suit the default fractions.

Definition at line 1107 of file makerow.cpp.

1109 {
1110 TO_ROW *row; //current row
1111 float size; //size of row
1112 float ymax; //top of row
1113 float ymin; //bottom of row
1114 TO_ROW_IT row_it = block->get_rows ();
1115
1117 tprintf("Adjusting row limits for block(%d,%d)\n",
1118 block->block->pdblk.bounding_box().left(),
1119 block->block->pdblk.bounding_box().top());
1120 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1121 row = row_it.data ();
1122 size = row->max_y () - row->min_y ();
1124 tprintf("Row at %f has min %f, max %f, size %f\n",
1125 row->intercept(), row->min_y(), row->max_y(), size);
1132 row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
1133 row->merged = false;
1134 }
1135}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool textord_show_expanded_rows
Definition: makerow.cpp:45
float intercept() const
Definition: blobbox.h:589
float max_y() const
Definition: blobbox.h:559
void set_limits(float new_min, float new_max)
Definition: blobbox.h:622
bool merged
Definition: blobbox.h:645
float min_y() const
Definition: blobbox.h:562
BLOCK * block
Definition: blobbox.h:777
TO_ROW_LIST * get_rows()
Definition: blobbox.h:704
static const double kXHeightFraction
Definition: ccstruct.h:34
static const double kDescenderFraction
Definition: ccstruct.h:33
static const double kAscenderFraction
Definition: ccstruct.h:35
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59

◆ assign_blobs_to_rows()

void assign_blobs_to_rows ( TO_BLOCK block,
float *  gradient,
int  pass,
bool  reject_misses,
bool  make_new_rows,
bool  drawing_skew 
)

Definition at line 2278 of file makerow.cpp.

2285 {
2286 OVERLAP_STATE overlap_result; //what to do with it
2287 float ycoord; //current y
2288 float top, bottom; //of blob
2289 float g_length = 1.0f; //from gradient
2290 int16_t row_count; //no of rows
2291 int16_t left_x; //left edge
2292 int16_t last_x; //previous edge
2293 float block_skew; //y delta
2294 float smooth_factor; //for new coords
2295 float near_dist; //dist to nearest row
2296 ICOORD testpt; //testing only
2297 BLOBNBOX *blob; //current blob
2298 TO_ROW *row; //current row
2299 TO_ROW *dest_row = nullptr; //row to put blob in
2300 //iterators
2301 BLOBNBOX_IT blob_it = &block->blobs;
2302 TO_ROW_IT row_it = block->get_rows ();
2303
2304 ycoord =
2305 (block->block->pdblk.bounding_box ().bottom () +
2306 block->block->pdblk.bounding_box ().top ()) / 2.0f;
2307 if (gradient != nullptr)
2308 g_length = sqrt (1 + *gradient * *gradient);
2309#ifndef GRAPHICS_DISABLED
2310 if (drawing_skew)
2311 to_win->SetCursor(block->block->pdblk.bounding_box ().left (), ycoord);
2312#endif
2314 blob_it.sort (blob_x_order);
2315 smooth_factor = 1.0;
2316 block_skew = 0.0f;
2317 row_count = row_it.length (); //might have rows
2318 if (!blob_it.empty ()) {
2319 left_x = blob_it.data ()->bounding_box ().left ();
2320 }
2321 else {
2322 left_x = block->block->pdblk.bounding_box ().left ();
2323 }
2324 last_x = left_x;
2325 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
2326 blob = blob_it.data ();
2327 if (gradient != nullptr) {
2328 block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
2329 + *gradient / g_length * blob->bounding_box ().left ();
2330 }
2331 else if (blob->bounding_box ().left () - last_x > block->line_size / 2
2332 && last_x - left_x > block->line_size * 2
2333 && textord_interpolating_skew) {
2334 // tprintf("Interpolating skew from %g",block_skew);
2335 block_skew *= static_cast<float>(blob->bounding_box ().left () - left_x)
2336 / (last_x - left_x);
2337 // tprintf("to %g\n",block_skew);
2338 }
2339 last_x = blob->bounding_box ().left ();
2340 top = blob->bounding_box ().top () - block_skew;
2341 bottom = blob->bounding_box ().bottom () - block_skew;
2342#ifndef GRAPHICS_DISABLED
2343 if (drawing_skew)
2344 to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
2345#endif
2346 if (!row_it.empty ()) {
2347 for (row_it.move_to_first ();
2348 !row_it.at_last () && row_it.data ()->min_y () > top;
2349 row_it.forward ());
2350 row = row_it.data ();
2351 if (row->min_y () <= top && row->max_y () >= bottom) {
2352 //any overlap
2353 dest_row = row;
2354 overlap_result = most_overlapping_row (&row_it, dest_row,
2355 top, bottom,
2356 block->line_size,
2357 blob->bounding_box ().
2358 contains (testpt));
2359 if (overlap_result == NEW_ROW && !reject_misses)
2360 overlap_result = ASSIGN;
2361 }
2362 else {
2363 overlap_result = NEW_ROW;
2364 if (!make_new_rows) {
2365 near_dist = row_it.data_relative (-1)->min_y () - top;
2366 //below bottom
2367 if (bottom < row->min_y ()) {
2368 if (row->min_y () - bottom <=
2369 (block->line_spacing -
2371 //done it
2372 overlap_result = ASSIGN;
2373 dest_row = row;
2374 }
2375 }
2376 else if (near_dist > 0
2377 && near_dist < bottom - row->max_y ()) {
2378 row_it.backward ();
2379 dest_row = row_it.data ();
2380 if (dest_row->min_y () - bottom <=
2381 (block->line_spacing -
2383 //done it
2384 overlap_result = ASSIGN;
2385 }
2386 }
2387 else {
2388 if (top - row->max_y () <=
2389 (block->line_spacing -
2390 block->line_size) * (textord_overlap_x +
2392 //done it
2393 overlap_result = ASSIGN;
2394 dest_row = row;
2395 }
2396 }
2397 }
2398 }
2399 if (overlap_result == ASSIGN)
2400 dest_row->add_blob (blob_it.extract (), top, bottom,
2401 block->line_size);
2402 if (overlap_result == NEW_ROW) {
2403 if (make_new_rows && top - bottom < block->max_blob_size) {
2404 dest_row =
2405 new TO_ROW (blob_it.extract (), top, bottom,
2406 block->line_size);
2407 row_count++;
2408 if (bottom > row_it.data ()->min_y ())
2409 row_it.add_before_then_move (dest_row);
2410 //insert in right place
2411 else
2412 row_it.add_after_then_move (dest_row);
2413 smooth_factor =
2414 1.0 / (row_count * textord_skew_lag +
2415 textord_skewsmooth_offset);
2416 }
2417 else
2418 overlap_result = REJECT;
2419 }
2420 }
2421 else if (make_new_rows && top - bottom < block->max_blob_size) {
2422 overlap_result = NEW_ROW;
2423 dest_row =
2424 new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
2425 row_count++;
2426 row_it.add_after_then_move(dest_row);
2427 smooth_factor = 1.0 / (row_count * textord_skew_lag +
2428 textord_skewsmooth_offset2);
2429 }
2430 else
2431 overlap_result = REJECT;
2432 if (blob->bounding_box ().contains(testpt) && textord_debug_blob) {
2433 if (overlap_result != REJECT) {
2434 tprintf("Test blob assigned to row at (%g,%g) on pass %d\n",
2435 dest_row->min_y(), dest_row->max_y(), pass);
2436 }
2437 else {
2438 tprintf("Test blob assigned to no row on pass %d\n", pass);
2439 }
2440 }
2441 if (overlap_result != REJECT) {
2442 while (!row_it.at_first() &&
2443 row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
2444 row = row_it.extract();
2445 row_it.backward();
2446 row_it.add_before_then_move(row);
2447 }
2448 while (!row_it.at_last() &&
2449 row_it.data ()->min_y() < row_it.data_relative (1)->min_y()) {
2450 row = row_it.extract();
2451 row_it.forward();
2452 // Keep rows in order.
2453 row_it.add_after_then_move(row);
2454 }
2455 BLOBNBOX_IT added_blob_it(dest_row->blob_list());
2456 added_blob_it.move_to_last();
2457 TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
2458 if (dest_row->blob_list()->singleton() ||
2459 !prev_box.major_x_overlap(blob->bounding_box())) {
2460 block_skew = (1 - smooth_factor) * block_skew
2461 + smooth_factor * (blob->bounding_box().bottom() -
2462 dest_row->initial_min_y());
2463 }
2464 }
2465 }
2466 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2467 if (row_it.data()->blob_list()->empty())
2468 delete row_it.extract(); // Discard empty rows.
2469 }
2470}
ScrollView * to_win
Definition: drawtord.cpp:35
int textord_test_x
Definition: makerow.cpp:60
bool textord_debug_blob
Definition: makerow.cpp:101
int blob_x_order(const void *item1, const void *item2)
Definition: makerow.cpp:2573
OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, TO_ROW *&best_row, float top, float bottom, float rowsize, bool testing_blob)
Definition: makerow.cpp:2478
double textord_skew_lag
Definition: makerow.cpp:73
int textord_test_y
Definition: makerow.cpp:61
OVERLAP_STATE
Definition: makerow.h:29
@ ASSIGN
Definition: makerow.h:30
@ REJECT
Definition: makerow.h:31
@ NEW_ROW
Definition: makerow.h:32
const TBOX & bounding_box() const
Definition: blobbox.h:230
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:733
float initial_min_y() const
Definition: blobbox.h:568
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:600
BLOBNBOX_LIST blobs
Definition: blobbox.h:772
float line_size
Definition: blobbox.h:785
float line_spacing
Definition: blobbox.h:779
integer coordinate
Definition: points.h:32
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
void DrawTo(int x, int y)
Definition: scrollview.cpp:525
void SetCursor(int x, int y)
Definition: scrollview.cpp:519

◆ blob_x_order()

int blob_x_order ( const void *  item1,
const void *  item2 
)

Definition at line 2573 of file makerow.cpp.

2575 {
// qsort-style comparator. item1 and item2 each point to a BLOBNBOX pointer
// (hence the double indirection in the casts below). Blobs are ordered by
// the left edge of their bounding box: -1 if blob1 is further left, 1 if
// blob2 is further left, 0 if the left edges are equal.
2576 //converted ptr
2577 const BLOBNBOX *blob1 = *reinterpret_cast<const BLOBNBOX* const*>(item1);
2578 //converted ptr
2579 const BLOBNBOX *blob2 = *reinterpret_cast<const BLOBNBOX* const*>(item2);
2580
2581 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
2582 return -1;
2583 else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
2584 return 1;
2585 else
2586 return 0;
2587}

◆ cleanup_rows_making()

void cleanup_rows_making ( ICOORD  page_tr,
TO_BLOCK block,
float  gradient,
FCOORD  rotation,
int32_t  block_edge,
bool  testing_on 
)

cleanup_rows_making

Remove overlapping rows and fit all the blobs to what's left.

Definition at line 517 of file makerow.cpp.

524 {
525 //iterators
526 BLOBNBOX_IT blob_it = &block->blobs;
527 TO_ROW_IT row_it = block->get_rows ();
528
529#ifndef GRAPHICS_DISABLED
530 if (textord_show_parallel_rows && testing_on) {
531 if (to_win == nullptr)
532 create_to_win(page_tr);
533 }
534#endif
535 //get row coords
536 fit_parallel_rows(block,
537 gradient,
538 rotation,
539 block_edge,
540 textord_show_parallel_rows && testing_on);
542 gradient,
543 rotation,
544 block_edge,
545 textord_show_parallel_rows && testing_on);
546 expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
547 blob_it.set_to_list (&block->blobs);
548 row_it.set_to_list (block->get_rows ());
549 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
550 blob_it.add_list_after (row_it.data ()->blob_list ());
551 //give blobs back
552 assign_blobs_to_rows (block, &gradient, 1, false, false, false);
553 //now new rows must be genuine
554 blob_it.set_to_list (&block->blobs);
555 blob_it.add_list_after (&block->large_blobs);
556 assign_blobs_to_rows (block, &gradient, 2, true, true, false);
557 //safe to use big ones now
558 blob_it.set_to_list (&block->blobs);
559 //throw all blobs in
560 blob_it.add_list_after (&block->noise_blobs);
561 blob_it.add_list_after (&block->small_blobs);
562 assign_blobs_to_rows (block, &gradient, 3, false, false, false);
563}
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:44
void delete_non_dropout_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:570
void fit_parallel_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:1931
bool textord_show_parallel_rows
Definition: makerow.cpp:44
void expand_rows(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:949
void assign_blobs_to_rows(TO_BLOCK *block, float *gradient, int pass, bool reject_misses, bool make_new_rows, bool drawing_skew)
Definition: makerow.cpp:2278
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:774
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:775

◆ compute_dropout_distances()

void compute_dropout_distances ( int32_t *  occupation,
int32_t *  thresholds,
int32_t  line_count 
)

Definition at line 902 of file makerow.cpp.

906 {
// Rewrites thresholds[] in place as signed line distances. A "transition" is
// a line whose occupation is at or above its threshold while the line before
// it was below its own (pre-overwrite) threshold; that is the exit test of
// the inner loop. Lines from a transition onward receive 0, -1, -2, ...;
// lines leading up to a transition are back-filled with 1, 2, 3, ... for as
// long as that is smaller than the magnitude already stored there. Lines
// with no transition ahead of them keep the forward (non-positive) count.
907 int32_t line_index; //of thresholds line
908 int32_t distance; //from prev dropout
909 int32_t next_dist; //to next dropout
910 int32_t back_index; //for back filling
911 int32_t prev_threshold; //before overwrite
912
// Start from a magnitude larger than any real distance, so the first
// back-fill is stopped by reaching index 0 or by meeting the forward count.
913 distance = -line_count;
914 line_index = 0;
915 do {
// Forward pass: store the (negative) running distance, remembering the
// threshold that was overwritten so the previous line can still be tested.
916 do {
917 distance--;
918 prev_threshold = thresholds[line_index];
919 //distance from prev
920 thresholds[line_index] = distance;
921 line_index++;
922 }
923 while (line_index < line_count
924 && (occupation[line_index] < thresholds[line_index]
925 || occupation[line_index - 1] >= prev_threshold));
926 if (line_index < line_count) {
// Backward pass: distance tracks the value stored at back_index, so the
// loop stops where the distance to this transition is no longer smaller.
927 back_index = line_index - 1;
928 next_dist = 1;
929 while (next_dist < -distance && back_index >= 0) {
930 thresholds[back_index] = next_dist;
931 back_index--;
932 next_dist++;
933 distance++;
934 }
// The next forward step decrements this to 0 for the transition line.
935 distance = 1;
936 }
937 }
938 while (line_index < line_count);
939}

◆ compute_height_modes()

int32_t compute_height_modes ( STATS heights,
int32_t  min_height,
int32_t  max_height,
int32_t *  modes,
int32_t  maxmodes 
)

Definition at line 1623 of file makerow.cpp.

1627 { // size of modes
// Scans heights min_height..max_height in increasing order and keeps up to
// maxmodes of them in modes[], in scan order. While modes[] is not full,
// every height with a non-zero pile count is appended. Once it is full, a
// height whose pile count is >= the smallest kept count replaces that
// smallest entry: later entries shift down one slot and the new height is
// appended at the end. Returns the number of entries stored in modes[].
1628 int32_t pile_count; // no in source pile
1629 int32_t src_count; // no of source entries
1630 int32_t src_index; // current entry
1631 int32_t least_count; // height of smallest
1632 int32_t least_index; // index of least
1633 int32_t dest_count; // index in modes
1634
1635 src_count = max_height + 1 - min_height;
1636 dest_count = 0;
1637 least_count = INT32_MAX;
1638 least_index = -1;
1639 for (src_index = 0; src_index < src_count; src_index++) {
1640 pile_count = heights->pile_count(min_height + src_index);
1641 if (pile_count > 0) {
1642 if (dest_count < maxmodes) {
1643 if (pile_count < least_count) {
1644 // find smallest in array
1645 least_count = pile_count;
1646 least_index = dest_count;
1647 }
1648 modes[dest_count++] = min_height + src_index;
1649 } else if (pile_count >= least_count) {
// Drop the current smallest by closing the gap it leaves.
1650 while (least_index < maxmodes - 1) {
1651 modes[least_index] = modes[least_index + 1];
1652 // shuffle up
1653 least_index++;
1654 }
1655 // new one on end
1656 modes[maxmodes - 1] = min_height + src_index;
1657 if (pile_count == least_count) {
1658 // new smallest
1659 least_index = maxmodes - 1;
1660 } else {
// Rescan for the new smallest. dest_count is reused as the loop index
// here and ends at maxmodes, which is still the number of stored entries.
1661 least_count = heights->pile_count(modes[0]);
1662 least_index = 0;
1663 for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1664 pile_count = heights->pile_count(modes[dest_count]);
1665 if (pile_count < least_count) {
1666 // find smallest
1667 least_count = pile_count;
1668 least_index = dest_count;
1669 }
1670 }
1671 }
1672 }
1673 }
1674 }
1675 return dest_count;
1676}
int32_t pile_count(int32_t value) const
Definition: statistc.h:76

◆ compute_line_occupation()

void compute_line_occupation ( TO_BLOCK block,
float  gradient,
int32_t  min_y,
int32_t  max_y,
int32_t *  occupation,
int32_t *  deltas 
)

Definition at line 768 of file makerow.cpp.

775 {
// Builds a y-projection of the blobs in the block's rows after rotating each
// blob box by the inverse of the given gradient. For every blob, the box
// width is added to deltas[] at the box bottom and subtracted at the box top
// (both offset by min_y). occupation[] is then the running sum of deltas[].
// Both arrays are indexed 0..max_y - min_y, and ASSERT_HOST checks every
// blob index against that range.
776 int32_t line_count; //maxy-miny+1
777 int32_t line_index; //of scan line
778 int index; //array index for daft compilers
779 TO_ROW *row; //current row
780 TO_ROW_IT row_it = block->get_rows ();
781 BLOBNBOX *blob; //current blob
782 BLOBNBOX_IT blob_it; //iterator
783 float length; //of skew vector
784 TBOX blob_box; //bounding box
785 FCOORD rotation; //inverse of skew
786
787 line_count = max_y - min_y + 1;
// Unit vector (1, -gradient) / length: rotating by it undoes the skew.
788 length = sqrt (gradient * gradient + 1);
789 rotation = FCOORD (1 / length, -gradient / length);
790 for (line_index = 0; line_index < line_count; line_index++)
791 deltas[line_index] = 0;
792 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
793 row = row_it.data ();
794 blob_it.set_to_list (row->blob_list ());
795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
796 blob_it.forward ()) {
797 blob = blob_it.data ();
798 blob_box = blob->bounding_box ();
799 blob_box.rotate (rotation);//de-skew it
800 int32_t width = blob_box.right() - blob_box.left();
801 index = blob_box.bottom() - min_y;
802 ASSERT_HOST(index >= 0 && index < line_count);
803 // count transitions
804 deltas[index] += width;
805 index = blob_box.top() - min_y;
806 ASSERT_HOST(index >= 0 && index < line_count);
807 deltas[index] -= width;
808 }
809 }
// Integrate the first derivative to get the occupation per scan line.
810 occupation[0] = deltas[0];
811 for (line_index = 1; line_index < line_count; line_index++)
812 occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
813}
#define ASSERT_HOST(x)
Definition: errcode.h:88
Definition: points.h:189
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t right() const
Definition: rect.h:79

◆ compute_occupation_threshold()

void compute_occupation_threshold ( int32_t  low_window,
int32_t  high_window,
int32_t  line_count,
int32_t *  occupation,
int32_t *  thresholds 
)

compute_occupation_threshold

Compute thresholds for textline or not for the occupation array.

Definition at line 821 of file makerow.cpp.

827 {
// Fills thresholds[0..line_count) from occupation[]. Every threshold is
// (sum - min_occ) / divisor + min_occ, where sum and min_occ are the sum and
// the minimum of occupation over a window of lines, and divisor is
// ceil((low_window + high_window) / textord_occupancy_threshold).
// If the window is narrower than line_count it slides along the array;
// otherwise one sum and minimum over all lines is used for every entry.
828 int32_t line_index; //of thresholds line
829 int32_t low_index; //in occupation
830 int32_t high_index; //in occupation
831 int32_t sum; //current average
832 int32_t divisor; //to get thresholds
833 int32_t min_index; //of min occ
834 int32_t min_occ; //min in locality
835 int32_t test_index; //for finding min
836
837 divisor =
838 static_cast<int32_t>(ceil ((low_window + high_window) / textord_occupancy_threshold));
839 if (low_window + high_window < line_count) {
// Prime the window with the first low_window + high_window lines.
840 for (sum = 0, high_index = 0; high_index < low_window; high_index++)
841 sum += occupation[high_index];
842 for (low_index = 0; low_index < high_window; low_index++, high_index++)
843 sum += occupation[high_index];
844 min_occ = occupation[0];
845 min_index = 0;
846 for (test_index = 1; test_index < high_index; test_index++) {
847 if (occupation[test_index] <= min_occ) {
848 min_occ = occupation[test_index];
849 min_index = test_index; //find min in region
850 }
851 }
// The first low_window lines all share the primed window's threshold.
852 for (line_index = 0; line_index < low_window; line_index++)
853 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
854 //same out to end
// Slide the window one line at a time, updating sum and minimum.
855 for (low_index = 0; high_index < line_count; low_index++, high_index++) {
856 sum -= occupation[low_index];
857 sum += occupation[high_index];
858 if (occupation[high_index] <= min_occ) {
859 //find min in region
860 min_occ = occupation[high_index];
861 min_index = high_index;
862 }
863 //lost min from region
864 if (min_index <= low_index) {
// The old minimum left the window, so rescan what remains.
865 min_occ = occupation[low_index + 1];
866 min_index = low_index + 1;
867 for (test_index = low_index + 2; test_index <= high_index;
868 test_index++) {
869 if (occupation[test_index] <= min_occ) {
870 min_occ = occupation[test_index];
871 //find min in region
872 min_index = test_index;
873 }
874 }
875 }
876 thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
877 }
878 }
879 else {
// Window covers everything: one sum and one minimum for the whole array.
880 min_occ = occupation[0];
881 min_index = 0;
882 for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
883 if (occupation[low_index] < min_occ) {
884 min_occ = occupation[low_index];
885 min_index = low_index;
886 }
887 sum += occupation[low_index];
888 }
889 line_index = 0;
890 }
// Remaining lines (all of them in the else case) reuse the last sum/minimum.
891 for (; line_index < line_count; line_index++)
892 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
893 //same out to end
894}
double textord_occupancy_threshold
Definition: makerow.cpp:84

◆ compute_page_skew()

void compute_page_skew ( TO_BLOCK_LIST *  blocks,
float &  page_m,
float &  page_err 
)

Definition at line 286 of file makerow.cpp.

290 {
// Sets page_m and page_err from the per-row line fits of all text blocks
// (a block is skipped when its poly_block is non-null and not text). Row
// gradients and row errors are collected into two arrays, and the
// textord_skew_ile fractile of each array is selected independently with
// choose_nth_item. If no rows are found, both outputs are set to 0.
291 int32_t row_count; //total rows
292 int32_t blob_count; //total_blobs
293 int32_t row_err; //integer error
294 int32_t row_index; //of total
295 TO_ROW *row; //current row
296 TO_BLOCK_IT block_it = blocks; //iterator
297
298 row_count = 0;
299 blob_count = 0;
300 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
301 block_it.forward ()) {
302 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
303 if (pb != nullptr && !pb->IsText())
304 continue; // Pretend non-text blocks don't exist.
305 row_count += block_it.data ()->get_rows ()->length ();
306 //count up rows
307 TO_ROW_IT row_it(block_it.data()->get_rows());
308 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
309 blob_count += row_it.data ()->blob_list ()->length ();
310 }
311 if (row_count == 0) {
312 page_m = 0.0f;
313 page_err = 0.0f;
314 return;
315 }
// NOTE(review): both arrays are sized by the total blob count, not the row
// count - presumably an upper bound on the entries written below. Confirm
// that rows with empty blob lists cannot overflow it in the fallback loop.
316 // of rows
317 std::vector<float> gradients(blob_count);
318 // of rows
319 std::vector<float> errors(blob_count);
320
321 row_index = 0;
322 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
323 block_it.forward ()) {
324 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
325 if (pb != nullptr && !pb->IsText())
326 continue; // Pretend non-text blocks don't exist.
327 TO_ROW_IT row_it(block_it.data ()->get_rows());
328 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
329 row = row_it.data ();
// From here on blob_count is the blob count of this row only.
330 blob_count = row->blob_list ()->length ();
331 row_err = static_cast<int32_t>(ceil (row->line_error ()));
332 if (row_err <= 0)
333 row_err = 1;
334 if (textord_biased_skewcalc) {
// NOTE(review): blob_count is divided by row_err here and again in the
// for-init below, so a row adds blob_count / row_err / row_err entries.
// Confirm the double division is intended.
335 blob_count /= row_err;
336 for (blob_count /= row_err; blob_count > 0; blob_count--) {
337 gradients[row_index] = row->line_m ();
338 errors[row_index] = row->line_error ();
339 row_index++;
340 }
341 }
342 else if (blob_count >= textord_min_blobs_in_row) {
343 //get gradient
344 gradients[row_index] = row->line_m ();
345 errors[row_index] = row->line_error ();
346 row_index++;
347 }
348 }
349 }
350 if (row_index == 0) {
// No row qualified above: fall back to one entry per row of every text
// block, regardless of how many blobs the row holds.
351 //desperate
352 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
353 block_it.forward ()) {
354 POLY_BLOCK* pb = block_it.data()->block->pdblk.poly_block();
355 if (pb != nullptr && !pb->IsText())
356 continue; // Pretend non-text blocks don't exist.
357 TO_ROW_IT row_it(block_it.data()->get_rows());
358 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
359 row_it.forward ()) {
360 row = row_it.data ();
361 gradients[row_index] = row->line_m ();
362 errors[row_index] = row->line_error ();
363 row_index++;
364 }
365 }
366 }
// row_count now means the number of collected entries, not the row total.
367 row_count = row_index;
368 row_index = choose_nth_item (static_cast<int32_t>(row_count * textord_skew_ile),
369 &gradients[0], row_count);
370 page_m = gradients[row_index];
371 row_index = choose_nth_item (static_cast<int32_t>(row_count * textord_skew_ile),
372 &errors[0], row_count);
373 page_err = errors[row_index];
374}
int32_t choose_nth_item(int32_t index, float *array, int32_t count)
Definition: statistc.cpp:630
double textord_skew_ile
Definition: makerow.cpp:72
int textord_min_blobs_in_row
Definition: makerow.cpp:62
float line_m() const
Definition: blobbox.h:571
float line_error() const
Definition: blobbox.h:577
bool IsText() const
Definition: polyblk.h:49

◆ compute_row_descdrop()

int32_t compute_row_descdrop ( TO_ROW *  row,
float  gradient,
int  xheight_blob_count,
STATS *  asc_heights 
)

Definition at line 1563 of file makerow.cpp.

// compute_row_descdrop: estimate the descender drop of a row.
// Each blob bottom is measured below the line gradient * x + row->parallel_c(),
// the distances are bucketed into a histogram, and the negated mode is
// returned. 0 is returned when the mode's count plus the potential-ascender
// count is below xheight_blob_count * (descender + ascender mode fractions).
1564 {
1565 // Count how many potential ascenders are in this row.
// The scanned bucket range is clamped to
// [textord_ascx_ratio_min, textord_ascx_ratio_max] * row->xheight.
1566 int i_min = asc_heights->min_bucket();
1567 if ((i_min / row->xheight) < textord_ascx_ratio_min) {
1568 i_min = static_cast<int>(
1569 floor(row->xheight * textord_ascx_ratio_min + 0.5));
1570 }
1571 int i_max = asc_heights->max_bucket();
1572 if ((i_max / row->xheight) > textord_ascx_ratio_max) {
1573 i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
1574 }
1575 int num_potential_asc = 0;
1576 for (int i = i_min; i <= i_max; ++i) {
1577 num_potential_asc += asc_heights->pile_count(i);
1578 }
// Accepted descender-drop range, expressed as ratios of the row x-height.
1579 auto min_height =
1580 static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5));
1581 auto max_height =
1582 static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max));
1583 float xcentre; // centre of blob
1584 float height; // height of blob
1585 BLOBNBOX_IT blob_it = row->blob_list();
1586 BLOBNBOX *blob; // current blob
1587 STATS heights (min_height, max_height + 1);
1588 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1589 blob = blob_it.data();
1590 if (!blob->joined_to_prev()) {
1591 xcentre = (blob->bounding_box().left() +
1592 blob->bounding_box().right()) / 2.0f;
// Distance from the fitted parallel line down to the blob bottom.
1593 height = (gradient * xcentre + row->parallel_c() -
1594 blob->bounding_box().bottom());
1595 if (height >= min_height && height <= max_height)
1596 heights.add(static_cast<int>(floor(height + 0.5)), 1);
1597 }
1598 }
1599 int blob_index = heights.mode(); // find mode
1600 int blob_count = heights.pile_count(blob_index); // get count of mode
1601 float total_fraction =
1602 (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
// Too little evidence: discard the mode so that descdrop becomes 0.
1603 if (static_cast<float>(blob_count + num_potential_asc) <
1604 xheight_blob_count * total_fraction) {
1605 blob_count = 0;
1606 }
1607 int descdrop = blob_count > 0 ? -blob_index : 0;
// NOTE(review): listing line 1608 is absent from this extract, so the `}` at
// 1612 has no visible opener. Presumably a debug guard on
// textord_debug_xheights -- confirm against makerow.cpp.
1609 tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
1610 descdrop, num_potential_asc, blob_count);
1611 heights.print();
1612 }
1613 return descdrop;
1614}
double textord_ascx_ratio_max
Definition: makerow.cpp:95
double textord_descx_ratio_max
Definition: makerow.cpp:97
double textord_ascx_ratio_min
Definition: makerow.cpp:94
double textord_descx_ratio_min
Definition: makerow.cpp:96
bool textord_debug_xheights
Definition: makerow.cpp:55
double textord_ascheight_mode_fraction
Definition: makerow.cpp:91
bool joined_to_prev() const
Definition: blobbox.h:256
float xheight
Definition: blobbox.h:657
float parallel_c() const
Definition: blobbox.h:580
STATS
Definition: statistc.h:31
int32_t max_bucket() const
Definition: statistc.cpp:213
int32_t min_bucket() const
Definition: statistc.cpp:198

◆ compute_row_stats()

void compute_row_stats ( TO_BLOCK *  block,
bool  testing_on 
)

Definition at line 1143 of file makerow.cpp.

// compute_row_stats: derive block-level line metrics from row spacings.
// Walks the block's rows from last to first, storing in each row the
// difference between the next-visited row's intercept and its own as
// `spacing`. The rows at the 3/4, 1/4 and 1/2 positions by spacing then give
// an inter-quartile range and a median row, from which block->key_row,
// line_size, line_spacing, max_blob_size and baseline_offset are updated.
// testing_on enables the tprintf diagnostics.
1146 {
1147 int32_t row_index; //of median
1148 TO_ROW *row; //current row
1149 TO_ROW *prev_row; //previous row
1150 float iqr; //inter quartile range
1151 TO_ROW_IT row_it = block->get_rows ();
1152 //number of rows
1153 int16_t rowcount = row_it.length ();
1154 // for choose nth
1155 std::vector<TO_ROW*> rows(rowcount);
// rowcount is reused below as the number of rows that received a spacing.
1156 rowcount = 0;
1157 prev_row = nullptr;
1158 row_it.move_to_last (); //start at bottom
1159 do {
1160 row = row_it.data ();
1161 if (prev_row != nullptr) {
1162 rows[rowcount++] = prev_row;
1163 prev_row->spacing = row->intercept () - prev_row->intercept ();
1164 if (testing_on)
1165 tprintf ("Row at %g yields spacing of %g\n",
1166 row->intercept (), prev_row->spacing);
1167 }
1168 prev_row = row;
1169 row_it.backward ();
1170 }
1171 while (!row_it.at_last ());
// Defaults taken from the last row visited; replaced below when at least one
// spacing was recorded.
1172 block->key_row = prev_row;
1173 block->baseline_offset =
1174 fmod (prev_row->parallel_c (), block->line_spacing);
1175 if (testing_on)
1176 tprintf ("Blob based spacing=(%g,%g), offset=%g",
1177 block->line_size, block->line_spacing, block->baseline_offset);
1178 if (rowcount > 0) {
// Upper quartile, lower quartile, then median of the rows ordered by spacing.
1179 row_index = choose_nth_item(rowcount * 3 / 4, &rows[0], rowcount,
1180 sizeof (TO_ROW *), row_spacing_order);
1181 iqr = rows[row_index]->spacing;
1182 row_index = choose_nth_item(rowcount / 4, &rows[0], rowcount,
1183 sizeof (TO_ROW *), row_spacing_order);
1184 iqr -= rows[row_index]->spacing;
1185 row_index = choose_nth_item(rowcount / 2, &rows[0], rowcount,
1186 sizeof (TO_ROW *), row_spacing_order);
1187 block->key_row = rows[row_index];
1188 if (testing_on)
1189 tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
1190 if (rowcount > 2
1191 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
// NOTE(review): listing line 1192 is absent from this extract; the `}` at 1201
// and the `else` at 1202 imply an `if (...) {` here. Presumably a test on
// textord_new_initial_xheight -- confirm against makerow.cpp.
1193 if (rows[row_index]->spacing < block->line_spacing
1194 && rows[row_index]->spacing > block->line_size)
1195 //within range
1196 block->line_size = rows[row_index]->spacing;
1197 //spacing=size
1198 else if (rows[row_index]->spacing > block->line_spacing)
1199 block->line_size = block->line_spacing;
1200 //too big so use max
1201 }
1202 else {
1203 if (rows[row_index]->spacing < block->line_spacing)
1204 block->line_size = rows[row_index]->spacing;
1205 else
1206 block->line_size = block->line_spacing;
1207 //too big so use max
1208 }
// Never let the line size drop below the configured minimum x-height.
1209 if (block->line_size < textord_min_xheight)
1210 block->line_size = (float) textord_min_xheight;
1211 block->line_spacing = rows[row_index]->spacing;
1212 block->max_blob_size =
// NOTE(review): listing line 1213 (the right-hand side of the assignment
// above) is absent from this extract. Presumably it involves
// textord_excess_blobsize -- confirm against makerow.cpp.
1214 }
1215 block->baseline_offset = fmod (rows[row_index]->intercept (),
1216 block->line_spacing);
1217 }
1218 if (testing_on)
1219 tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
1220 block->line_size, block->line_spacing, block->baseline_offset);
1221}
double textord_linespace_iqrlimit
Definition: makerow.cpp:74
int textord_min_xheight
Definition: makerow.cpp:67
double textord_excess_blobsize
Definition: makerow.cpp:83
int row_spacing_order(const void *item1, const void *item2)
Definition: makerow.cpp:2617
bool textord_new_initial_xheight
Definition: makerow.cpp:100
float spacing
Definition: blobbox.h:656
TO_ROW * key_row
Definition: blobbox.h:798
float baseline_offset
Definition: blobbox.h:787
float max_blob_size
Definition: blobbox.h:786

◆ compute_xheight_from_modes()

int compute_xheight_from_modes ( STATS *  heights,
STATS *  floating_heights,
bool  cap_only,
int  min_height,
int  max_height,
float *  xheight,
float *  ascrise 
)

Definition at line 1467 of file makerow.cpp.

// compute_xheight_from_modes: choose an x-height (and ascender rise) from
// the histogram of blob heights.
//   heights          - histogram of blob heights for the row.
//   floating_heights - histogram of the "floating" blobs; their counts are
//                      excluded when rating candidate x-height modes.
//   cap_only         - when true, at most one mode is considered.
//   xheight, ascrise - outputs.
// Returns 0 if the histogram mode has no entries; otherwise the pile count
// associated with the chosen x-height.
// NOTE(review): *xheight is read at 1524 before any guaranteed write in this
// function; presumably callers zero it beforehand -- confirm.
1469 {
1470 int blob_index = heights->mode(); // find mode
1471 int blob_count = heights->pile_count(blob_index); // get count of mode
// NOTE(review): listing line 1472 is absent from this extract; the `}` at 1478
// implies an opening guard here, presumably on textord_debug_xheights.
1473 tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
1474 min_height, max_height, blob_index, blob_count,
1475 heights->get_total());
1476 heights->print();
1477 floating_heights->print();
1478 }
1479 if (blob_count == 0) return 0;
1480 int modes[MAX_HEIGHT_MODES]; // biggest piles
1481 bool in_best_pile = false;
1482 int prev_size = -INT32_MAX;
1483 int best_count = 0;
1484 int mode_count = compute_height_modes(heights, min_height, max_height,
1485 modes, MAX_HEIGHT_MODES);
1486 if (cap_only && mode_count > 1)
1487 mode_count = 1;
1488 int x;
// NOTE(review): listing line 1489 is absent from this extract; the `}` at 1493
// implies an opening guard here, presumably on textord_debug_xheights.
1490 tprintf("found %d modes: ", mode_count);
1491 for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
1492 tprintf("\n");
1493 }
1494
// Try each mode x as the x-height and each later mode asc as the ascender
// height; a pair qualifies when asc/x lies strictly between the ascx ratio
// limits and both piles are large enough relative to the histogram mode.
1495 for (x = 0; x < mode_count - 1; x++) {
1496 if (modes[x] != prev_size + 1)
1497 in_best_pile = false; // had empty height
1498 int modes_x_count = heights->pile_count(modes[x]) -
1499 floating_heights->pile_count(modes[x]);
1500 if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
1501 (in_best_pile || modes_x_count > best_count)) {
1502 for (int asc = x + 1; asc < mode_count; asc++) {
1503 float ratio =
1504 static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
1505 if (textord_ascx_ratio_min < ratio &&
1506 ratio < textord_ascx_ratio_max &&
1507 (heights->pile_count(modes[asc]) >=
1508 blob_count * textord_ascheight_mode_fraction)) {
1509 if (modes_x_count > best_count) {
1510 in_best_pile = true;
1511 best_count = modes_x_count;
1512 }
// NOTE(review): listing line 1513 is absent from this extract; the `}` at 1516
// implies an opening guard here, presumably on textord_debug_xheights.
1514 tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
1515 modes[x], modes[asc]-modes[x], modes_x_count, ratio);
1516 }
1517 prev_size = modes[x];
1518 *xheight = static_cast<float>(modes[x]);
1519 *ascrise = static_cast<float>(modes[asc] - modes[x]);
1520 }
1521 }
1522 }
1523 }
1524 if (*xheight == 0) { // single mode
1525 // Remove counts of the "floating" blobs (the ones whose height is too
1526 // small in relation to its top end of the bounding box) from heights
1527 // before computing the single-mode xheight.
1528 // Restore the counts in heights after the mode is found, since
1529 // floating blobs might be useful for determining potential ascenders
1530 // in compute_row_descdrop().
1531 if (floating_heights->get_total() > 0) {
1532 for (x = min_height; x < max_height; ++x) {
1533 heights->add(x, -(floating_heights->pile_count(x)));
1534 }
1535 blob_index = heights->mode(); // find the modified mode
1536 for (x = min_height; x < max_height; ++x) {
1537 heights->add(x, floating_heights->pile_count(x));
1538 }
1539 }
1540 *xheight = static_cast<float>(blob_index);
1541 *ascrise = 0.0f;
1542 best_count = heights->pile_count(blob_index);
// NOTE(review): listing line 1543 is absent from this extract; presumably an
// `if (textord_debug_xheights)` guarding the tprintf below -- confirm.
1544 tprintf("Single mode xheight set to %g\n", *xheight);
1545 } else if (textord_debug_xheights) {
1546 tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1547 }
1548 return best_count;
1549}
double textord_xheight_mode_fraction
Definition: makerow.cpp:89
int32_t compute_height_modes(STATS *heights, int32_t min_height, int32_t max_height, int32_t *modes, int32_t maxmodes)
Definition: makerow.cpp:1623
#define MAX_HEIGHT_MODES
Definition: makerow.cpp:103
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
int32_t get_total() const
Definition: statistc.h:84
void print() const
Definition: statistc.cpp:526
int32_t mode() const
Definition: statistc.cpp:107

◆ correct_row_xheight()

void correct_row_xheight ( TO_ROW *  row,
float  xheight,
float  ascrise,
float  descdrop 
)

Definition at line 1685 of file makerow.cpp.

// correct_row_xheight: reconcile a row's xheight/ascrise/descdrop with the
// supplied average values (xheight, ascrise, descdrop), choosing the
// correction by the category returned from get_row_category(row).
1686 {
1687 ROW_CATEGORY row_category = get_row_category(row);
// NOTE(review): listing line 1688 is absent from this extract; the `}` at 1692
// implies an opening guard here, presumably on textord_debug_xheights.
1689 tprintf("correcting row xheight: row->xheight %.4f"
1690 ", row->acrise %.4f row->descdrop %.4f\n",
1691 row->xheight, row->ascrise, row->descdrop);
1692 }
1693 bool normal_xheight =
// NOTE(review): listing line 1694 (the initialiser of normal_xheight) is
// absent from this extract; presumably a within_error_margin() test of
// row->xheight against xheight -- confirm against makerow.cpp.
1695 bool cap_xheight =
1696 within_error_margin(row->xheight, xheight + ascrise,
// NOTE(review): listing line 1697 (the final argument of the call above) is
// absent from this extract; presumably textord_xheight_error_margin.
1698 // Use the average xheight/ascrise for the following cases:
1699 // -- the xheight of the row could not be determined at all
1700 // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
1701 // and its xheight is close to either cap height or average xheight
1702 // -- the row does not have ascenders or descenders, but its xheight
1703 // is close to the average block xheight (e.g. row with "www.mmm.com")
1704 if (row_category == ROW_ASCENDERS_FOUND) {
// Row metrics are kept; a non-negative descdrop is replaced by the average
// descdrop scaled to this row's xheight.
1705 if (row->descdrop >= 0.0) {
1706 row->descdrop = row->xheight * (descdrop / xheight);
1707 }
1708 } else if (row_category == ROW_INVALID ||
1709 (row_category == ROW_DESCENDERS_FOUND &&
1710 (normal_xheight || cap_xheight)) ||
1711 (row_category == ROW_UNKNOWN && normal_xheight)) {
1712 if (textord_debug_xheights) tprintf("using average xheight\n");
1713 row->xheight = xheight;
1714 row->ascrise = ascrise;
1715 row->descdrop = descdrop;
1716 } else if (row_category == ROW_DESCENDERS_FOUND) {
1717 // Assume this is a row with mostly lowercase letters and its xheight
1718 // is computed correctly (unfortunately there is no way to distinguish
1719 // this from the case when descenders are found, but the most common
1720 // height is capheight).
1721 if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
1722 row->ascrise = row->xheight * (ascrise / xheight);
1723 } else if (row_category == ROW_UNKNOWN) {
1724 // Otherwise assume this row is an all-caps or small-caps row
1725 // and adjust xheight and ascrise of the row.
1726
1727 row->all_caps = true;
1728 if (cap_xheight) { // regular all caps
1729 if (textord_debug_xheights) tprintf("all caps\n");
1730 row->xheight = xheight;
1731 row->ascrise = ascrise;
1732 row->descdrop = descdrop;
1733 } else { // small caps or caps with an odd xheight
// NOTE(review): listing line 1734 is absent from this extract; the `}` at 1740
// implies an opening guard here, presumably on textord_debug_xheights.
1735 if (row->xheight < xheight + ascrise && row->xheight > xheight) {
1736 tprintf("small caps\n");
1737 } else {
1738 tprintf("all caps with irregular xheight\n");
1739 }
1740 }
// Split the measured height into x-height and ascender parts using the
// average proportions, then rescale the descender drop to match.
1741 row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
1742 row->xheight -= row->ascrise;
1743 row->descdrop = row->xheight * (descdrop / xheight);
1744 }
1745 }
// NOTE(review): listing line 1746 is absent from this extract; the `}` at 1749
// implies an opening guard here, presumably on textord_debug_xheights.
1747 tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1748 " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
1749 }
1750}
double textord_xheight_error_margin
Definition: makerow.cpp:98
bool within_error_margin(float test, float num, float margin)
Definition: makerow.h:128
ROW_CATEGORY get_row_category(const TO_ROW *row)
Definition: makerow.h:122
ROW_CATEGORY
Definition: makerow.h:35
@ ROW_DESCENDERS_FOUND
Definition: makerow.h:37
@ ROW_UNKNOWN
Definition: makerow.h:38
@ ROW_ASCENDERS_FOUND
Definition: makerow.h:36
@ ROW_INVALID
Definition: makerow.h:39
bool all_caps
Definition: blobbox.h:646
float descdrop
Definition: blobbox.h:660
float ascrise
Definition: blobbox.h:659

◆ delete_non_dropout_rows()

void delete_non_dropout_rows ( TO_BLOCK *  block,
float  gradient,
FCOORD  rotation,
int32_t  block_edge,
bool  testing_on 
)

delete_non_dropout_rows

Compute the linespacing and offset.

Definition at line 570 of file makerow.cpp.

// delete_non_dropout_rows: builds a per-scan-line occupation profile of the
// deskewed block and removes every row for which find_best_dropout_row()
// returns true. The blob list of each removed row is appended to
// block->blobs before the row is deleted; afterwards the blob lists of the
// surviving rows are appended to block->blobs as well.
// rotation and block_edge are used only for the optional debug plot.
576 {
577 TBOX block_box; //deskewed block
578 int32_t max_y; //in block
579 int32_t min_y;
580 int32_t line_index; //of scan line
581 int32_t line_count; //no of scan lines
582 int32_t distance; //to drop-out
583 int32_t xleft; //of block
584 int32_t ybottom; //of block
585 TO_ROW *row; //current row
586 TO_ROW_IT row_it = block->get_rows ();
587 BLOBNBOX_IT blob_it = &block->blobs;
588
589 if (row_it.length () == 0)
590 return; //empty block
591 block_box = deskew_block_coords (block, gradient);
592 xleft = block->block->pdblk.bounding_box ().left ();
593 ybottom = block->block->pdblk.bounding_box ().bottom ();
594 min_y = block_box.bottom () - 1;
595 max_y = block_box.top () + 1;
// Widen [min_y, max_y] so that every row intercept indexes inside the
// occupation and deltas arrays allocated below.
596 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
597 line_index = static_cast<int32_t>(floor (row_it.data ()->intercept ()));
598 if (line_index <= min_y)
599 min_y = line_index - 1;
600 if (line_index >= max_y)
601 max_y = line_index + 1;
602 }
603 line_count = max_y - min_y + 1;
604 if (line_count <= 0)
605 return; //empty block
606 // change in occupation
607 std::vector<int32_t> deltas(line_count);
608 // of pixel coords
609 std::vector<int32_t> occupation(line_count);
610
611 compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]);
// deltas is passed below as the `thresholds` argument and is later read back
// as the per-line dropout distance (presumably rewritten in place by
// compute_dropout_distances -- confirm).
612 compute_occupation_threshold (static_cast<int32_t>(ceil (block->line_spacing *
// NOTE(review): listing lines 613-614 (the rest of the first window-size
// argument) are absent from this extract.
615 static_cast<int32_t>(ceil (block->line_spacing *
// NOTE(review): listing lines 616-617 (the rest of the second window-size
// argument) are absent from this extract.
618 max_y - min_y + 1, &occupation[0], &deltas[0]);
619#ifndef GRAPHICS_DISABLED
620 if (testing_on) {
621 draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
622 }
623#endif
624 compute_dropout_distances(&occupation[0], &deltas[0], line_count);
625 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
626 row = row_it.data ();
627 line_index = static_cast<int32_t>(floor (row->intercept ()));
628 distance = deltas[line_index - min_y];
629 if (find_best_dropout_row (row, distance, block->line_spacing / 2,
630 line_index, &row_it, testing_on)) {
631#ifndef GRAPHICS_DISABLED
632 if (testing_on)
633 plot_parallel_row(row, gradient, block_edge,
634 ScrollView::WHITE, rotation);
635#endif
636 blob_it.add_list_after (row_it.data ()->blob_list ());
637 delete row_it.extract (); //too far away
638 }
639 }
640 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
641 blob_it.add_list_after (row_it.data ()->blob_list ());
642 }
643}
void draw_occupation(int32_t xleft, int32_t ybottom, int32_t min_y, int32_t max_y, int32_t occupation[], int32_t thresholds[])
Definition: drawtord.cpp:163
void plot_parallel_row(TO_ROW *row, float gradient, int32_t left, ScrollView::Color colour, FCOORD rotation)
Definition: drawtord.cpp:122
void compute_dropout_distances(int32_t *occupation, int32_t *thresholds, int32_t line_count)
Definition: makerow.cpp:902
bool find_best_dropout_row(TO_ROW *row, int32_t distance, float dist_limit, int32_t line_index, TO_ROW_IT *row_it, bool testing_on)
Definition: makerow.cpp:652
void compute_occupation_threshold(int32_t low_window, int32_t high_window, int32_t line_count, int32_t *occupation, int32_t *thresholds)
Definition: makerow.cpp:821
void compute_line_occupation(TO_BLOCK *block, float gradient, int32_t min_y, int32_t max_y, int32_t *occupation, int32_t *deltas)
Definition: makerow.cpp:768
TBOX deskew_block_coords(TO_BLOCK *block, float gradient)
Definition: makerow.cpp:732

◆ deskew_block_coords()

TBOX deskew_block_coords ( TO_BLOCK *  block,
float  gradient 
)

Definition at line 732 of file makerow.cpp.

// deskew_block_coords: returns the union of the bounding boxes of all blobs
// in the block's rows after rotating each box by the unit vector
// (1, -gradient) / sqrt(gradient^2 + 1). Only a local copy of each box is
// rotated; the blobs themselves are left untouched.
735 {
736 TBOX result; //block bounds
737 TBOX blob_box; //of block
738 FCOORD rotation; //deskew vector
739 float length; //of gradient vector
740 TO_ROW_IT row_it = block->get_rows ();
741 TO_ROW *row; //current row
742 BLOBNBOX *blob; //current blob
743 BLOBNBOX_IT blob_it; //iterator
744
// Normalise (1, -gradient) to unit length so it can be used as a rotation.
745 length = sqrt (gradient * gradient + 1);
746 rotation = FCOORD (1 / length, -gradient / length);
747 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
748 row = row_it.data ();
749 blob_it.set_to_list (row->blob_list ());
750 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
751 blob_it.forward ()) {
752 blob = blob_it.data ();
753 blob_box = blob->bounding_box ();
754 blob_box.rotate (rotation);//de-skew it
755 result += blob_box;
756 }
757 }
758 return result;
759}

◆ expand_rows()

void expand_rows ( ICOORD  page_tr,
TO_BLOCK *  block,
float  gradient,
FCOORD  rotation,
int32_t  block_edge,
bool  testing_on 
)

Definition at line 949 of file makerow.cpp.

// expand_rows: widens the [min_y, max_y] limits of each row in the block.
// After re-assigning blobs and refitting parallel rows, the rows are visited
// from last to first. For each row an allowed band [y_bottom, y_top] around
// its intercept is computed from block->line_size and
// textord_expansion_factor. A neighbouring row lying wholly inside that band
// is swallowed (its blobs are moved into the current row and it is deleted);
// a neighbour that only partly intrudes truncates or cancels the expansion.
// page_tr, rotation and block_edge are used only for debug display.
956 {
957 bool swallowed_row; //eaten a neighbour
958 float y_max, y_min; //new row limits
959 float y_bottom, y_top; //allowed limits
960 TO_ROW *test_row; //next row
961 TO_ROW *row; //current row
962 //iterators
963 BLOBNBOX_IT blob_it = &block->blobs;
964 TO_ROW_IT row_it = block->get_rows ();
965
966#ifndef GRAPHICS_DISABLED
967 if (textord_show_expanded_rows && testing_on) {
968 if (to_win == nullptr)
969 create_to_win(page_tr);
970 }
971#endif
972
973 adjust_row_limits(block); //shift min,max.
// NOTE(review): listing line 974 is absent from this extract; the `}` at 978
// implies an opening `if (...) {` here. Presumably a test on
// textord_new_initial_xheight -- confirm against makerow.cpp.
975 if (block->get_rows ()->length () == 0)
976 return;
977 compute_row_stats(block, textord_show_expanded_rows && testing_on);
978 }
979 assign_blobs_to_rows (block, &gradient, 4, true, false, false);
980 //get real membership
981 if (block->get_rows ()->length () == 0)
982 return;
983 fit_parallel_rows(block,
984 gradient,
985 rotation,
986 block_edge,
987 textord_show_expanded_rows && testing_on);
// NOTE(review): listing line 988 is absent from this extract; presumably a
// condition guarding the compute_row_stats call below -- confirm.
989 compute_row_stats(block, textord_show_expanded_rows && testing_on);
990 row_it.move_to_last ();
991 do {
992 row = row_it.data ();
993 y_max = row->max_y (); //get current limits
994 y_min = row->min_y ();
995 y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
// NOTE(review): listing line 996 (the last factor of the expression above) is
// absent from this extract.
997 y_top = row->intercept () + block->line_size * textord_expansion_factor *
// NOTE(review): listing lines 998-999 (the rest of the expression above) are
// absent from this extract.
1000 if (y_min > y_bottom) { //expansion allowed
1001 if (textord_show_expanded_rows && testing_on)
1002 tprintf("Expanding bottom of row at %f from %f to %f\n",
1003 row->intercept(), y_min, y_bottom);
1004 //expandable
1005 swallowed_row = true;
1006 while (swallowed_row && !row_it.at_last ()) {
1007 swallowed_row = false;
1008 //get next one
1009 test_row = row_it.data_relative (1);
1010 //overlaps space
1011 if (test_row->max_y () > y_bottom) {
1012 if (test_row->min_y () > y_bottom) {
// Neighbour lies wholly above y_bottom: merge its blobs into this row,
// delete it, and look at the next neighbour.
1013 if (textord_show_expanded_rows && testing_on)
1014 tprintf("Eating row below at %f\n", test_row->intercept());
1015 row_it.forward ();
1016#ifndef GRAPHICS_DISABLED
1017 if (textord_show_expanded_rows && testing_on)
1018 plot_parallel_row(test_row,
1019 gradient,
1020 block_edge,
// NOTE(review): listing line 1021 (the colour argument) is absent from this
// extract.
1022 rotation);
1023#endif
1024 blob_it.set_to_list (row->blob_list ());
1025 blob_it.add_list_after (test_row->blob_list ());
1026 //swallow complete row
1027 delete row_it.extract ();
1028 row_it.backward ();
1029 swallowed_row = true;
1030 }
1031 else if (test_row->max_y () < y_min) {
1032 //shorter limit
1033 y_bottom = test_row->max_y ();
1034 if (textord_show_expanded_rows && testing_on)
1035 tprintf("Truncating limit to %f due to touching row at %f\n",
1036 y_bottom, test_row->intercept());
1037 }
1038 else {
1039 y_bottom = y_min; //can't expand it
1040 if (textord_show_expanded_rows && testing_on)
1041 tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1042 y_bottom, test_row->intercept());
1043 }
1044 }
1045 }
1046 y_min = y_bottom; //expand it
1047 }
1048 if (y_max < y_top) { //expansion allowed
1049 if (textord_show_expanded_rows && testing_on)
1050 tprintf("Expanding top of row at %f from %f to %f\n",
1051 row->intercept(), y_max, y_top);
1052 swallowed_row = true;
1053 while (swallowed_row && !row_it.at_first ()) {
1054 swallowed_row = false;
1055 //get one above
1056 test_row = row_it.data_relative (-1);
1057 if (test_row->min_y () < y_top) {
1058 if (test_row->max_y () < y_top) {
// Neighbour lies wholly below y_top: merge its blobs into this row,
// delete it, and look at the next neighbour.
1059 if (textord_show_expanded_rows && testing_on)
1060 tprintf("Eating row above at %f\n", test_row->intercept());
1061 row_it.backward ();
1062 blob_it.set_to_list (row->blob_list ());
1063#ifndef GRAPHICS_DISABLED
1064 if (textord_show_expanded_rows && testing_on)
1065 plot_parallel_row(test_row,
1066 gradient,
1067 block_edge,
// NOTE(review): listing line 1068 (the colour argument) is absent from this
// extract.
1069 rotation);
1070#endif
1071 blob_it.add_list_after (test_row->blob_list ());
1072 //swallow complete row
1073 delete row_it.extract ();
1074 row_it.forward ();
1075 swallowed_row = true;
1076 }
1077 else if (test_row->min_y () < y_max) {
1078 //shorter limit
1079 y_top = test_row->min_y ();
1080 if (textord_show_expanded_rows && testing_on)
1081 tprintf("Truncating limit to %f due to touching row at %f\n",
1082 y_top, test_row->intercept());
1083 }
1084 else {
1085 y_top = y_max; //can't expand it
1086 if (textord_show_expanded_rows && testing_on)
1087 tprintf("Not expanding limit beyond %f due to touching row at %f\n",
1088 y_top, test_row->intercept());
1089 }
1090 }
1091 }
1092 y_max = y_top;
1093 }
1094 //new limits
1095 row->set_limits (y_min, y_max);
1096 row_it.backward ();
1097 }
1098 while (!row_it.at_last ());
1099}
void compute_row_stats(TO_BLOCK *block, bool testing_on)
Definition: makerow.cpp:1143
void adjust_row_limits(TO_BLOCK *block)
Definition: makerow.cpp:1107

◆ fill_heights()

void fill_heights ( TO_ROW *  row,
float  gradient,
int  min_height,
int  max_height,
STATS *  heights,
STATS *  floating_heights 
)

Definition at line 1406 of file makerow.cpp.

// fill_heights: accumulate blob-top heights of a row into two histograms.
// For every blob not joined to its predecessor, the top of its box is taken
// relative to the row baseline and, if within [min_height, max_height],
// added to `heights`. It is also added to `floating_heights` when the box
// height divided by that top value is below textord_min_blob_height_fraction.
// When the row has marked repeated-character sets, only the first blob of
// each set is considered.
1407 {
1408 float xcentre; // centre of blob
1409 float top; // top y coord of blob
1410 float height; // height of blob
1411 BLOBNBOX *blob; // current blob
1412 int repeated_set;
1413 BLOBNBOX_IT blob_it = row->blob_list();
1414 if (blob_it.empty()) return; // no blobs in this row
1415 bool has_rep_chars =
1416 row->rep_chars_marked() && row->num_repeated_sets() > 0;
1417 do {
1418 blob = blob_it.data();
1419 if (!blob->joined_to_prev()) {
1420 xcentre = (blob->bounding_box().left() +
1421 blob->bounding_box().right()) / 2.0f;
1422 top = blob->bounding_box().top();
1423 height = blob->bounding_box().height();
// NOTE(review): listing line 1424 is absent from this extract; the `else` at
// 1426 implies an `if (...)` here selecting between the spline baseline and
// the parallel line. Presumably a test on textord_fix_xheight_bug -- confirm.
1425 top -= row->baseline.y(xcentre);
1426 else
1427 top -= gradient * xcentre + row->parallel_c();
1428 if (top >= min_height && top <= max_height) {
1429 heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1430 if (height / top < textord_min_blob_height_fraction) {
1431 floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1);
1432 }
1433 }
1434 }
1435 // Skip repeated chars, since they are likely to skew the height stats.
1436 if (has_rep_chars && blob->repeated_set() != 0) {
1437 repeated_set = blob->repeated_set();
1438 blob_it.forward();
1439 while (!blob_it.at_first() &&
1440 blob_it.data()->repeated_set() == repeated_set) {
1441 blob_it.forward();
// NOTE(review): listing line 1442 is absent from this extract; presumably an
// `if (textord_debug_xheights)` guarding the tprintf below -- confirm.
1443 tprintf("Skipping repeated char when computing xheight\n");
1444 }
1445 } else {
1446 blob_it.forward();
1447 }
1448 } while (!blob_it.at_first());
1449}
double textord_min_blob_height_fraction
Definition: makerow.cpp:87
bool textord_fix_xheight_bug
Definition: makerow.cpp:53
int repeated_set() const
Definition: blobbox.h:262
QSPLINE baseline
Definition: blobbox.h:670
bool rep_chars_marked() const
Definition: blobbox.h:631
int num_repeated_sets() const
Definition: blobbox.h:637
double y(double x) const
Definition: quspline.cpp:209
int16_t height() const
Definition: rect.h:108

◆ find_best_dropout_row()

bool find_best_dropout_row ( TO_ROW *  row,
int32_t  distance,
float  dist_limit,
int32_t  line_index,
TO_ROW_IT *  row_it,
bool  testing_on 
)

Definition at line 652 of file makerow.cpp.

// find_best_dropout_row: decide whether `row` should be deleted.
// Returns true when
//   - |distance| exceeds dist_limit, or
//   - a neighbouring row's integer intercept lies strictly between
//     line_index and line_index + 2 * distance, or
//   - a neighbour sits at line_index or at line_index + 2 * distance and
//     its believability() is >= that of `row`.
// Otherwise returns false. Neighbours are examined through
// row_it->data_relative(), stepping +1 when distance < 0 and -1 otherwise.
// testing_on enables the tprintf diagnostics.
659 {
660 int32_t next_index; // of neighbouring row
661 int32_t row_offset; //from current row
662 int32_t abs_dist; //absolute distance
663 int8_t row_inc; //increment to row_index
664 TO_ROW *next_row; //neighbouring row
665
666 if (testing_on)
667 tprintf ("Row at %g(%g), dropout dist=%d,",
668 row->intercept (), row->parallel_c (), distance);
669 if (distance < 0) {
670 row_inc = 1;
671 abs_dist = -distance;
672 }
673 else {
674 row_inc = -1;
675 abs_dist = distance;
676 }
677 if (abs_dist > dist_limit) {
678 if (testing_on) {
679 tprintf (" too far - deleting\n");
680 }
681 return true;
682 }
// Only look for neighbours if one exists in the search direction.
683 if ((distance < 0 && !row_it->at_last ())
684 || (distance >= 0 && !row_it->at_first ())) {
685 row_offset = row_inc;
686 do {
687 next_row = row_it->data_relative (row_offset);
688 next_index = static_cast<int32_t>(floor (next_row->intercept ()));
689 if ((distance < 0
690 && next_index < line_index
691 && next_index > line_index + distance + distance)
692 || (distance >= 0
693 && next_index > line_index
694 && next_index < line_index + distance + distance)) {
695 if (testing_on) {
696 tprintf (" nearer neighbour (%d) at %g\n",
697 line_index + distance - next_index,
698 next_row->intercept ());
699 }
700 return true; //other is nearer
701 }
702 else if (next_index == line_index
703 || next_index == line_index + distance + distance) {
704 if (row->believability () <= next_row->believability ()) {
705 if (testing_on) {
706 tprintf (" equal but more believable at %g (%g/%g)\n",
707 next_row->intercept (),
708 row->believability (),
709 next_row->believability ());
710 }
711 return true; //other is more believable
712 }
713 }
714 row_offset += row_inc;
715 }
// NOTE(review): when row_inc is -1, row_offset is negative and the
// `row_offset < row_it->length ()` bound below is always true, so in that
// direction the loop ends only through the next_index test -- confirm that
// this is intended.
716 while ((next_index == line_index
717 || next_index == line_index + distance + distance)
718 && row_offset < row_it->length ());
719 if (testing_on)
720 tprintf (" keeping\n");
721 }
722 return false;
723}
float believability() const
Definition: blobbox.h:586

◆ fit_lms_line()

void fit_lms_line ( TO_ROW *  row)

Definition at line 266 of file makerow.cpp.

// fit_lms_line: adds the bottom-centre point of every blob box in the row to
// the line fitter `lms`, fits a line, and stores the resulting gradient,
// intercept and error with row->set_line().
266 {
267 float m, c; // fitted line
// NOTE(review): listing line 268 is absent from this extract; presumably the
// declaration of `lms` (a DetLineFit, per the Add/Fit cross-references)
// -- confirm against makerow.cpp.
269 BLOBNBOX_IT blob_it = row->blob_list();
270
271 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
272 const TBOX& box = blob_it.data()->bounding_box();
273 lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
274 }
275 double error = lms.Fit(&m, &c);
276 row->set_line(m, c, error);
277}
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:604
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75

◆ fit_parallel_lms()

void fit_parallel_lms ( float  gradient,
TO_ROW *  row 
)

Definition at line 1973 of file makerow.cpp.

// fit_parallel_lms: fits a line of the given gradient to the bottom-centre
// points of the row's blobs (skipping blobs joined to their predecessor) and
// records it with set_parallel_line(). The row's ordinary line is then set
// with set_line(), using either the same constrained result or, under the
// condition at the missing listing line 1989, an unconstrained refit.
1973 {
1974 float c; // fitted line
1975 int blobcount; // no of blobs
// NOTE(review): listing line 1976 is absent from this extract; presumably the
// declaration of `lms` (a DetLineFit) -- confirm against makerow.cpp.
1977 BLOBNBOX_IT blob_it = row->blob_list();
1978
1979 blobcount = 0;
1980 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1981 if (!blob_it.data()->joined_to_prev()) {
1982 const TBOX& box = blob_it.data()->bounding_box();
1983 lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
1984 blobcount++;
1985 }
1986 }
1987 double error = lms.ConstrainedFit(gradient, &c);
1988 row->set_parallel_line(gradient, c, error);
// NOTE(review): listing line 1989 is absent from this extract; the `}` at 1991
// implies an opening `if (...) {` here. Presumably it tests
// textord_straight_baselines and blobcount against textord_lms_line_trials
// -- confirm against makerow.cpp.
1990 error = lms.Fit(&gradient, &c);
1991 }
1992 //set the other too
1993 row->set_line(gradient, c, error);
1994}
int textord_lms_line_trials
Definition: makerow.cpp:99
bool textord_straight_baselines
Definition: makerow.cpp:50
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:612
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
Definition: detlinefit.cpp:130

◆ fit_parallel_rows()

void fit_parallel_rows ( TO_BLOCK *  block,
float  gradient,
FCOORD  rotation,
int32_t  block_edge,
bool  testing_on 
)

Definition at line 1931 of file makerow.cpp.

// fit_parallel_rows: deletes rows whose blob list is empty and calls
// fit_parallel_lms() with the given gradient on every other row. With
// graphics enabled and testing_on set, each row is plotted, cycling colours
// from ScrollView::RED up to ScrollView::MAGENTA. Finally the rows are
// re-sorted with row_y_order.
1937 {
1938#ifndef GRAPHICS_DISABLED
1939 ScrollView::Color colour; //of row
1940#endif
1941 TO_ROW_IT row_it = block->get_rows ();
1942
1943 row_it.move_to_first ();
1944 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1945 if (row_it.data ()->blob_list ()->empty ())
1946 delete row_it.extract (); //nothing in it
1947 else
1948 fit_parallel_lms (gradient, row_it.data ());
1949 }
1950#ifndef GRAPHICS_DISABLED
1951 if (testing_on) {
1952 colour = ScrollView::RED;
1953 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1954 plot_parallel_row (row_it.data (), gradient,
1955 block_edge, colour, rotation);
1956 colour = static_cast<ScrollView::Color>(colour + 1);
1957 if (colour > ScrollView::MAGENTA)
1958 colour = ScrollView::RED;
1959 }
1960 }
1961#endif
1962 row_it.sort (row_y_order); //may have gone out of order
1963}
void fit_parallel_lms(float gradient, TO_ROW *row)
Definition: makerow.cpp:1973
int row_y_order(const void *item1, const void *item2)
Definition: makerow.cpp:2595

◆ linear_spline_baseline()

double * linear_spline_baseline ( TO_ROW *  row,
TO_BLOCK *  block,
int32_t &  segments,
int32_t  xstarts[] 
)

Definition at line 2187 of file makerow.cpp.

// linear_spline_baseline: fits a piecewise-linear baseline to a row.
// The blobs are counted with box_next_pre_chopped() and divided into
// `segments` (= blobcount / textord_spline_medianwin, at least 1). Two
// iterators, the second advanced by half a segment, take turns fitting a
// line to the bottom-centre points of successive runs of blobs.
//   segments - output: number of segments produced.
//   xstarts  - output: xstarts[0] is the left edge of the first box;
//              xstarts[1..segments] are filled in as segments are fitted.
// Returns an array of segments * 3 doubles allocated with new[]; for each
// segment the triple is {0, gradient, intercept}. The caller owns the array.
2192 {
2193 int blobcount; //no of blobs
2194 int blobindex; //current blob
2195 int index1, index2; //blob numbers
2196 int blobs_per_segment; //blobs in each
2197 TBOX box; //blob box
2198 TBOX new_box; //new_it box
2199 //blobs
2200 BLOBNBOX_IT blob_it = row->blob_list ();
2201 BLOBNBOX_IT new_it = blob_it; //front end
2202 float b, c; //fitted curve
// NOTE(review): listing line 2203 is absent from this extract; presumably the
// declaration of `lms` (a DetLineFit) -- confirm against makerow.cpp.
2204 int32_t segment; //current segment
2205
// Count the blobs by stepping until the iterator is back at the first element.
2206 box = box_next_pre_chopped (&blob_it);
2207 xstarts[0] = box.left ();
2208 blobcount = 1;
2209 while (!blob_it.at_first ()) {
2210 blobcount++;
2211 box = box_next_pre_chopped (&blob_it);
2212 }
2213 segments = blobcount / textord_spline_medianwin;
2214 if (segments < 1)
2215 segments = 1;
2216 blobs_per_segment = blobcount / segments;
2217 // quadratic coeffs
2218 auto *coeffs = new double[segments * 3];
// NOTE(review): listing line 2219 is absent from this extract; presumably an
// `if (textord_oldbl_debug)` guarding the tprintf below -- confirm.
2220 tprintf
2221 ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
2222 blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
2223 segment = 1;
// Advance the second iterator by half a segment.
2224 for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
2225 box_next_pre_chopped(&new_it);
2226 index1 = 0;
2227 blobindex = index2;
2228 do {
2229 blobindex += blobs_per_segment;
2230 lms.Clear();
// The final segment also absorbs whatever blobs remain.
2231 while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2232 box = box_next_pre_chopped (&blob_it);
2233 int middle = (box.left() + box.right()) / 2;
2234 lms.Add(ICOORD(middle, box.bottom()));
2235 index1++;
2236 if (index1 == blobindex - blobs_per_segment / 2
2237 || index1 == blobcount - 1) {
2238 xstarts[segment] = box.left ();
2239 }
2240 }
2241 lms.Fit(&b, &c);
// Quadratic term is zero: each segment is a straight line.
2242 coeffs[segment * 3 - 3] = 0;
2243 coeffs[segment * 3 - 2] = b;
2244 coeffs[segment * 3 - 1] = c;
2245 segment++;
2246 if (segment > segments)
2247 break;
2248
2249 blobindex += blobs_per_segment;
2250 lms.Clear();
2251 while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2252 new_box = box_next_pre_chopped (&new_it);
2253 int middle = (new_box.left() + new_box.right()) / 2;
2254 lms.Add(ICOORD (middle, new_box.bottom()));
2255 index2++;
2256 if (index2 == blobindex - blobs_per_segment / 2
2257 || index2 == blobcount - 1) {
2258 xstarts[segment] = new_box.left ();
2259 }
2260 }
2261 lms.Fit(&b, &c);
2262 coeffs[segment * 3 - 3] = 0;
2263 coeffs[segment * 3 - 2] = b;
2264 coeffs[segment * 3 - 1] = c;
2265 segment++;
2266 }
2267 while (segment <= segments);
2268 return coeffs;
2269}
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:665
int textord_spline_medianwin
Definition: makerow.cpp:64
bool textord_oldbl_debug
Definition: oldbasel.cpp:39

◆ make_baseline_spline()

void make_baseline_spline ( TO_ROW * row,
TO_BLOCK * block
)

Definition at line 2056 of file makerow.cpp.

2057 {
2058 double *coeffs; // quadratic coeffs
2059 int32_t segments; // no of segments
2060
2061 // spline boundaries
2062 auto *xstarts = new int32_t[row->blob_list()->length() + 1];
2063 if (segment_baseline(row, block, segments, xstarts)
2064 && !textord_straight_baselines && !textord_parallel_baselines) {
2065 coeffs = linear_spline_baseline(row, block, segments, xstarts);
2066 } else {
2067 xstarts[1] = xstarts[segments];
2068 segments = 1;
2069 coeffs = new double[3];
2070 coeffs[0] = 0;
2071 coeffs[1] = row->line_m ();
2072 coeffs[2] = row->line_c ();
2073 }
2074 row->baseline = QSPLINE (segments, xstarts, coeffs);
2075 delete[] coeffs;
2076 delete[] xstarts;
2077}
bool segment_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t *xstarts)
Definition: makerow.cpp:2088
double * linear_spline_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t xstarts[])
Definition: makerow.cpp:2187
bool textord_parallel_baselines
Definition: makerow.cpp:49
float line_c() const
Definition: blobbox.h:574

◆ make_initial_textrows()

void make_initial_textrows ( ICOORD  page_tr,
TO_BLOCK * block,
FCOORD  rotation,
bool  testing_on 
)

Definition at line 226 of file makerow.cpp.

231 {
232 TO_ROW_IT row_it = block->get_rows ();
233
234#ifndef GRAPHICS_DISABLED
235 ScrollView::Color colour; //of row
236
237 if (textord_show_initial_rows && testing_on) {
238 if (to_win == nullptr)
239 create_to_win(page_tr);
240 }
241#endif
242 //guess skew
243 assign_blobs_to_rows (block, nullptr, 0, true, true, textord_show_initial_rows && testing_on);
244 row_it.move_to_first ();
245 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
246 fit_lms_line (row_it.data ());
247#ifndef GRAPHICS_DISABLED
248 if (textord_show_initial_rows && testing_on) {
249 colour = ScrollView::RED;
250 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
251 plot_to_row (row_it.data (), colour, rotation);
252 colour = static_cast<ScrollView::Color>(colour + 1);
253 if (colour > ScrollView::MAGENTA)
254 colour = ScrollView::RED;
255 }
256 }
257#endif
258}
void plot_to_row(TO_ROW *row, ScrollView::Color colour, FCOORD rotation)
Definition: drawtord.cpp:88
bool textord_show_initial_rows
Definition: makerow.cpp:43
void fit_lms_line(TO_ROW *row)
Definition: makerow.cpp:266

◆ make_rows()

float make_rows ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks 
)

Definition at line 200 of file makerow.cpp.

200 {
201 float port_m; // global skew
202 float port_err; // global noise
203 TO_BLOCK_IT block_it; // iterator
204
205 block_it.set_to_list(port_blocks);
206 for (block_it.mark_cycle_pt(); !block_it.cycled_list();
207 block_it.forward())
208 make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f),
209 !textord_test_landscape);
210 // compute globally
211 compute_page_skew(port_blocks, port_m, port_err);
212 block_it.set_to_list(port_blocks);
213 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
214 cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
215 block_it.data()->block->pdblk.bounding_box().left(),
216 !textord_test_landscape);
217 }
218 return port_m; // global skew
219}
void cleanup_rows_making(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
Definition: makerow.cpp:517
void make_initial_textrows(ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
Definition: makerow.cpp:226
bool textord_test_landscape
Definition: makerow.cpp:48
void compute_page_skew(TO_BLOCK_LIST *blocks, float &page_m, float &page_err)
Definition: makerow.cpp:286

◆ make_single_row()

float make_single_row ( ICOORD  page_tr,
bool  allow_sub_blobs,
TO_BLOCK * block,
TO_BLOCK_LIST *  blocks 
)

Definition at line 163 of file makerow.cpp.

164 {
165 BLOBNBOX_IT blob_it = &block->blobs;
166 TO_ROW_IT row_it = block->get_rows();
167
168 // Include all the small blobs and large blobs.
169 blob_it.add_list_after(&block->small_blobs);
170 blob_it.add_list_after(&block->noise_blobs);
171 blob_it.add_list_after(&block->large_blobs);
172 if (block->blobs.singleton() && allow_sub_blobs) {
173 blob_it.move_to_first();
174 float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
175 if (size > block->line_size)
176 block->line_size = size;
177 } else if (block->blobs.empty()) {
178 // Make a fake blob.
179 C_BLOB* blob = C_BLOB::FakeBlob(block->block->pdblk.bounding_box());
180 // The blobnbox owns the blob.
181 auto* bblob = new BLOBNBOX(blob);
182 blob_it.add_after_then_move(bblob);
183 }
184 MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
185 // Fit an LMS line to the rows.
186 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
187 fit_lms_line(row_it.data());
188 float gradient;
189 float fit_error;
190 // Compute the skew based on the fitted line.
191 compute_page_skew(blocks, gradient, fit_error);
192 return gradient;
193}
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241

◆ mark_repeated_chars()

void mark_repeated_chars ( TO_ROW * row)

Definition at line 2639 of file makerow.cpp.

2639 {
2640 BLOBNBOX_IT box_it(row->blob_list()); // Iterator.
2641 int num_repeated_sets = 0;
2642 if (!box_it.empty()) {
2643 do {
2644 BLOBNBOX* bblob = box_it.data();
2645 int repeat_length = 1;
2646 if (bblob->flow() == BTFT_LEADER &&
2647 !bblob->joined_to_prev() && bblob->cblob() != nullptr) {
2648 BLOBNBOX_IT test_it(box_it);
2649 for (test_it.forward(); !test_it.at_first();) {
2650 bblob = test_it.data();
2651 if (bblob->flow() != BTFT_LEADER)
2652 break;
2653 test_it.forward();
2654 bblob = test_it.data();
2655 if (bblob->joined_to_prev() || bblob->cblob() == nullptr) {
2656 repeat_length = 0;
2657 break;
2658 }
2659 ++repeat_length;
2660 }
2661 }
2662 if (repeat_length >= kMinLeaderCount) {
2663 num_repeated_sets++;
2664 for (; repeat_length > 0; box_it.forward(), --repeat_length) {
2665 bblob = box_it.data();
2666 bblob->set_repeated_set(num_repeated_sets);
2667 }
2668 } else {
2669 bblob->set_repeated_set(0);
2670 box_it.forward();
2671 }
2672 } while (!box_it.at_first()); // until all done
2673 }
2674 row->set_num_repeated_sets(num_repeated_sets);
2675}
@ BTFT_LEADER
Definition: blobbox.h:121
const int kMinLeaderCount
Definition: makerow.cpp:105
void set_repeated_set(int set_id)
Definition: blobbox.h:265
C_BLOB * cblob() const
Definition: blobbox.h:268
BlobTextFlowType flow() const
Definition: blobbox.h:295
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:640

◆ most_overlapping_row()

OVERLAP_STATE most_overlapping_row ( TO_ROW_IT *  row_it,
TO_ROW *&  best_row,
float  top,
float  bottom,
float  rowsize,
bool  testing_blob 
)

Definition at line 2478 of file makerow.cpp.

2485 {
2486 OVERLAP_STATE result; //result of tests
2487 float overlap; //of blob & row
2488 float bestover; //nearest row
2489 float merge_top, merge_bottom; //size of merged row
2490 ICOORD testpt; //testing only
2491 TO_ROW *row; //current row
2492 TO_ROW *test_row; //for multiple overlaps
2493 BLOBNBOX_IT blob_it; //for merging rows
2494
2495 result = ASSIGN;
2496 row = row_it->data ();
2497 bestover = top - bottom;
2498 if (top > row->max_y ())
2499 bestover -= top - row->max_y ();
2500 if (bottom < row->min_y ())
2501 //compute overlap
2502 bestover -= row->min_y () - bottom;
2503 if (testing_blob && textord_debug_blob) {
2504 tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n",
2505 bottom, top, row->min_y(), row->max_y(), rowsize, bestover);
2506 }
2507 test_row = row;
2508 do {
2509 if (!row_it->at_last ()) {
2510 row_it->forward ();
2511 test_row = row_it->data ();
2512 if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
2513 merge_top =
2514 test_row->max_y () >
2515 row->max_y ()? test_row->max_y () : row->max_y ();
2516 merge_bottom =
2517 test_row->min_y () <
2518 row->min_y ()? test_row->min_y () : row->min_y ();
2519 if (merge_top - merge_bottom <= rowsize) {
2520 if (testing_blob && textord_debug_blob) {
2521 tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
2522 row->min_y (), row->max_y (),
2523 test_row->min_y (), test_row->max_y ());
2524 }
2525 test_row->set_limits (merge_bottom, merge_top);
2526 blob_it.set_to_list (test_row->blob_list ());
2527 blob_it.add_list_after (row->blob_list ());
2528 blob_it.sort (blob_x_order);
2529 row_it->backward ();
2530 delete row_it->extract ();
2531 row_it->forward ();
2532 bestover = -1.0f; //force replacement
2533 }
2534 overlap = top - bottom;
2535 if (top > test_row->max_y ())
2536 overlap -= top - test_row->max_y ();
2537 if (bottom < test_row->min_y ())
2538 overlap -= test_row->min_y () - bottom;
2539 if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2540 result = REJECT;
2541 }
2542 if (overlap > bestover) {
2543 bestover = overlap; //find biggest overlap
2544 row = test_row;
2545 }
2546 if (testing_blob && textord_debug_blob) {
2547 tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n",
2548 bottom, top, test_row->min_y(), test_row->max_y(),
2549 rowsize, overlap, bestover);
2550 }
2551 }
2552 }
2553 }
2554 while (!row_it->at_last ()
2555 && test_row->min_y () <= top && test_row->max_y () >= bottom);
2556 while (row_it->data () != row)
2557 row_it->backward (); //make it point to row
2558 //doesn't overlap much
2559 if (top - bottom - bestover > rowsize * textord_overlap_x &&
2560 (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
2561 && result == ASSIGN)
2562 result = NEW_ROW; //doesn't overlap enough
2563 best_row = row;
2564 return result;
2565}
bool textord_fix_makerow_bug
Definition: makerow.cpp:54

◆ pre_associate_blobs()

void pre_associate_blobs ( ICOORD  page_tr,
TO_BLOCK * block,
FCOORD  rotation,
bool  testing_on 
)

Definition at line 1845 of file makerow.cpp.

1850 {
1851#ifndef GRAPHICS_DISABLED
1852 ScrollView::Color colour; //of boxes
1853#endif
1854 BLOBNBOX *blob; //current blob
1855 BLOBNBOX *nextblob; //next in list
1856 TBOX blob_box;
1857 FCOORD blob_rotation; //inverse of rotation
1858 BLOBNBOX_IT blob_it; //iterator
1859 BLOBNBOX_IT start_it; //iterator
1860 TO_ROW_IT row_it = block->get_rows ();
1861
1862#ifndef GRAPHICS_DISABLED
1863 colour = ScrollView::RED;
1864#endif
1865
1866 blob_rotation = FCOORD (rotation.x (), -rotation.y ());
1867 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
1868 //get blobs
1869 blob_it.set_to_list (row_it.data ()->blob_list ());
1870 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1871 blob_it.forward ()) {
1872 blob = blob_it.data ();
1873 blob_box = blob->bounding_box ();
1874 start_it = blob_it; //save start point
1875 // if (testing_on && textord_show_final_blobs)
1876 // {
1877 // tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
1878 // blob_box.left(),blob_box.bottom(),
1879 // blob_box.right(),blob_box.top(),
1880 // (void*)blob,blob_it.length());
1881 // }
1882 bool overlap;
1883 do {
1884 overlap = false;
1885 if (!blob_it.at_last ()) {
1886 nextblob = blob_it.data_relative(1);
1887 overlap = blob_box.major_x_overlap(nextblob->bounding_box());
1888 if (overlap) {
1889 blob->merge(nextblob); // merge new blob
1890 blob_box = blob->bounding_box(); // get bigger box
1891 blob_it.forward();
1892 }
1893 }
1894 }
1895 while (overlap);
1896 blob->chop (&start_it, &blob_it,
1897 blob_rotation,
1898 block->line_size * tesseract::CCStruct::kXHeightFraction *
1899 textord_chop_width);
1900 //attempt chop
1901 }
1902#ifndef GRAPHICS_DISABLED
1903 if (testing_on && textord_show_final_blobs) {
1904 if (to_win == nullptr)
1905 create_to_win(page_tr);
1906 to_win->Pen(colour);
1907 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
1908 blob_it.forward ()) {
1909 blob = blob_it.data ();
1910 blob_box = blob->bounding_box ();
1911 blob_box.rotate (rotation);
1912 if (!blob->joined_to_prev ()) {
1913 to_win->Rectangle (blob_box.left (), blob_box.bottom (),
1914 blob_box.right (), blob_box.top ());
1915 }
1916 }
1917 colour = static_cast<ScrollView::Color>(colour + 1);
1918 if (colour > ScrollView::MAGENTA)
1919 colour = ScrollView::RED;
1920 }
1921#endif
1922 }
1923}
double textord_chop_width
Definition: makerow.cpp:76
bool textord_show_final_blobs
Definition: makerow.cpp:47
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:120
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:92
float y() const
Definition: points.h:210
float x() const
Definition: points.h:207
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600

◆ row_spacing_order()

int row_spacing_order ( const void *  item1,
const void *  item2 
)

Definition at line 2617 of file makerow.cpp.

2619 {
2620 //converted ptr
2621 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2622 //converted ptr
2623 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2624
2625 if (row1->spacing < row2->spacing)
2626 return -1;
2627 else if (row1->spacing > row2->spacing)
2628 return 1;
2629 else
2630 return 0;
2631}

◆ row_y_order()

int row_y_order ( const void *  item1,
const void *  item2 
)

Definition at line 2595 of file makerow.cpp.

2597 {
2598 //converted ptr
2599 const TO_ROW *row1 = *reinterpret_cast<const TO_ROW* const*>(item1);
2600 //converted ptr
2601 const TO_ROW *row2 = *reinterpret_cast<const TO_ROW* const*>(item2);
2602
2603 if (row1->parallel_c () > row2->parallel_c ())
2604 return -1;
2605 else if (row1->parallel_c () < row2->parallel_c ())
2606 return 1;
2607 else
2608 return 0;
2609}

◆ segment_baseline()

bool segment_baseline ( TO_ROW * row,
TO_BLOCK * block,
int32_t &  segments,
int32_t *  xstarts 
)

Definition at line 2088 of file makerow.cpp.

2093 {
2094 bool needs_curve; //needs curved line
2095 int blobcount; //no of blobs
2096 int blobindex; //current blob
2097 int last_state; //above, on , below
2098 int state; //of current blob
2099 float yshift; //from baseline
2100 TBOX box; //blob box
2101 TBOX new_box; //new_it box
2102 float middle; //xcentre of blob
2103 //blobs
2104 BLOBNBOX_IT blob_it = row->blob_list ();
2105 BLOBNBOX_IT new_it = blob_it; //front end
2106 SORTED_FLOATS yshifts; //shifts from baseline
2107
2108 needs_curve = false;
2109 box = box_next_pre_chopped (&blob_it);
2110 xstarts[0] = box.left ();
2111 segments = 1;
2112 blobcount = row->blob_list ()->length ();
2113 if (textord_oldbl_debug)
2114 tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
2115 blobcount, box.left (), box.bottom ());
2116 if (blobcount <= textord_spline_medianwin
2117 || blobcount < textord_spline_minblobs) {
2118 blob_it.move_to_last ();
2119 box = blob_it.data ()->bounding_box ();
2120 xstarts[1] = box.right ();
2121 return false;
2122 }
2123 last_state = 0;
2124 new_it.mark_cycle_pt ();
2125 for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
2126 new_box = box_next_pre_chopped (&new_it);
2127 middle = (new_box.left () + new_box.right ()) / 2.0;
2128 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2129 //record shift
2130 yshifts.add (yshift, blobindex);
2131 if (new_it.cycled_list ()) {
2132 xstarts[1] = new_box.right ();
2133 return false;
2134 }
2135 }
2136 for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
2137 box = box_next_pre_chopped (&blob_it);
2138 do {
2139 new_box = box_next_pre_chopped (&new_it);
2140 //get middle one
2141 yshift = yshifts[textord_spline_medianwin / 2];
2142 if (yshift > textord_spline_shift_fraction * block->line_size)
2143 state = 1;
2144 else if (-yshift > textord_spline_shift_fraction * block->line_size)
2145 state = -1;
2146 else
2147 state = 0;
2148 if (state != 0)
2149 needs_curve = true;
2150 // tprintf("State=%d, prev=%d, shift=%g\n",
2151 // state,last_state,yshift);
2152 if (state != last_state && blobcount > textord_spline_minblobs) {
2153 xstarts[segments++] = box.left ();
2154 blobcount = 0;
2155 }
2156 last_state = state;
2157 yshifts.remove (blobindex - textord_spline_medianwin);
2158 box = box_next_pre_chopped (&blob_it);
2159 middle = (new_box.left () + new_box.right ()) / 2.0;
2160 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
2161 yshifts.add (yshift, blobindex);
2162 blobindex++;
2163 blobcount++;
2164 }
2165 while (!new_it.cycled_list ());
2166 if (blobcount > textord_spline_minblobs || segments == 1) {
2167 xstarts[segments] = new_box.right ();
2168 }
2169 else {
2170 xstarts[--segments] = new_box.right ();
2171 }
2172 if (textord_oldbl_debug)
2173 tprintf ("Made %d segments on row at (%d,%d)\n",
2174 segments, box.right (), box.bottom ());
2175 return needs_curve;
2176}
int textord_spline_minblobs
Definition: makerow.cpp:63
double textord_spline_shift_fraction
Definition: makerow.cpp:69
void remove(int32_t key)
Definition: sortflts.cpp:52
void add(float value, int32_t key)
Definition: sortflts.cpp:27

◆ separate_underlines()

void separate_underlines ( TO_BLOCK * block,
float  gradient,
FCOORD  rotation,
bool  testing_on 
)

Definition at line 1772 of file makerow.cpp.

1775 { // correct orientation
1776 BLOBNBOX *blob; // current blob
1777 C_BLOB *rotated_blob; // rotated blob
1778 TO_ROW *row; // current row
1779 float length; // of g_vec
1780 TBOX blob_box;
1781 FCOORD blob_rotation; // inverse of rotation
1782 FCOORD g_vec; // skew rotation
1783 BLOBNBOX_IT blob_it; // iterator
1784 // iterator
1785 BLOBNBOX_IT under_it = &block->underlines;
1786 BLOBNBOX_IT large_it = &block->large_blobs;
1787 TO_ROW_IT row_it = block->get_rows();
1788 int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
1789 block->line_size + 0.5);
1790
1791 // length of vector
1792 length = sqrt(1 + gradient * gradient);
1793 g_vec = FCOORD(1 / length, -gradient / length);
1794 blob_rotation = FCOORD(rotation.x(), -rotation.y());
1795 blob_rotation.rotate(g_vec); // undoing everything
1796 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1797 row = row_it.data();
1798 // get blobs
1799 blob_it.set_to_list(row->blob_list());
1800 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
1801 blob_it.forward()) {
1802 blob = blob_it.data();
1803 blob_box = blob->bounding_box();
1804 if (blob_box.width() > block->line_size * textord_underline_width) {
1805 ASSERT_HOST(blob->cblob() != nullptr);
1806 rotated_blob = crotate_cblob (blob->cblob(),
1807 blob_rotation);
1808 if (test_underline(
1809 testing_on && textord_show_final_rows,
1810 rotated_blob, static_cast<int16_t>(row->intercept()),
1811 static_cast<int16_t>(
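1812 block->line_size *
1813 (tesseract::CCStruct::kXHeightFraction +
1814 tesseract::CCStruct::kAscenderFraction / 2.0f)))) {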
1815 under_it.add_after_then_move(blob_it.extract());
1816 if (testing_on && textord_show_final_rows) {
1817 tprintf("Underlined blob at:");
1818 rotated_blob->bounding_box().print();
1819 tprintf("Was:");
1820 blob_box.print();
1821 }
1822 } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
1823 row->blob_list()) >
1824 textord_max_blob_overlaps) {
1825 large_it.add_after_then_move(blob_it.extract());
1826 if (testing_on && textord_show_final_rows) {
1827 tprintf("Large blob overlaps %d blobs at:",
1828 CountOverlaps(blob_box, min_blob_height,
1829 row->blob_list()));
1830 blob_box.print();
1831 }
1832 }
1833 delete rotated_blob;
1834 }
1835 }
1836 }
1837}
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:611
bool test_underline(bool testing_on, C_BLOB *blob, int16_t baseline, int16_t xheight)
Definition: blkocc.cpp:48
bool textord_show_final_rows
Definition: makerow.cpp:46
double textord_underline_width
Definition: makerow.cpp:85
BLOBNBOX_LIST underlines
Definition: blobbox.h:773
void rotate(const FCOORD vec)
Definition: points.h:763
void print() const
Definition: rect.h:278
int16_t width() const
Definition: rect.h:115
TBOX bounding_box() const
Definition: stepblob.cpp:253

◆ vigorous_noise_removal()

void vigorous_noise_removal ( TO_BLOCK * block)

Definition at line 466 of file makerow.cpp.

466 {
467 TO_ROW_IT row_it = block->get_rows ();
468 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
469 TO_ROW* row = row_it.data();
470 BLOBNBOX_IT b_it = row->blob_list();
471 // Estimate the xheight on the row.
472 int max_height = 0;
473 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
474 BLOBNBOX* blob = b_it.data();
475 if (blob->bounding_box().height() > max_height)
476 max_height = blob->bounding_box().height();
477 }
478 STATS hstats(0, max_height + 1);
479 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
480 BLOBNBOX* blob = b_it.data();
481 int height = blob->bounding_box().height();
482 if (height >= kMinSize)
483 hstats.add(blob->bounding_box().height(), 1);
484 }
485 float xheight = hstats.median();
486 // Delete small objects.
487 BLOBNBOX* prev = nullptr;
488 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
489 BLOBNBOX* blob = b_it.data();
490 const TBOX& box = blob->bounding_box();
491 if (box.height() < kNoiseSize * xheight) {
492 // Small so delete unless it looks like an i dot.
493 if (prev != nullptr) {
494 if (dot_of_i(blob, prev, row))
495 continue; // Looks OK.
496 }
497 if (!b_it.at_last()) {
498 BLOBNBOX* next = b_it.data_relative(1);
499 if (dot_of_i(blob, next, row))
500 continue; // Looks OK.
501 }
502 // It might be noise so get rid of it.
503 delete blob->cblob();
504 delete b_it.extract();
505 } else {
506 prev = blob;
507 }
508 }
509 }
510}
const int kMinSize
Definition: makerow.cpp:377
const double kNoiseSize
Definition: makerow.cpp:376

Variable Documentation

◆ kMinLeaderCount

const int kMinLeaderCount = 5

Definition at line 105 of file makerow.cpp.

◆ kMinSize

const int kMinSize = 8

Definition at line 377 of file makerow.cpp.

◆ kNoiseSize

const double kNoiseSize = 0.5

Definition at line 376 of file makerow.cpp.

◆ textord_ascheight_mode_fraction

double textord_ascheight_mode_fraction = 0.08

"Min pile height to make ascheight"

Definition at line 91 of file makerow.cpp.

◆ textord_ascx_ratio_max

double textord_ascx_ratio_max = 1.8

"Max cap/xheight"

Definition at line 95 of file makerow.cpp.

◆ textord_ascx_ratio_min

double textord_ascx_ratio_min = 1.25

"Min cap/xheight"

Definition at line 94 of file makerow.cpp.

◆ textord_chop_width

double textord_chop_width = 1.5

"Max width before chopping"

Definition at line 76 of file makerow.cpp.

◆ textord_debug_blob

bool textord_debug_blob = false

"Print test blob information"

Definition at line 101 of file makerow.cpp.

◆ textord_debug_xheights

bool textord_debug_xheights = false

"Test xheight algorithms"

Definition at line 55 of file makerow.cpp.

◆ textord_descx_ratio_max

double textord_descx_ratio_max = 0.6

"Max desc/xheight"

Definition at line 97 of file makerow.cpp.

◆ textord_descx_ratio_min

double textord_descx_ratio_min = 0.25

"Min desc/xheight"

Definition at line 96 of file makerow.cpp.

◆ textord_excess_blobsize

double textord_excess_blobsize = 1.3

"New row made if blob makes row this big"

Definition at line 83 of file makerow.cpp.

◆ textord_fix_makerow_bug

bool textord_fix_makerow_bug = true

"Prevent multiple baselines"

Definition at line 54 of file makerow.cpp.

◆ textord_fix_xheight_bug

bool textord_fix_xheight_bug = true

"Use spline baseline"

Definition at line 53 of file makerow.cpp.

◆ textord_heavy_nr

bool textord_heavy_nr = false

"Vigorously remove noise"

Definition at line 42 of file makerow.cpp.

◆ textord_linespace_iqrlimit

double textord_linespace_iqrlimit = 0.2

"Max iqr/median for linespace"

Definition at line 74 of file makerow.cpp.

◆ textord_lms_line_trials

int textord_lms_line_trials = 12

"Number of linew fits to do"

Definition at line 99 of file makerow.cpp.

◆ textord_min_blob_height_fraction

double textord_min_blob_height_fraction = 0.75

"Min blob height/top to include blob top into xheight stats"

Definition at line 87 of file makerow.cpp.

◆ textord_min_blobs_in_row

int textord_min_blobs_in_row = 4

"Min blobs before gradient counted"

Definition at line 62 of file makerow.cpp.

◆ textord_min_linesize

double textord_min_linesize = 1.25

"* blob height for initial linesize"

Definition at line 81 of file makerow.cpp.

◆ textord_min_xheight

int textord_min_xheight = 10

"Min credible pixel xheight"

Definition at line 67 of file makerow.cpp.

◆ textord_minxh

double textord_minxh = 0.25

"fraction of linesize for min xheight"

Definition at line 80 of file makerow.cpp.

◆ textord_new_initial_xheight

bool textord_new_initial_xheight = true

"Use test xheight mechanism"

Definition at line 100 of file makerow.cpp.

◆ textord_occupancy_threshold

double textord_occupancy_threshold = 0.4

"Fraction of neighbourhood"

Definition at line 84 of file makerow.cpp.

◆ textord_old_baselines

bool textord_old_baselines = true

"Use old baseline algorithm"

Definition at line 51 of file makerow.cpp.

◆ textord_old_xheight

bool textord_old_xheight = false

"Use old xheight algorithm"

Definition at line 52 of file makerow.cpp.

◆ textord_parallel_baselines

bool textord_parallel_baselines = true

"Force parallel baselines"

Definition at line 49 of file makerow.cpp.

◆ textord_show_expanded_rows

bool textord_show_expanded_rows = false

"Display rows after expanding"

Definition at line 45 of file makerow.cpp.

◆ textord_show_final_blobs

bool textord_show_final_blobs = false

"Display blob bounds after pre-ass"

Definition at line 47 of file makerow.cpp.

◆ textord_show_final_rows

bool textord_show_final_rows = false

"Display rows after final fitting"

Definition at line 46 of file makerow.cpp.

◆ textord_show_initial_rows

bool textord_show_initial_rows = false

"Display row accumulation"

Definition at line 43 of file makerow.cpp.

◆ textord_show_parallel_rows

bool textord_show_parallel_rows = false

"Display page correlated rows"

Definition at line 44 of file makerow.cpp.

◆ textord_skew_ile

double textord_skew_ile = 0.5

"Ile of gradients for page skew"

Definition at line 72 of file makerow.cpp.

◆ textord_skew_lag

double textord_skew_lag = 0.02

"Lag for skew on row accumulation"

Definition at line 73 of file makerow.cpp.

◆ textord_spline_medianwin

int textord_spline_medianwin = 6

"Size of window for spline segmentation"

Definition at line 64 of file makerow.cpp.

◆ textord_spline_minblobs

int textord_spline_minblobs = 8

"Min blobs in each spline segment"

Definition at line 63 of file makerow.cpp.

◆ textord_spline_outlier_fraction

double textord_spline_outlier_fraction = 0.1

"Fraction of line spacing for outlier"

Definition at line 71 of file makerow.cpp.

◆ textord_spline_shift_fraction

double textord_spline_shift_fraction = 0.02

"Fraction of line spacing for quad"

Definition at line 69 of file makerow.cpp.

◆ textord_straight_baselines

bool textord_straight_baselines = false

"Force straight baselines"

Definition at line 50 of file makerow.cpp.

◆ textord_test_landscape

bool textord_test_landscape = false

"Tests refer to land/port"

Definition at line 48 of file makerow.cpp.

◆ textord_test_x

int textord_test_x = -INT32_MAX

"coord of test pt"

Definition at line 60 of file makerow.cpp.

◆ textord_test_y

int textord_test_y = -INT32_MAX

"coord of test pt"

Definition at line 61 of file makerow.cpp.

◆ textord_underline_width

double textord_underline_width = 2.0

"Multiple of line_size for underline"

Definition at line 85 of file makerow.cpp.

◆ textord_width_limit

double textord_width_limit = 8

"Max width of blobs to make rows"

Definition at line 75 of file makerow.cpp.

◆ textord_xheight_error_margin

double textord_xheight_error_margin = 0.1

"Accepted variation"

Definition at line 98 of file makerow.cpp.

◆ textord_xheight_mode_fraction

double textord_xheight_mode_fraction = 0.4

"Min pile height to make xheight"

Definition at line 89 of file makerow.cpp.