tesseract 4.1.1
Loading...
Searching...
No Matches
tospace.cpp
Go to the documentation of this file.
1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software
6// distributed under the License is distributed on an "AS IS" BASIS,
7// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8// See the License for the specific language governing permissions and
9// limitations under the License.
10/**********************************************************************
11 * tospace.cpp
12 *
13 * Compute fuzzy word spacing thresholds for each row.
14 * I.e. set : max_nonspace
15 * space_threshold
16 * min_space
17 * kern_size
18 * space_size
19 * for each row.
20 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21 *
22 * Note: functions in this file were originally not members of any
23 * class or enclosed by any namespace. Now they are all static members
24 * of the Textord class.
25 *
26 **********************************************************************/
27
28#include "drawtord.h"
29#include "statistc.h"
30#include "textord.h"
31#include "tovars.h"
32
33// Include automatically generated configuration file if running autoconf.
34#ifdef HAVE_CONFIG_H
35#include "config_auto.h"
36#endif
37
38#include <algorithm>
39#include <memory>
40
41#define MAXSPACING 128 /*max expected spacing in pix */
42
43namespace tesseract {
45 ICOORD page_tr, //topright of page
46 TO_BLOCK_LIST *blocks //blocks on page
47 ) {
48 TO_BLOCK_IT block_it; //iterator
49 TO_BLOCK *block; //current block;
50 TO_ROW *row; //current row
51 int block_index; //block number
52 int row_index; //row number
53 //estimated width of real spaces for whole block
54 int16_t block_space_gap_width;
55 //estimated width of non space gaps for whole block
56 int16_t block_non_space_gap_width;
57 bool old_text_ord_proportional;//old fixed/prop result
58
59 block_it.set_to_list (blocks);
60 block_index = 1;
61 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
62 block_it.forward ()) {
63 block = block_it.data ();
64 std::unique_ptr<GAPMAP> gapmap(new GAPMAP (block)); //map of big vert gaps in blk
65 block_spacing_stats(block,
66 gapmap.get(),
67 old_text_ord_proportional,
68 block_space_gap_width,
69 block_non_space_gap_width);
70 // Make sure relative values of block-level space and non-space gap
71 // widths are reasonable. The ratio of 1:3 is also used in
72 // block_spacing_stats, to correct the block_space_gap_width
73 // Useful for arabic and hindi, when the non-space gap width is
74 // often over-estimated and should not be trusted. A similar ratio
75 // is found in block_spacing_stats.
77 static_cast<float>(block_space_gap_width) / block_non_space_gap_width < 3.0) {
78 block_non_space_gap_width = static_cast<int16_t>(floor (block_space_gap_width / 3.0));
79 }
80 // row iterator
81 TO_ROW_IT row_it(block->get_rows());
82 row_index = 1;
83 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
84 row = row_it.data ();
85 if ((row->pitch_decision == PITCH_DEF_PROP) ||
87 if ((tosp_debug_level > 0) && !old_text_ord_proportional)
88 tprintf ("Block %d Row %d: Now Proportional\n",
89 block_index, row_index);
90 row_spacing_stats(row,
91 gapmap.get(),
92 block_index,
93 row_index,
94 block_space_gap_width,
95 block_non_space_gap_width);
96 }
97 else {
98 if ((tosp_debug_level > 0) && old_text_ord_proportional)
100 ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
101 block_index, row_index, row->pitch_decision,
102 row->fixed_pitch);
103 }
104#ifndef GRAPHICS_DISABLED
106 plot_word_decisions (to_win, static_cast<int16_t>(row->fixed_pitch), row);
107#endif
108 row_index++;
109 }
110 block_index++;
111 }
112}
113
114
115/*************************************************************************
116 * block_spacing_stats()
117 *************************************************************************/
118
// Estimates block-wide gap widths from the horizontal gaps between
// consecutive blob boxes in the rows of `block`.
//
// Outputs (all written through reference parameters):
//   old_text_ord_proportional  - true when there is at most one gap sample;
//                                otherwise the result of comparing twice the
//                                centre-to-centre IQR with the all-gap IQR.
//   block_non_space_gap_width  - narrowest blob width seen when there is at
//                                most one gap sample, else floor(median of
//                                all accepted gaps).
//   block_space_gap_width      - -1 when there are too few samples, else
//                                max(floor(median of the "space" gaps),
//                                    3 * block_non_space_gap_width).
//
// `gapmap` is only forwarded to ignore_big_gap().
//
// NOTE(review): this copy of the listing has lost source lines 149, 155, 157,
// 166, 168, 223, 228, 233, 235, 242, 244, 259, 261, 264 and 265 (the embedded
// line numbers skip them), so several conditions below are incomplete as
// shown. Restore them from the real file before compiling.
119void Textord::block_spacing_stats(
120 TO_BLOCK* block,
121 GAPMAP* gapmap,
122 bool& old_text_ord_proportional,
123 int16_t& block_space_gap_width, // resulting estimate
124 int16_t& block_non_space_gap_width // resulting estimate
125) {
126 TO_ROW *row; // current row
127 BLOBNBOX_IT blob_it; // iterator
128
129 STATS centre_to_centre_stats (0, MAXSPACING);
130 // DEBUG USE ONLY
131 STATS all_gap_stats (0, MAXSPACING);
132 STATS space_gap_stats (0, MAXSPACING);
133 int16_t minwidth = MAXSPACING; // narrowest blob
134 TBOX blob_box;
135 TBOX prev_blob_box;
136 int16_t centre_to_centre;
137 int16_t gap_width;
138 float real_space_threshold;
139 float iqr_centre_to_centre; // DEBUG USE ONLY
140 float iqr_all_gap_stats; // DEBUG USE ONLY
141 int32_t end_of_row;
142 int32_t row_length;
143
 // First pass: histogram every accepted inter-blob gap (all_gap_stats) and
 // the matching centre-to-centre distance, and track the narrowest blob.
144 // row iterator
145 TO_ROW_IT row_it(block->get_rows());
146 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
147 row = row_it.data ();
148 if (!row->blob_list ()->empty () &&
150 (row->pitch_decision == PITCH_DEF_PROP) ||
151 (row->pitch_decision == PITCH_CORR_PROP))) {
152 blob_it.set_to_list (row->blob_list ());
153 blob_it.mark_cycle_pt ();
 // Right edge of the element one step behind the iterator's current
 // position; presumably the last blob of the row - TODO confirm.
154 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
156 blob_box = box_next_pre_chopped (&blob_it);
158 blob_box = reduced_box_next (row, &blob_it);
159 else
160 blob_box = box_next (&blob_it);
161 row_length = end_of_row - blob_box.left ();
162 if (blob_box.width () < minwidth)
163 minwidth = blob_box.width ();
164 prev_blob_box = blob_box;
165 while (!blob_it.cycled_list ()) {
167 blob_box = box_next_pre_chopped (&blob_it);
169 blob_box = reduced_box_next (row, &blob_it);
170 else
171 blob_box = box_next (&blob_it);
172 if (blob_box.width () < minwidth)
173 minwidth = blob_box.width ();
 // The gap runs from the previous box's right edge to this box's left.
174 int16_t left = prev_blob_box.right();
175 int16_t right = blob_box.left();
176 gap_width = right - left;
177 if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
178 all_gap_stats.add (gap_width, 1);
179
 // Half the sum of (left-to-left) and (right-to-right) offsets, i.e.
 // the distance between the two box centres.
180 centre_to_centre = (right + blob_box.right () -
181 (prev_blob_box.left () + left)) / 2;
182 //DEBUG
183 centre_to_centre_stats.add (centre_to_centre, 1);
184 // DEBUG
185 }
186 prev_blob_box = blob_box;
187 }
188 }
189 }
190
191 //Inadequate samples
192 if (all_gap_stats.get_total () <= 1) {
193 block_non_space_gap_width = minwidth;
194 block_space_gap_width = -1; //No est. space width
195 //DEBUG
196 old_text_ord_proportional = true;
197 }
198 else {
199 /* For debug only ..... */
200 iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
201 centre_to_centre_stats.ile (0.25);
202 iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
203 old_text_ord_proportional =
204 iqr_centre_to_centre * 2 > iqr_all_gap_stats;
205 /* .......For debug only */
206
207 /*
208 The median of the gaps is used as an estimate of the NON-SPACE gap width.
209 This RELIES on the assumption that there are more gaps WITHIN words than
210 BETWEEN words in a block
211
212 Now try to estimate the width of a real space for all real spaces in the
213 block. Do this by using a crude threshold to ignore "narrow" gaps, then
214 find the median of the "wide" gaps and use this.
215 */
216 block_non_space_gap_width = static_cast<int16_t>(floor (all_gap_stats.median ()));
217 // median gap
218
 // Second pass: collect only the gaps wider than a crude per-row threshold
 // into space_gap_stats.
219 row_it.set_to_list (block->get_rows ());
220 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
221 row = row_it.data ();
222 if (!row->blob_list ()->empty () &&
224 (row->pitch_decision == PITCH_DEF_PROP) ||
225 (row->pitch_decision == PITCH_CORR_PROP))) {
226 real_space_threshold =
227 std::max (tosp_init_guess_kn_mult * block_non_space_gap_width,
229 blob_it.set_to_list (row->blob_list ());
230 blob_it.mark_cycle_pt ();
231 end_of_row =
232 blob_it.data_relative (-1)->bounding_box ().right ();
234 blob_box = box_next_pre_chopped (&blob_it);
236 blob_box = reduced_box_next (row, &blob_it);
237 else
238 blob_box = box_next (&blob_it);
 // NOTE(review): the operands are reversed relative to the first pass
 // (which computes end_of_row - blob_box.left ()), so the row_length
 // handed to ignore_big_gap() has the opposite sign here. Confirm
 // whether this is intended.
239 row_length = blob_box.left () - end_of_row;
240 prev_blob_box = blob_box;
241 while (!blob_it.cycled_list ()) {
243 blob_box = box_next_pre_chopped (&blob_it);
245 blob_box = reduced_box_next (row, &blob_it);
246 else
247 blob_box = box_next (&blob_it);
248 int16_t left = prev_blob_box.right();
249 int16_t right = blob_box.left();
250 gap_width = right - left;
251 if ((gap_width > real_space_threshold) &&
252 !ignore_big_gap(row, row_length, gapmap, left, right)) {
253 /*
254 If tosp_use_cert_spaces is enabled, the estimate of the space gap is
255 restricted to obvious spaces - those wider than half the xht or those
256 with wide blobs on both sides - i.e not things that are suspect 1's or
257 punctuation that is sometimes widely spaced.
258 */
260 (gap_width >
262 ||
263 ((gap_width >
266 || (!narrow_blob (row, prev_blob_box)
267 && !narrow_blob (row, blob_box))))
268 || (wide_blob (row, prev_blob_box)
269 && wide_blob (row, blob_box)))
270 space_gap_stats.add (gap_width, 1);
271 }
272 prev_blob_box = blob_box;
273 }
274 }
275 }
276 //Inadequate samples
277 if (space_gap_stats.get_total () <= 2)
278 block_space_gap_width = -1;//No est. space width
279 else
 // Never let the space estimate fall below 3x the non-space estimate.
280 block_space_gap_width =
281 std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
282 static_cast<int16_t>(3 * block_non_space_gap_width));
283 }
284}
285
286
287/*************************************************************************
288 * row_spacing_stats()
289 * Set values for min_space, max_non_space based on row stats only
290 * If failure - return 0 values.
291 *************************************************************************/
// Derives the spacing values for one row and stores them on `row`:
// kern_size, space_size, space_threshold, min_space and max_nonspace.
//
//   row                        - row to analyse and update.
//   gapmap                     - forwarded to ignore_big_gap() and
//                                isolated_row_stats().
//   block_idx, row_idx         - used in debug messages and forwarded to
//                                isolated_row_stats().
//   block_space_gap_width      - block estimate of a space; when it is not
//                                > 0 it is replaced locally by
//                                floor(row->xheight / 2).
//   block_non_space_gap_width  - block estimate of a non-space gap.
//
// The function returns nothing; all results are written to `row`.
//
// NOTE(review): this copy of the listing has lost source lines 338, 340, 347,
// 349, 360, 363, 385, 396, 399, 411, 430, 459, 476, 545, 556 and 564 (the
// embedded line numbers skip them), so several statements below are
// incomplete as shown. Restore them from the real file before compiling.
292void Textord::row_spacing_stats(
293 TO_ROW *row,
294 GAPMAP *gapmap,
295 int16_t block_idx,
296 int16_t row_idx,
297 int16_t block_space_gap_width, //estimate for block
298 int16_t block_non_space_gap_width //estimate for block
299 ) {
300 //iterator
301 BLOBNBOX_IT blob_it = row->blob_list ();
302 STATS all_gap_stats (0, MAXSPACING);
303 STATS cert_space_gap_stats (0, MAXSPACING);
304 STATS all_space_gap_stats (0, MAXSPACING);
305 STATS small_gap_stats (0, MAXSPACING);
306 TBOX blob_box;
307 TBOX prev_blob_box;
308 int16_t gap_width;
309 int16_t real_space_threshold = 0;
310 int16_t max = 0;
311 int16_t index;
312 int16_t large_gap_count = 0;
313 bool suspected_table;
314 int32_t max_max_nonspace; //upper bound
315 bool good_block_space_estimate = block_space_gap_width > 0;
316 int32_t end_of_row;
317 int32_t row_length = 0;
318 float sane_space;
319 int32_t sane_threshold;
320
321 /* Collect first pass stats for row */
322
323 if (!good_block_space_estimate)
324 block_space_gap_width = int16_t (floor (row->xheight / 2));
325 if (!row->blob_list ()->empty ()) {
 // Initial threshold between kern and space: either biased by
 // tosp_threshold_bias1 or the plain midpoint of the two block estimates.
326 if (tosp_threshold_bias1 > 0)
327 real_space_threshold =
328 block_non_space_gap_width +
329 int16_t (floor (0.5 +
330 tosp_threshold_bias1 * (block_space_gap_width -
331 block_non_space_gap_width)));
332 else
333 real_space_threshold = //Old TO method
334 (block_space_gap_width + block_non_space_gap_width) / 2;
335 blob_it.set_to_list (row->blob_list ());
336 blob_it.mark_cycle_pt ();
337 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
339 blob_box = box_next_pre_chopped (&blob_it);
341 blob_box = reduced_box_next (row, &blob_it);
342 else
343 blob_box = box_next (&blob_it);
344 row_length = end_of_row - blob_box.left ();
345 prev_blob_box = blob_box;
346 while (!blob_it.cycled_list ()) {
348 blob_box = box_next_pre_chopped (&blob_it);
350 blob_box = reduced_box_next (row, &blob_it);
351 else
352 blob_box = box_next (&blob_it);
353 int16_t left = prev_blob_box.right();
354 int16_t right = blob_box.left();
355 gap_width = right - left;
 // Ignored gaps are only counted; every other gap goes into
 // all_gap_stats plus either the space histograms or small_gap_stats.
356 if (ignore_big_gap(row, row_length, gapmap, left, right)) {
357 large_gap_count++;
358 } else {
359 if (gap_width >= real_space_threshold) {
361 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
362 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
364 || (!narrow_blob (row, prev_blob_box)
365 && !narrow_blob (row, blob_box))))
366 || (wide_blob (row, prev_blob_box)
367 && wide_blob (row, blob_box)))
368 cert_space_gap_stats.add (gap_width, 1);
369 all_space_gap_stats.add (gap_width, 1);
370 }
371 else
372 small_gap_stats.add (gap_width, 1);
373 all_gap_stats.add (gap_width, 1);
374 }
375 prev_blob_box = blob_box;
376 }
377 }
 // More than one ignored gap, or one ignored gap in a row with few samples,
 // marks the row as a suspected table.
378 suspected_table = (large_gap_count > 1) ||
379 ((large_gap_count > 0) &&
380 (all_gap_stats.get_total () <= tosp_few_samples));
381
382 /* Now determine row kern size, space size and threshold */
383
384 if ((cert_space_gap_stats.get_total () >=
386 ((suspected_table ||
387 all_gap_stats.get_total () <= tosp_short_row) &&
388 cert_space_gap_stats.get_total () > 0)) {
389 old_to_method(row,
390 &all_gap_stats,
391 &cert_space_gap_stats,
392 &small_gap_stats,
393 block_space_gap_width,
394 block_non_space_gap_width);
395 } else {
397 !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
398 block_idx, row_idx)) {
400 tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
401 block_idx, row_idx);
402 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
403 //Use block default
404 row->space_size = block_space_gap_width;
405 if (all_gap_stats.get_total () > tosp_redo_kern_limit)
406 row->kern_size = all_gap_stats.median ();
407 else
408 row->kern_size = block_non_space_gap_width;
409 row->space_threshold =
410 int32_t (floor ((row->space_size + row->kern_size) /
412 }
413 else
 // Fall back to the old method, using all space gaps instead of only
 // the certain ones.
414 old_to_method(row,
415 &all_gap_stats,
416 &all_space_gap_stats,
417 &small_gap_stats,
418 block_space_gap_width,
419 block_non_space_gap_width);
420 }
421 }
422
423 if (tosp_improve_thresh && !suspected_table)
424 improve_row_threshold(row, &all_gap_stats);
425
426 /* Now lets try to be careful not to do anything silly with tables when we
427 are ignoring big gaps*/
428 if (tosp_sanity_method == 0) {
429 if (suspected_table &&
431 if (tosp_debug_level > 5)
432 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx,
433 row_idx, row->kern_size, row->space_threshold, row->space_size);
434 row->space_threshold =
435 static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
436 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
437 }
438 }
439 else if (tosp_sanity_method == 1) {
440 sane_space = row->space_size;
441 /* NEVER let space size get too close to kern size */
 // kern_size is floored at 2.5 in these comparisons.
442 if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f))
443 || ((row->space_size - row->kern_size) <
444 (tosp_silly_kn_sp_gap * row->xheight))) {
445 if (good_block_space_estimate &&
446 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
447 sane_space = block_space_gap_width;
448 else
449 sane_space =
450 std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
451 row->xheight / 2.0f);
452 if (tosp_debug_level > 5)
453 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
454 block_idx, row_idx, row->kern_size, row->space_threshold,
455 row->space_size, sane_space);
456 row->space_size = sane_space;
457 row->space_threshold =
458 int32_t (floor ((row->space_size + row->kern_size) /
460 }
461 /* NEVER let threshold get VERY far away from kern */
462 sane_threshold = int32_t (floor (tosp_max_sane_kn_thresh *
463 std::max(row->kern_size, 2.5f)));
464 if (row->space_threshold > sane_threshold) {
465 if (tosp_debug_level > 5)
466 tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
467 block_idx, row_idx, row->kern_size, row->space_threshold,
468 row->space_size, sane_threshold);
469 row->space_threshold = sane_threshold;
 // Keep space_size strictly above the clamped threshold.
470 if (row->space_size <= sane_threshold)
471 row->space_size = row->space_threshold + 1.0f;
472 }
473 /* Beware of tables - there may be NO spaces */
474 if (suspected_table) {
475 sane_space = std::max(tosp_table_kn_sp_ratio * row->kern_size,
477 sane_threshold = int32_t (floor ((sane_space + row->kern_size) / 2));
478
479 if ((row->space_size < sane_space) ||
480 (row->space_threshold < sane_threshold)) {
481 if (tosp_debug_level > 5)
482 tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
483 block_idx, row_idx,
484 row->kern_size,
485 row->space_threshold, row->space_size);
486 //the minimum sane value
487 row->space_threshold = static_cast<int32_t>(sane_space);
488 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
489 }
490 }
491 }
492
493 /* Now lets try to put some error limits on the threshold */
494
495 if (tosp_old_to_method) {
496 /* Old textord made a space if gap >= threshold */
497 //NO FUZZY SPACES YET
498 row->max_nonspace = row->space_threshold;
499 //NO FUZZY SPACES YET
500 row->min_space = row->space_threshold + 1;
501 }
502 else {
503 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
504 row->min_space =
505 std::min(int32_t (ceil (tosp_fuzzy_space_factor * row->xheight)),
506 int32_t (row->space_size));
507 if (row->min_space <= row->space_threshold)
508 // Don't be silly
509 row->min_space = row->space_threshold + 1;
510 /*
511 Lets try to guess the max certain kern gap by looking at the cluster of
512 kerns for the row. The row is proportional so the kerns should cluster
513 tightly at the bottom of the distribution. We also expect most gaps to be
514 kerns. Find the maximum of the kern piles between 0 and twice the kern
515 estimate. Piles before the first one with less than 1/10 the maximum
516 number of samples can be taken as certain kerns.
517
518 Of course, there are some cases where the kern peak and space peaks merge,
519 so we will put an UPPER limit on the max certain kern gap of some fraction
520 below the threshold.
521 */
522
523 max_max_nonspace = int32_t ((row->space_threshold + row->kern_size) / 2);
524
525 //default
526 row->max_nonspace = max_max_nonspace;
 // `max` is the running largest pile seen so far; stop at the first index
 // above kern_size whose pile is below a tenth of it.
527 for (index = 0; index <= max_max_nonspace; index++) {
528 if (all_gap_stats.pile_count (index) > max)
529 max = all_gap_stats.pile_count (index);
530 if ((index > row->kern_size) &&
531 (all_gap_stats.pile_count (index) < 0.1 * max)) {
532 row->max_nonspace = index;
533 break;
534 }
535 }
536 }
537
538 /* Yet another algorithm - simpler this time - just choose a fraction of the
539 threshold to space range */
540
541 if ((tosp_fuzzy_sp_fraction > 0) &&
542 (row->space_size > row->space_threshold))
543 row->min_space = std::max(row->min_space,
544 static_cast<int32_t>(ceil (row->space_threshold +
546 (row->space_size -
547 row->space_threshold))));
548
549 /* Ensure that ANY space less than some multiplier times the kern size is
550 fuzzy. In tables there is a risk of erroneously setting a small space size
551 when there are no real spaces. Sometimes tables have text squashed into
552 columns so that the kn->sp ratio is small anyway - this means that we can't
553 use this to force a wider separation - hence we rely on context to join any
554 dubious breaks. */
555
557 (suspected_table || tosp_fuzzy_limit_all))
558 row->min_space = std::max(row->min_space,
559 static_cast<int32_t>(ceil (tosp_table_fuzzy_kn_sp_ratio *
560 row->kern_size)));
561
562 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
563 row->max_nonspace = static_cast<int32_t>(floor (0.5 + row->kern_size +
565 (row->space_threshold -
566 row->kern_size)));
567 }
568 if (row->max_nonspace > row->space_threshold) {
569 // Don't be silly
570 row->max_nonspace = row->space_threshold;
571 }
572
573 if (tosp_debug_level > 5)
574 tprintf
575 ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
576 block_idx, row_idx, row_length, block_non_space_gap_width,
577 block_space_gap_width, real_space_threshold, row->kern_size,
578 row->max_nonspace, row->space_threshold, row->min_space,
579 row->space_size);
580 if (tosp_debug_level > 10)
581 tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
582 "row->space_threshold = %d\n",
583 row->kern_size, row->space_size, row->space_threshold);
584}
585
// Sets row->space_size, row->kern_size and row->space_threshold from the
// supplied gap histograms, using the block-level estimates as fallbacks and
// limits.
//
//   row                        - row whose spacing fields are written.
//   all_gap_stats              - histogram of all gaps in the row.
//   space_gap_stats            - histogram of the gaps treated as spaces.
//   small_gap_stats            - histogram of the gaps treated as kerns.
//   block_space_gap_width      - block estimate of a space.
//   block_non_space_gap_width  - block estimate of a non-space gap.
//
// NOTE(review): this copy of the listing has lost source lines 601, 615, 630,
// 642, 660, 666 and 668 (the embedded line numbers skip them), so several
// statements below are incomplete as shown. Restore them from the real file
// before compiling.
586void Textord::old_to_method(
587 TO_ROW *row,
588 STATS *all_gap_stats,
589 STATS *space_gap_stats,
590 STATS *small_gap_stats,
591 int16_t block_space_gap_width, //estimate for block
592 int16_t block_non_space_gap_width //estimate for block
593 ) {
594 /* First, estimate row space size */
595 /* Old to condition was > 2 */
596 if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
597 //Adequate samples
598 /* Set space size to median of spaces BUT limits it if it seems wildly out */
599 row->space_size = space_gap_stats->median ();
600 if (row->space_size > block_space_gap_width * 1.5) {
 // NOTE(review): the condition that selects between the two assignments
 // below (listing line 601) is missing from this copy.
602 row->space_size = block_space_gap_width * 1.5;
603 else
604 //BUG??? should be *1.5
605 row->space_size = block_space_gap_width;
606 }
 // Lower bound: at least twice the block non-space gap, plus one.
607 if (row->space_size < (block_non_space_gap_width * 2) + 1)
608 row->space_size = (block_non_space_gap_width * 2) + 1;
609 }
610 //Only 1 or 2 samples
611 else if (space_gap_stats->get_total () >= 1) {
612 //hence mean not median
613 row->space_size = space_gap_stats->mean ();
614 if (row->space_size > block_space_gap_width * 1.5) {
 // NOTE(review): the condition that selects between the two assignments
 // below (listing line 615) is missing from this copy.
616 row->space_size = block_space_gap_width * 1.5;
617 else
618 //BUG??? should be *1.5
619 row->space_size = block_space_gap_width;
620 }
 // With this few samples the lower bound is three times the block
 // non-space gap, plus one.
621 if (row->space_size < (block_non_space_gap_width * 3) + 1)
622 row->space_size = (block_non_space_gap_width * 3) + 1;
623 }
624 else {
625 //Use block default
626 row->space_size = block_space_gap_width;
627 }
628
629 /* Next, estimate row kern size */
631 (small_gap_stats->get_total () > tosp_redo_kern_limit))
632 row->kern_size = small_gap_stats->median ();
633 else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
634 row->kern_size = all_gap_stats->median ();
635 else //old TO -SAME FOR ALL ROWS
636 row->kern_size = block_non_space_gap_width;
637
638 /* Finally, estimate row space threshold */
639 if (tosp_threshold_bias2 > 0) {
640 row->space_threshold =
641 int32_t (floor (0.5 + row->kern_size +
643 row->kern_size)));
644 } else {
645 /*
646 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
647 and holds this in a float. The use is with a >= test
648 NEW textord uses an integer threshold and a > test
649 It comes to the same thing.
650 (Though there is a difference in that old textord has integer space_size
651 and kern_size.)
652 */
653 row->space_threshold =
654 int32_t (floor ((row->space_size + row->kern_size) / 2));
655 }
656
657 // Apply the same logic and ratios as in row_spacing_stats to
658 // restrict relative values of the row's space_size, kern_size, and
659 // space_threshold
661 ((row->space_size <
662 tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
663 ((row->space_size - row->kern_size) <
664 tosp_silly_kn_sp_gap * row->xheight))) {
665 if (row->kern_size > 2.5)
667 row->space_threshold = int32_t (floor ((row->space_size + row->kern_size) /
669 }
670}
671
672
673/*************************************************************************
674 * isolated_row_stats()
675 * Set values for min_space, max_non_space based on row stats only
676 *************************************************************************/
// Tries to set row->space_size, row->kern_size and row->space_threshold from
// the gaps of this row alone.
//
//   row              - row to analyse and update.
//   gapmap           - forwarded to ignore_big_gap().
//   all_gap_stats    - histogram of the row's gaps; its median seeds the
//                      kern estimate.
//   suspected_table  - when true, a non-empty set of certain spaces is
//                      averaged (mean) if the earlier median branch is not
//                      taken.
//   block_idx,
//   row_idx          - used only in debug messages.
//
// Returns false without touching `row` when the histogram is unsuitable (see
// the first check below). Returns false after zeroing the three row fields
// when the computed values fail the sanity check. Returns true otherwise.
//
// NOTE(review): this copy of the listing has lost source lines 700, 716, 718,
// 725, 727, 738, 751, 759 and 765 (the embedded line numbers skip them), so
// several statements below are incomplete as shown. Restore them from the
// real file before compiling.
677bool Textord::isolated_row_stats(TO_ROW* row,
678 GAPMAP* gapmap,
679 STATS* all_gap_stats,
680 bool suspected_table,
681 int16_t block_idx,
682 int16_t row_idx) {
683 float kern_estimate;
684 float crude_threshold_estimate;
685 int16_t small_gaps_count;
686 int16_t total;
687 //iterator
688 BLOBNBOX_IT blob_it = row->blob_list ();
689 STATS cert_space_gap_stats (0, MAXSPACING);
690 STATS all_space_gap_stats (0, MAXSPACING);
691 STATS small_gap_stats (0, MAXSPACING);
692 TBOX blob_box;
693 TBOX prev_blob_box;
694 int16_t gap_width;
695 int32_t end_of_row;
696 int32_t row_length;
697
698 kern_estimate = all_gap_stats->median ();
699 crude_threshold_estimate = std::max(tosp_init_guess_kn_mult * kern_estimate,
701 small_gaps_count = stats_count_under (all_gap_stats,
702 static_cast<int16_t>(ceil (crude_threshold_estimate)));
703 total = all_gap_stats->get_total ();
704
 // Give up when there are too few gaps, too small a fraction of them is
 // below the crude threshold, or none is at or above it.
705 if ((total <= tosp_redo_kern_limit) ||
706 ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
707 (total - small_gaps_count < 1)) {
708 if (tosp_debug_level > 5)
709 tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx,
710 row_idx);
711 return false;
712 }
713 blob_it.set_to_list (row->blob_list ());
714 blob_it.mark_cycle_pt ();
715 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
717 blob_box = box_next_pre_chopped (&blob_it);
719 blob_box = reduced_box_next (row, &blob_it);
720 else
721 blob_box = box_next (&blob_it);
722 row_length = end_of_row - blob_box.left ();
723 prev_blob_box = blob_box;
724 while (!blob_it.cycled_list ()) {
726 blob_box = box_next_pre_chopped (&blob_it);
728 blob_box = reduced_box_next (row, &blob_it);
729 else
730 blob_box = box_next (&blob_it);
731 int16_t left = prev_blob_box.right();
732 int16_t right = blob_box.left();
733 gap_width = right - left;
 // Both comparisons with the threshold are strict, so a gap exactly equal
 // to crude_threshold_estimate is added to none of the histograms.
734 if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
735 (gap_width > crude_threshold_estimate)) {
736 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
737 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
739 (!narrow_blob (row, prev_blob_box) &&
740 !narrow_blob (row, blob_box)))) ||
741 (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
742 cert_space_gap_stats.add (gap_width, 1);
743 all_space_gap_stats.add (gap_width, 1);
744 }
745 if (gap_width < crude_threshold_estimate)
746 small_gap_stats.add (gap_width, 1);
747
748 prev_blob_box = blob_box;
749 }
 // Choose the space size: certain spaces are preferred over all spaces, and
 // the median over the mean where the (partly missing) conditions allow.
750 if (cert_space_gap_stats.get_total () >=
752 //median
753 row->space_size = cert_space_gap_stats.median ();
754 else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
755 //to avoid spaced
756 row->space_size = cert_space_gap_stats.mean ();
757 // 1's in tables
758 else if (all_space_gap_stats.get_total () >=
760 //median
761 row->space_size = all_space_gap_stats.median ();
762 else
763 row->space_size = all_space_gap_stats.mean ();
764
766 row->kern_size = small_gap_stats.median ();
767 else
768 row->kern_size = all_gap_stats->median ();
 // Threshold is the floored midpoint between space and kern sizes.
769 row->space_threshold =
770 int32_t (floor ((row->space_size + row->kern_size) / 2));
771 /* Sanity check */
 // Require kern_size < space_threshold < space_size and a positive
 // threshold; otherwise reset the row fields and report failure.
772 if ((row->kern_size >= row->space_threshold) ||
773 (row->space_threshold >= row->space_size) ||
774 (row->space_threshold <= 0)) {
775 if (tosp_debug_level > 5)
776 tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
777 block_idx, row_idx,
778 row->kern_size, row->space_threshold, row->space_size);
779 row->kern_size = 0.0f;
780 row->space_threshold = 0;
781 row->space_size = 0.0f;
782 return false;
783 }
784
785 if (tosp_debug_level > 5)
786 tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
787 block_idx, row_idx,
788 row->kern_size, row->space_threshold, row->space_size);
789 return true;
790}
791
792int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
793 int16_t index;
794 int16_t total = 0;
795
796 for (index = 0; index < threshold; index++)
797 total += stats->pile_count (index);
798 return total;
799}
800
801
802/*************************************************************************
803 * improve_row_threshold()
804 * Try to recognise a "normal line" -
805 * > 25 gaps
806 * && space > 3 * kn && space > 10
807 * (I.e. reasonably large space and kn:sp ratio)
808 * && > 3/4 # gaps < kn + (sp - kn)/3
809 * (I.e. most gaps are well away from space estimate)
810 * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
811 * somewhere in the histogram between kn and sp
812 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
813 * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
814 * try moving the default threshold to within this band but leave the
815 * fuzzy limit calculation as at present.
816 *************************************************************************/
817void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
818 float sp = row->space_size;
819 float kn = row->kern_size;
820 int16_t reqd_zero_width = 0;
821 int16_t zero_width = 0;
822 int16_t zero_start = 0;
823 int16_t index = 0;
824
825 if (tosp_debug_level > 10)
826 tprintf ("Improve row threshold 0");
827 if ((all_gap_stats->get_total () <= 25) ||
828 (sp <= 10) ||
829 (sp <= 3 * kn) ||
830 (stats_count_under (all_gap_stats,
831 static_cast<int16_t>(ceil (kn + (sp - kn) / 3 + 0.5))) <
832 (0.75 * all_gap_stats->get_total ())))
833 return;
834 if (tosp_debug_level > 10)
835 tprintf (" 1");
836 /*
837 Look for the first region of all 0's in the histogram which is wider than
838 max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
839 threshold is not within it, move the threshold so that is is just inside it.
840 */
841 reqd_zero_width = static_cast<int16_t>(floor ((sp - kn) / 3 + 0.5));
842 if (reqd_zero_width < 3)
843 reqd_zero_width = 3;
844
845 for (index = int16_t (ceil (kn)); index < int16_t (floor (sp)); index++) {
846 if (all_gap_stats->pile_count (index) == 0) {
847 if (zero_width == 0)
848 zero_start = index;
849 zero_width++;
850 }
851 else {
852 if (zero_width >= reqd_zero_width)
853 break;
854 else {
855 zero_width = 0;
856 }
857 }
858 }
859 index--;
860 if (tosp_debug_level > 10)
861 tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
862 reqd_zero_width, zero_width, zero_start, row->space_threshold);
863 if ((zero_width < reqd_zero_width) ||
864 ((row->space_threshold >= zero_start) &&
865 (row->space_threshold <= index)))
866 return;
867 if (tosp_debug_level > 10)
868 tprintf (" 2");
869 if (row->space_threshold < zero_start) {
870 if (tosp_debug_level > 5)
871 tprintf
872 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
873 kn, sp, zero_start, index, row->space_threshold, zero_start);
874 row->space_threshold = zero_start;
875 }
876 if (row->space_threshold > index) {
877 if (tosp_debug_level > 5)
878 tprintf
879 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
880 kn, sp, zero_start, index, row->space_threshold, index);
881 row->space_threshold = index;
882 }
883}
884
885
886/**********************************************************************
887 * make_prop_words
888 *
889 * Convert a TO_ROW to a ROW.
 *
 * Walks the row's blob list once, collecting C_BLOBs into cblobs. Blobs
 * flagged joined_to_prev() have their outlines appended to the previously
 * collected C_BLOB instead of starting a new one. A word is closed, at a
 * "real" blob only, when any of the following holds:
 *   - the blob starts to the right of the next pending repeated-char word,
 *   - make_a_word_break() returns true,
 *   - the blob iterator has wrapped back to the first blob (end of row).
 * Words taken from row->rep_words are spliced into the output at the start
 * of the row, between ordinary words, and after the last blob. The first
 * word gets W_BOL and the last W_EOL. The new words are moved into a ROW
 * allocated with new, which is returned; nullptr is returned when the row
 * has no blobs.
 *
 * NOTE(review): listing line 891 (the function signature) and listing line
 * 1027 (the condition that opens the block closed at 1030) are missing from
 * this listing. The rotation parameter is not referenced in the visible lines.
890 **********************************************************************/
892 TO_ROW *row, // row to make
893 FCOORD rotation // for drawing
894 ) {
895 bool bol; // start of line
896 /* prev_ values are for start of word being built. non prev_ values are for
897 the gap between the word being built and the next one. */
898 bool prev_fuzzy_sp; // probably space
899 bool prev_fuzzy_non; // probably not
900 uint8_t prev_blanks; // in front of word
901 bool fuzzy_sp = false; // probably space
902 bool fuzzy_non = false; // probably not
903 uint8_t blanks = 0; // in front of word
904 bool prev_gap_was_a_space = false;
905 bool break_at_next_gap = false;
906 ROW *real_row; // output row
907 C_OUTLINE_IT cout_it;
908 C_BLOB_LIST cblobs;
909 C_BLOB_IT cblob_it = &cblobs;
910 WERD_LIST words;
911 WERD *word; // new word
912 int32_t next_rep_char_word_right = INT32_MAX;
913 float repetition_spacing; // gap between repetitions
914 int32_t xstarts[2]; // row ends
915 int32_t prev_x; // end of prev blob
916 BLOBNBOX *bblob; // current blob
917 TBOX blob_box; // bounding box
918 BLOBNBOX_IT box_it; // iterator
919 TBOX prev_blob_box;
920 TBOX next_blob_box;
921 int16_t prev_gap = INT16_MAX;
922 int16_t current_gap = INT16_MAX;
923 int16_t next_gap = INT16_MAX;
924 int16_t prev_within_xht_gap = INT16_MAX;
925 int16_t current_within_xht_gap = INT16_MAX;
926 int16_t next_within_xht_gap = INT16_MAX;
927 int16_t word_count = 0;
928
929 // repeated char words
 // next_rep_char_word_right stays INT32_MAX while no repeated-char word is
 // pending, so the "blob is to the right of it" tests below cannot fire.
930 WERD_IT rep_char_it(&(row->rep_words));
931 if (!rep_char_it.empty ()) {
932 next_rep_char_word_right =
933 rep_char_it.data ()->bounding_box ().right ();
934 }
935
936 prev_x = -INT16_MAX;
937 cblob_it.set_to_list (&cblobs);
938 box_it.set_to_list (row->blob_list ());
939 // new words
940 WERD_IT word_it(&words);
941 bol = true;
942 prev_blanks = 0;
943 prev_fuzzy_sp = false;
944 prev_fuzzy_non = false;
945 if (!box_it.empty ()) {
946 xstarts[0] = box_it.data ()->bounding_box ().left ();
947 if (xstarts[0] > next_rep_char_word_right) {
948 /* We need to insert a repeated char word at the start of the row */
949 word = rep_char_it.extract ();
950 word_it.add_after_then_move (word);
951 /* Set spaces before repeated char word */
952 word->set_flag (W_BOL, true);
953 bol = false;
954 word->set_blanks (0);
955 //NO uncertainty
956 word->set_flag (W_FUZZY_SP, false);
957 word->set_flag (W_FUZZY_NON, false);
958 xstarts[0] = word->bounding_box ().left ();
959 /* Set spaces after repeated char word (and leave current word set) */
960 repetition_spacing = find_mean_blob_spacing (word);
961 current_gap = box_it.data ()->bounding_box ().left () -
962 next_rep_char_word_right;
963 current_within_xht_gap = current_gap;
 // The gap counts as a space only if it exceeds tosp_rep_space times the
 // mean spacing inside the repeated-char word; it then yields >= 1 blank.
964 if (current_gap > tosp_rep_space * repetition_spacing) {
965 prev_blanks = static_cast<uint8_t>(floor (current_gap / row->space_size));
966 if (prev_blanks < 1)
967 prev_blanks = 1;
968 }
969 else
970 prev_blanks = 0;
971 if (tosp_debug_level > 5)
972 tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
973 box_it.data ()->bounding_box ().left (),
974 box_it.data ()->bounding_box ().bottom (),
975 repetition_spacing, current_gap);
976 prev_fuzzy_sp = false;
977 prev_fuzzy_non = false;
978 if (rep_char_it.empty ()) {
979 next_rep_char_word_right = INT32_MAX;
980 }
981 else {
982 rep_char_it.forward ();
983 next_rep_char_word_right =
984 rep_char_it.data ()->bounding_box ().right ();
985 }
986 }
987
 // Prime the one-gap lookahead before entering the loop.
988 peek_at_next_gap(row,
989 box_it,
990 next_blob_box,
991 next_gap,
992 next_within_xht_gap);
993 do {
994 bblob = box_it.data ();
995 blob_box = bblob->bounding_box ();
996 if (bblob->joined_to_prev ()) {
997 if (bblob->cblob () != nullptr) {
 // Merge this blob's outlines into the last collected C_BLOB, then
 // delete the now-redundant C_BLOB.
998 cout_it.set_to_list (cblob_it.data ()->out_list ());
999 cout_it.move_to_last ();
1000 cout_it.add_list_after (bblob->cblob ()->out_list ());
1001 delete bblob->cblob ();
1002 }
1003 } else {
1004 if (bblob->cblob() != nullptr)
1005 cblob_it.add_after_then_move (bblob->cblob ());
1006 prev_x = blob_box.right ();
1007 }
1008 box_it.forward (); //next one
1009 bblob = box_it.data ();
1010 blob_box = bblob->bounding_box ();
1011
1012 if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
1013 /* Real Blob - not multiple outlines or pre-chopped */
 // Shift the prev/current/next gap window along by one blob.
1014 prev_gap = current_gap;
1015 prev_within_xht_gap = current_within_xht_gap;
1016 prev_blob_box = next_blob_box;
1017 current_gap = next_gap;
1018 current_within_xht_gap = next_within_xht_gap;
1019 peek_at_next_gap(row,
1020 box_it,
1021 next_blob_box,
1022 next_gap,
1023 next_within_xht_gap);
1024
1025 int16_t prev_gap_arg = prev_gap;
1026 int16_t next_gap_arg = next_gap;
 // NOTE(review): listing line 1027, which opens the block closed at 1030,
 // is missing here. Inside that block the within-xheight gaps replace the
 // full gaps passed to make_a_word_break().
1028 prev_gap_arg = prev_within_xht_gap;
1029 next_gap_arg = next_within_xht_gap;
1030 }
1031 // Decide if a word-break should be inserted
1032 if (blob_box.left () > next_rep_char_word_right ||
1033 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1034 current_gap, current_within_xht_gap,
1035 next_blob_box, next_gap_arg,
1036 blanks, fuzzy_sp, fuzzy_non,
1037 prev_gap_was_a_space,
1038 break_at_next_gap) ||
1039 box_it.at_first()) {
1040 /* Form a new word out of the blobs collected */
1041 word = new WERD (&cblobs, prev_blanks, nullptr);
1042 word_count++;
1043 word_it.add_after_then_move (word);
1044 if (bol) {
1045 word->set_flag (W_BOL, true);
1046 bol = false;
1047 }
1048 if (prev_fuzzy_sp)
1049 //probably space
1050 word->set_flag (W_FUZZY_SP, true);
1051 else if (prev_fuzzy_non)
1052 word->set_flag (W_FUZZY_NON, true);
1053 //probably not
1054
1055 if (blob_box.left () > next_rep_char_word_right) {
1056 /* We need to insert a repeated char word */
1057 word = rep_char_it.extract ();
1058 word_it.add_after_then_move (word);
1059
1060 /* Set spaces before repeated char word */
1061 repetition_spacing = find_mean_blob_spacing (word);
1062 current_gap = word->bounding_box ().left () - prev_x;
1063 current_within_xht_gap = current_gap;
1064 if (current_gap > tosp_rep_space * repetition_spacing) {
1065 blanks =
1066 static_cast<uint8_t>(floor (current_gap / row->space_size));
1067 if (blanks < 1)
1068 blanks = 1;
1069 }
1070 else
1071 blanks = 0;
1072 if (tosp_debug_level > 5)
1073 tprintf
1074 ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1075 word->bounding_box ().left (),
1076 word->bounding_box ().bottom (),
1077 repetition_spacing, current_gap, blanks);
1078 word->set_blanks (blanks);
1079 //NO uncertainty
1080 word->set_flag (W_FUZZY_SP, false);
1081 word->set_flag (W_FUZZY_NON, false);
1082
1083 /* Set spaces after repeated char word (and leave current word set) */
1084 current_gap =
1085 blob_box.left () - next_rep_char_word_right;
1086 if (current_gap > tosp_rep_space * repetition_spacing) {
1087 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1088 if (blanks < 1)
1089 blanks = 1;
1090 }
1091 else
1092 blanks = 0;
1093 if (tosp_debug_level > 5)
1094 tprintf (" Rgap:%d (%d blanks)\n",
1095 current_gap, blanks);
1096 fuzzy_sp = false;
1097 fuzzy_non = false;
1098
1099 if (rep_char_it.empty ()) {
1100 next_rep_char_word_right = INT32_MAX;
1101 }
1102 else {
1103 rep_char_it.forward ();
1104 next_rep_char_word_right =
1105 rep_char_it.data ()->bounding_box ().right ();
1106 }
1107 }
1108
1109 if (box_it.at_first () && rep_char_it.empty ()) {
1110 //at end of line
1111 word->set_flag (W_EOL, true);
1112 xstarts[1] = prev_x;
1113 }
1114 else {
 // Carry the gap decision forward: it describes the space in front of
 // the NEXT word to be built.
1115 prev_blanks = blanks;
1116 prev_fuzzy_sp = fuzzy_sp;
1117 prev_fuzzy_non = fuzzy_non;
1118 }
1119 }
1120 }
1121 }
1122 while (!box_it.at_first ()); //until back at start
1123
1124 /* Insert any further repeated char words */
1125 while (!rep_char_it.empty ()) {
1126 word = rep_char_it.extract ();
1127 word_it.add_after_then_move (word);
1128
1129 /* Set spaces before repeated char word */
1130 repetition_spacing = find_mean_blob_spacing (word);
1131 current_gap = word->bounding_box ().left () - prev_x;
1132 if (current_gap > tosp_rep_space * repetition_spacing) {
1133 blanks = static_cast<uint8_t>(floor (current_gap / row->space_size));
1134 if (blanks < 1)
1135 blanks = 1;
1136 }
1137 else
1138 blanks = 0;
1139 if (tosp_debug_level > 5)
1140 tprintf(
1141 "Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1142 word->bounding_box().left(), word->bounding_box().bottom(),
1143 repetition_spacing, current_gap, blanks);
1144 word->set_blanks (blanks);
1145 //NO uncertainty
1146 word->set_flag (W_FUZZY_SP, false);
1147 word->set_flag (W_FUZZY_NON, false);
1148 prev_x = word->bounding_box ().right ();
1149 if (rep_char_it.empty ()) {
1150 //at end of line
1151 word->set_flag (W_EOL, true);
1152 xstarts[1] = prev_x;
1153 }
1154 else {
1155 rep_char_it.forward ();
1156 }
1157 }
 // Build the output ROW from the TO_ROW and hand it the accumulated words.
1158 real_row = new ROW (row,
1159 static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1160 word_it.set_to_list (real_row->word_list ());
1161 //put words in row
1162 word_it.add_list_after (&words);
1163 real_row->recalc_bounding_box ();
1164
1165 if (tosp_debug_level > 4) {
1166 tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1167 word_count,
1168 real_row->bounding_box ().left (),
1169 real_row->bounding_box ().bottom (),
1170 real_row->bounding_box ().right (),
1171 real_row->bounding_box ().top ());
1172 }
1173 return real_row;
1174 }
1175 return nullptr;
1176}
1177
1178/**********************************************************************
1179 * make_blob_words
1180 *
1181 * Converts words into blobs so that each blob is a single character.
1182 * Used for chopper test.
 *
 * Walks the row's blob list once. Outlines of blobs flagged joined_to_prev()
 * are appended to the previously collected C_BLOB. Each time the next blob
 * is not joined to its predecessor and cblobs is non-empty, the collected
 * blobs become one WERD with a blank count of 1. The first word gets W_BOL
 * and the word made when the iterator wraps gets W_EOL. Returns a ROW
 * allocated with new, or nullptr when the row has no blobs.
 *
 * NOTE(review): listing line 1184 (the function signature) is missing from
 * this listing. The rotation parameter is not referenced in the visible lines.
1183 **********************************************************************/
1185 TO_ROW *row, // row to make
1186 FCOORD rotation // for drawing
1187 ) {
1188 bool bol; // start of line
1189 ROW *real_row; // output row
1190 C_OUTLINE_IT cout_it;
1191 C_BLOB_LIST cblobs;
1192 C_BLOB_IT cblob_it = &cblobs;
1193 WERD_LIST words;
1194 WERD *word; // new word
1195 BLOBNBOX *bblob; // current blob
1196 TBOX blob_box; // bounding box
1197 BLOBNBOX_IT box_it; // iterator
1198 int16_t word_count = 0;
1199
1200 cblob_it.set_to_list(&cblobs);
1201 box_it.set_to_list(row->blob_list());
1202 // new words
1203 WERD_IT word_it(&words);
1204 bol = true;
1205 if (!box_it.empty()) {
1206
1207 do {
1208 bblob = box_it.data();
1209 blob_box = bblob->bounding_box();
1210 if (bblob->joined_to_prev()) {
1211 if (bblob->cblob() != nullptr) {
 // Merge this blob's outlines into the last collected C_BLOB, then
 // delete the now-redundant C_BLOB.
1212 cout_it.set_to_list(cblob_it.data()->out_list());
1213 cout_it.move_to_last();
1214 cout_it.add_list_after(bblob->cblob()->out_list());
1215 delete bblob->cblob();
1216 }
1217 } else {
1218 if (bblob->cblob() != nullptr)
1219 cblob_it.add_after_then_move(bblob->cblob());
1220 }
1221 box_it.forward(); // next one
1222 bblob = box_it.data();
1223 blob_box = bblob->bounding_box();
1224
 // The next blob starts a fresh character: flush what has been collected.
1225 if (!bblob->joined_to_prev() && !cblobs.empty()) {
1226 word = new WERD(&cblobs, 1, nullptr);
1227 word_count++;
1228 word_it.add_after_then_move(word);
1229 if (bol) {
1230 word->set_flag(W_BOL, true);
1231 bol = false;
1232 }
1233 if (box_it.at_first()) { // at end of line
1234 word->set_flag(W_EOL, true);
1235 }
1236 }
1237 }
1238 while (!box_it.at_first()); // until back at start
1239 /* Setup the row with created words. */
1240 real_row = new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1241 word_it.set_to_list(real_row->word_list());
1242 //put words in row
1243 word_it.add_list_after(&words);
1244 real_row->recalc_bounding_box();
1245 if (tosp_debug_level > 4) {
1246 tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1247 word_count,
1248 real_row->bounding_box().left(),
1249 real_row->bounding_box().bottom(),
1250 real_row->bounding_box().right(),
1251 real_row->bounding_box().top());
1252 }
1253 return real_row;
1254 }
1255 return nullptr;
1256}
1257
// make_a_word_break
//
// Decides whether the gap in front of blob_box ends the word being built and
// returns true when it does.
//
// Outputs (by reference):
//   blanks, fuzzy_sp, fuzzy_non - description of the gap; per the comment in
//       the body these are only meant to be used when true is returned.
//   prev_gap_was_a_space - updated at the end of the heuristic branch to
//       (space && !fuzzy_non); forced to true when prev_blob_box is null.
//   break_at_next_gap - when already set on entry it is cleared and true is
//       returned at once; the punctuation rule below may set it for the
//       following call.
//
// With tosp_old_to_method set, only row->max_nonspace, row->space_threshold,
// row->min_space and row->space_size are consulted. Otherwise the default
// decision (current_gap > row->space_threshold) may be flipped by a chain of
// rules that look at neighbouring gaps and blob widths; each flip is reported
// through mark_gap() with a rule id unless GRAPHICS_DISABLED is defined.
//
// NOTE(review): listing lines 1287, 1292, 1362, 1382, 1391, 1393, 1406, 1427,
// 1446, 1465, 1513 and 1545 (all parts of conditions) are missing from this
// listing; the affected spots are flagged below.
1258bool Textord::make_a_word_break(
1259 TO_ROW* row, // row being made
1260 TBOX blob_box, // for next_blob // how many blanks?
1261 int16_t prev_gap,
1262 TBOX prev_blob_box,
1263 int16_t real_current_gap,
1264 int16_t within_xht_current_gap,
1265 TBOX next_blob_box,
1266 int16_t next_gap,
1267 uint8_t& blanks,
1268 bool& fuzzy_sp,
1269 bool& fuzzy_non,
1270 bool& prev_gap_was_a_space,
1271 bool& break_at_next_gap) {
1272 bool space;
1273 int16_t current_gap;
1274 float fuzzy_sp_to_kn_limit;
1275
1276 if (break_at_next_gap) {
1277 break_at_next_gap = false;
1278 return true;
1279 }
1280 /* Inhibit using the reduced gap if
1281 The kerning is large - chars are not kerned and reducing "f"s can cause
1282 erroneous blanks
1283 OR The real gap is less than 0
1284 OR The real gap is less than the kerning estimate
1285 */
 // NOTE(review): listing line 1287 (middle of this condition) is missing.
1286 if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1288 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1289 //Ignore the difference
1290 within_xht_current_gap = real_current_gap;
1291
 // NOTE(review): listing line 1292 (the condition selecting the
 // within-xheight gap) is missing.
1293 current_gap = within_xht_current_gap;
1294 else
1295 current_gap = real_current_gap;
1296
1297 if (tosp_old_to_method) {
1298 //Boring old method
1299 space = current_gap > row->max_nonspace;
1300 if (space && (current_gap < INT16_MAX)) {
1301 if (current_gap < row->min_space) {
 // Between max_nonspace and min_space the gap is uncertain: it is
 // marked fuzzy and space_threshold picks which way it leans.
1302 if (current_gap > row->space_threshold) {
1303 blanks = 1;
1304 fuzzy_sp = true;
1305 fuzzy_non = false;
1306 }
1307 else {
1308 blanks = 0;
1309 fuzzy_sp = false;
1310 fuzzy_non = true;
1311 }
1312 }
1313 else {
1314 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1315 if (blanks < 1)
1316 blanks = 1;
1317 fuzzy_sp = false;
1318 fuzzy_non = false;
1319 }
1320 }
1321 return space;
1322 }
1323 else {
1324 /* New exciting heuristic method */
1325 if (prev_blob_box.null_box ()) // Beginning of row
1326 prev_gap_was_a_space = true;
1327
1328 //Default as old TO
1329 space = current_gap > row->space_threshold;
1330
1331 /* Set defaults for the word break in case we find one. Currently there are
1332 no fuzzy spaces. Depending on the reliability of the different heuristics
1333 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1334 be used if the function returns true - ie the word is to be broken.
1335 */
 // When space_size <= 1 the raw gap is used as the blank count; either way
 // the result is clipped into [1, UINT8_MAX].
1336 int num_blanks = current_gap;
1337 if (row->space_size > 1.0f)
1338 num_blanks = IntCastRounded(current_gap / row->space_size);
1339 blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1340 fuzzy_sp = false;
1341 fuzzy_non = false;
1342 /*
1343 If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1344 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1345 context.
1346 */
1347 if (tosp_use_xht_gaps &&
1348 (real_current_gap <= row->max_nonspace) &&
1349 (within_xht_current_gap > row->max_nonspace)) {
1350 space = true;
1351 fuzzy_non = true;
1352#ifndef GRAPHICS_DISABLED
1353 mark_gap (blob_box, 20,
1354 prev_gap, prev_blob_box.width (),
1355 current_gap, next_blob_box.width (), next_gap);
1356#endif
1357 }
1358 else if (tosp_use_xht_gaps &&
1359 (real_current_gap <= row->space_threshold) &&
1360 (within_xht_current_gap > row->space_threshold)) {
1361 space = true;
 // NOTE(review): listing line 1362 (the condition choosing fuzzy_sp over
 // fuzzy_non) is missing.
1363 fuzzy_sp = true;
1364 else
1365 fuzzy_non = true;
1366#ifndef GRAPHICS_DISABLED
1367 mark_gap (blob_box, 21,
1368 prev_gap, prev_blob_box.width (),
1369 current_gap, next_blob_box.width (), next_gap);
1370#endif
1371 }
1372 else if (tosp_use_xht_gaps &&
1373 (real_current_gap < row->min_space) &&
1374 (within_xht_current_gap >= row->min_space)) {
1375 space = true;
1376#ifndef GRAPHICS_DISABLED
1377 mark_gap (blob_box, 22,
1378 prev_gap, prev_blob_box.width (),
1379 current_gap, next_blob_box.width (), next_gap);
1380#endif
1381 }
 // NOTE(review): listing line 1382 (start of this else-if condition) is
 // missing. The visible part fires when the previous blob is not suspected
 // punctuation but the current one is, and defers the break by one gap.
1383 !suspected_punct_blob(row, prev_blob_box) &&
1384 suspected_punct_blob(row, blob_box)) {
1385 break_at_next_gap = true;
1386 }
1387 /* Now continue with normal heuristics */
1388 else if ((current_gap < row->min_space) &&
1389 (current_gap > row->space_threshold)) {
1390 /* Heuristics to turn dubious spaces to kerns */
 // NOTE(review): listing lines 1391 and 1393 (the guarding condition and
 // part of the limit expression) are missing.
1392 fuzzy_sp_to_kn_limit = row->kern_size +
1394 (row->space_size - row->kern_size);
1395 else
1396 fuzzy_sp_to_kn_limit = 99999.0f;
1397
1398 /* If current gap is significantly smaller than the previous space the other
1399 side of a narrow blob then this gap is a kern. */
1400 if ((prev_blob_box.width () > 0) &&
1401 narrow_blob (row, prev_blob_box) &&
1402 prev_gap_was_a_space &&
1403 (current_gap <= tosp_gap_factor * prev_gap)) {
1404 if ((tosp_all_flips_fuzzy) ||
1405 (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1406 (the inner condition) is missing.
1407 fuzzy_non = true;
1408 else
1409 fuzzy_sp = true;
1410 }
1411 else
1412 space = false;
1413#ifndef GRAPHICS_DISABLED
1414 mark_gap (blob_box, 1,
1415 prev_gap, prev_blob_box.width (),
1416 current_gap, next_blob_box.width (), next_gap);
1417#endif
1418 }
1419 /* If current gap not much bigger than the previous kern the other side of a
1420 narrow blob then this gap is a kern as well */
1421 else if ((prev_blob_box.width () > 0) &&
1422 narrow_blob (row, prev_blob_box) &&
1423 !prev_gap_was_a_space &&
1424 (current_gap * tosp_gap_factor <= prev_gap)) {
1425 if ((tosp_all_flips_fuzzy) ||
1426 (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1427 (the inner condition) is missing.
1428 fuzzy_non = true;
1429 else
1430 fuzzy_sp = true;
1431 }
1432 else
1433 space = false;
1434#ifndef GRAPHICS_DISABLED
1435 mark_gap (blob_box, 2,
1436 prev_gap, prev_blob_box.width (),
1437 current_gap, next_blob_box.width (), next_gap);
1438#endif
1439 }
 // Same two tests as above, mirrored onto the next blob and next gap.
1440 else if ((next_blob_box.width () > 0) &&
1441 narrow_blob (row, next_blob_box) &&
1442 (next_gap > row->space_threshold) &&
1443 (current_gap <= tosp_gap_factor * next_gap)) {
1444 if ((tosp_all_flips_fuzzy) ||
1445 (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1446 (the inner condition) is missing.
1447 fuzzy_non = true;
1448 else
1449 fuzzy_sp = true;
1450 }
1451 else
1452 space = false;
1453#ifndef GRAPHICS_DISABLED
1454 mark_gap (blob_box, 3,
1455 prev_gap, prev_blob_box.width (),
1456 current_gap, next_blob_box.width (), next_gap);
1457#endif
1458 }
1459 else if ((next_blob_box.width () > 0) &&
1460 narrow_blob (row, next_blob_box) &&
1461 (next_gap <= row->space_threshold) &&
1462 (current_gap * tosp_gap_factor <= next_gap)) {
1463 if ((tosp_all_flips_fuzzy) ||
1464 (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1465 (the inner condition) is missing.
1466 fuzzy_non = true;
1467 else
1468 fuzzy_sp = true;
1469 }
1470 else
1471 space = false;
1472#ifndef GRAPHICS_DISABLED
1473 mark_gap (blob_box, 4,
1474 prev_gap, prev_blob_box.width (),
1475 current_gap, next_blob_box.width (), next_gap);
1476#endif
1477 }
 // A narrow neighbour on either side with no rule above firing: keep the
 // space but mark it fuzzy.
1478 else if ((((next_blob_box.width () > 0) &&
1479 narrow_blob (row, next_blob_box)) ||
1480 ((prev_blob_box.width () > 0) &&
1481 narrow_blob (row, prev_blob_box)))) {
1482 fuzzy_sp = true;
1483#ifndef GRAPHICS_DISABLED
1484 mark_gap (blob_box, 6,
1485 prev_gap, prev_blob_box.width (),
1486 current_gap, next_blob_box.width (), next_gap);
1487#endif
1488 }
1489 }
1490 else if ((current_gap > row->max_nonspace) &&
1491 (current_gap <= row->space_threshold)) {
1492
1493 /* Heuristics to turn dubious kerns to spaces */
1494 /* TRIED THIS BUT IT MADE THINGS WORSE
1495 if (prev_gap == INT16_MAX)
1496 prev_gap = 0; // start of row
1497 if (next_gap == INT16_MAX)
1498 next_gap = 0; // end of row
1499 */
1500 if ((prev_blob_box.width () > 0) &&
1501 (next_blob_box.width () > 0) &&
1502 (current_gap >=
1503 tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1504 wide_blob (row, prev_blob_box) &&
1505 wide_blob (row, next_blob_box)) {
1506
1507 space = true;
1508 /*
1509 tosp_flip_caution is an attempt to stop the default changing in cases
1510 where there is a large difference between the kern and space estimates.
1511 See problem in 'chiefs' where "have" gets split in the quotation.
1512 */
 // NOTE(review): listing line 1513 (start of this condition) is missing.
1514 ((tosp_flip_caution <= 0) ||
1515 (tosp_flip_caution * row->kern_size > row->space_size)))
1516 fuzzy_sp = true;
1517 else
1518 fuzzy_non = true;
1519#ifndef GRAPHICS_DISABLED
1520 mark_gap (blob_box, 7,
1521 prev_gap, prev_blob_box.width (),
1522 current_gap, next_blob_box.width (), next_gap);
1523#endif
1524 } else if (prev_blob_box.width() > 0 &&
1525 next_blob_box.width() > 0 &&
1526 current_gap > 5 && // Rule 9 handles small gap, big ratio.
1527 current_gap >=
1528 tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1529 !(narrow_blob(row, prev_blob_box) ||
1530 suspected_punct_blob(row, prev_blob_box)) &&
1531 !(narrow_blob(row, next_blob_box) ||
1532 suspected_punct_blob(row, next_blob_box))) {
1533 space = true;
1534 fuzzy_non = true;
1535#ifndef GRAPHICS_DISABLED
1536 mark_gap (blob_box, 8,
1537 prev_gap, prev_blob_box.width (),
1538 current_gap, next_blob_box.width (), next_gap);
1539#endif
1540 }
1541 else if ((tosp_kern_gap_factor3 > 0) &&
1542 (prev_blob_box.width () > 0) &&
1543 (next_blob_box.width () > 0) &&
1544 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
 // NOTE(review): listing line 1545 (part of this condition) is missing.
1546 (!suspected_punct_blob (row, prev_blob_box) &&
1547 !suspected_punct_blob (row, next_blob_box)))) {
1548 space = true;
1549 fuzzy_non = true;
1550#ifndef GRAPHICS_DISABLED
1551 mark_gap (blob_box, 9,
1552 prev_gap, prev_blob_box.width (),
1553 current_gap, next_blob_box.width (), next_gap);
1554#endif
1555 }
1556 }
1557 if (tosp_debug_level > 10)
1558 tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1559 "next_gap = %d\n", space ? 1 : 0, current_gap,
1560 prev_gap, next_gap);
 // A break flagged fuzzy_non is not remembered as a space for the next call.
1561 prev_gap_was_a_space = space && !(fuzzy_non);
1562 return space;
1563 }
1564}
1565
// narrow_blob
//
// Reports whether blob_box counts as narrow for this row. The visible test
// accepts a box whose width is at most tosp_narrow_fraction * row->xheight,
// OR whose width/height ratio is at most some bound.
// NOTE(review): listing line 1570, which holds that aspect-ratio bound and
// closes the expression, is missing from this listing.
1566bool Textord::narrow_blob(TO_ROW* row, TBOX blob_box) {
1567 bool result;
1568 result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1569 ((static_cast<float>(blob_box.width ()) / blob_box.height ()) <=
1571 return result;
1572}
1573
// wide_blob
//
// Reports whether blob_box counts as wide for this row.
//   - tosp_wide_fraction <= 0: wide simply means "not narrow_blob()".
//   - tosp_wide_fraction > 0: the width must reach
//     tosp_wide_fraction * row->xheight; when tosp_wide_aspect_ratio > 0 the
//     width/height ratio must additionally exceed some bound.
// NOTE(review): listing line 1580, which holds that aspect-ratio bound and
// closes the expression, is missing from this listing.
1574bool Textord::wide_blob(TO_ROW* row, TBOX blob_box) {
1575 bool result;
1576 if (tosp_wide_fraction > 0) {
1577 if (tosp_wide_aspect_ratio > 0)
1578 result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1579 ((static_cast<float>(blob_box.width ()) / blob_box.height ()) >
1581 else
1582 result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1583 }
1584 else
1585 result = !narrow_blob (row, blob_box);
1586 return result;
1587}
1588
1589bool Textord::suspected_punct_blob(TO_ROW* row, TBOX box) {
1590 bool result;
1591 float baseline;
1592 float blob_x_centre;
1593 /* Find baseline of centre of blob */
1594 blob_x_centre = (box.right () + box.left ()) / 2.0;
1595 baseline = row->baseline.y (blob_x_centre);
1596
1597 result = (box.height () <= 0.66 * row->xheight) ||
1598 (box.top () < baseline + row->xheight / 2.0) ||
1599 (box.bottom () > baseline + row->xheight / 2.0);
1600 return result;
1601}
1602
1603
1604void Textord::peek_at_next_gap(TO_ROW *row,
1605 BLOBNBOX_IT box_it,
1606 TBOX &next_blob_box,
1607 int16_t &next_gap,
1608 int16_t &next_within_xht_gap) {
1609 TBOX next_reduced_blob_box;
1610 TBOX bit_beyond;
1611 BLOBNBOX_IT reduced_box_it = box_it;
1612
1613 next_blob_box = box_next (&box_it);
1614 next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1615 if (box_it.at_first ()) {
1616 next_gap = INT16_MAX;
1617 next_within_xht_gap = INT16_MAX;
1618 }
1619 else {
1620 bit_beyond = box_it.data ()->bounding_box ();
1621 next_gap = bit_beyond.left () - next_blob_box.right ();
1622 bit_beyond = reduced_box_next (row, &reduced_box_it);
1623 next_within_xht_gap =
1624 bit_beyond.left () - next_reduced_blob_box.right ();
1625 }
1626}
1627
1628
1629#ifndef GRAPHICS_DISABLED
// mark_gap
//
// Debug aid for the gap rules: maps the rule id to a ScrollView colour,
// draws an ellipse covering the gap to the left of the blob in the to_win
// window, and prints the rule and gap figures when tosp_debug_level > 5.
// Rule ids without their own case fall back to BLACK.
// NOTE(review): listing line 1682, which opens the block closed at listing
// line 1695 around the drawing calls, is missing from this listing.
1630void Textord::mark_gap(
1631 TBOX blob, // blob following gap
1632 int16_t rule, // heuristic id
1633 int16_t prev_gap,
1634 int16_t prev_blob_width,
1635 int16_t current_gap,
1636 int16_t next_blob_width,
1637 int16_t next_gap) {
1638 ScrollView::Color col; //of ellipse marking flipped gap
1639
1640 switch (rule) {
1641 case 1:
1642 col = ScrollView::RED;
1643 break;
1644 case 2:
1645 col = ScrollView::CYAN;
1646 break;
1647 case 3:
1648 col = ScrollView::GREEN;
1649 break;
1650 case 4:
1651 col = ScrollView::BLACK;
1652 break;
1653 case 5:
1654 col = ScrollView::MAGENTA;
1655 break;
1656 case 6:
1657 col = ScrollView::BLUE;
1658 break;
1659
1660 case 7:
1661 col = ScrollView::WHITE;
1662 break;
1663 case 8:
1664 col = ScrollView::YELLOW;
1665 break;
1666 case 9:
1667 col = ScrollView::BLACK;
1668 break;
1669
1670 case 20:
1671 col = ScrollView::CYAN;
1672 break;
1673 case 21:
1674 col = ScrollView::GREEN;
1675 break;
1676 case 22:
1677 col = ScrollView::MAGENTA;
1678 break;
1679 default:
1680 col = ScrollView::BLACK;
1681 }
1683 to_win->Pen(col);
1684 /* if (rule < 20)
1685 //interior_style(to_win, INT_SOLID, false);
1686 else
1687 //interior_style(to_win, INT_HOLLOW, true);*/
 // The ellipse is centred half a gap to the left of the blob, at the blob's
 // vertical middle, with radii of half the gap and half the blob height.
1688 //x radius
1689 to_win->Ellipse (current_gap / 2.0f,
1690 blob.height () / 2.0f, //y radius
1691 //x centre
1692 blob.left () - current_gap / 2.0f,
1693 //y centre
1694 blob.bottom () + blob.height () / 2.0f);
1695 }
1696 if (tosp_debug_level > 5)
1697 tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n",
1698 blob.left() - current_gap / 2, blob.bottom(), rule, prev_gap,
1699 prev_blob_width, current_gap, next_blob_width, next_gap);
1700}
1701#endif
1702
1703float Textord::find_mean_blob_spacing(WERD *word) {
1704 C_BLOB_IT cblob_it;
1705 TBOX blob_box;
1706 int32_t gap_sum = 0;
1707 int16_t gap_count = 0;
1708 int16_t prev_right;
1709
1710 cblob_it.set_to_list (word->cblob_list ());
1711 if (!cblob_it.empty ()) {
1712 cblob_it.mark_cycle_pt ();
1713 prev_right = cblob_it.data ()->bounding_box ().right ();
1714 //first blob
1715 cblob_it.forward ();
1716 for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1717 blob_box = cblob_it.data ()->bounding_box ();
1718 gap_sum += blob_box.left () - prev_right;
1719 gap_count++;
1720 prev_right = blob_box.right ();
1721 }
1722 }
1723 if (gap_count > 0)
1724 return (gap_sum / static_cast<float>(gap_count));
1725 else
1726 return 0.0f;
1727}
1728
1729
1730bool Textord::ignore_big_gap(TO_ROW* row,
1731 int32_t row_length,
1732 GAPMAP* gapmap,
1733 int16_t left,
1734 int16_t right) {
1735 int16_t gap = right - left + 1;
1736
1737 if (tosp_ignore_big_gaps > 999) return false; // Don't ignore
1738 if (tosp_ignore_big_gaps > 0)
1739 return (gap > tosp_ignore_big_gaps * row->xheight);
1740 if (gap > tosp_ignore_very_big_gaps * row->xheight)
1741 return true;
1742 if (tosp_ignore_big_gaps == 0) {
1743 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1744 return true;
1745 if ((gap > 1.75 * row->xheight) &&
1746 ((row_length > 35 * row->xheight) ||
1747 gapmap->table_gap (left, right)))
1748 return true;
1749 }
1750 else {
1751 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1752 if ((gap > gapmap_big_gaps * row->xheight) &&
1753 gapmap->table_gap (left, right))
1754 return true;
1755 }
1756 return false;
1757}
1758
1759/**********************************************************************
1760 * reduced_box_next
1761 *
1762 * Compute the bounding box of this blob with merging of x overlaps
1763 * but no pre-chopping.
1764 * Then move the iterator on to the start of the next blob.
1765 * DON'T reduce the box for small things - eg punctuation.
 *
 * The result is cached on the head blob through set_reduced_box(); when
 * red_box_set() is already true the cached box is returned and the
 * iterator is only advanced. The reduced box is kept only if it has
 * positive width, is taller than 0.7 * row->xheight, and its left edge plus
 * tosp_near_lh_edge of its width lies left of the blob's extent above the
 * x-height; otherwise the full box is used.
 *
 * NOTE(review): listing lines 1810-1811 (the body of the GRAPHICS_DISABLED
 * guard) are missing from this listing.
1766 **********************************************************************/
1767TBOX Textord::reduced_box_next(
1768 TO_ROW *row, // current row
1769 BLOBNBOX_IT *it // iterator to blobs
1770 ) {
1771 BLOBNBOX *blob; //current blob
1772 BLOBNBOX *head_blob; //place to store box
1773 TBOX full_box; //full blob bounding box
1774 TBOX reduced_box; //box of significant part
1775 int16_t left_above_xht; //ABOVE xht left limit
1776 int16_t new_left_above_xht; //ABOVE xht left limit
1777
1778 blob = it->data ();
1779 if (blob->red_box_set ()) {
 // Cached result: just skip forward to the next real blob.
1780 reduced_box = blob->reduced_box ();
1781 do {
1782 it->forward();
1783 blob = it->data();
1784 }
1785 while (blob->cblob() == nullptr || blob->joined_to_prev());
1786 return reduced_box;
1787 }
1788 head_blob = blob;
1789 full_box = blob->bounding_box ();
1790 reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
1791 do {
1792 it->forward ();
1793 blob = it->data ();
1794 if (blob->cblob() == nullptr)
1795 //was pre-chopped
1796 full_box += blob->bounding_box ();
1797 else if (blob->joined_to_prev ()) {
 // Joined pieces widen the reduced box and may lower the left limit
 // found above the x-height.
1798 reduced_box +=
1799 reduced_box_for_blob(blob, row, &new_left_above_xht);
1800 left_above_xht = std::min(left_above_xht, new_left_above_xht);
1801 }
1802 }
1803 //until next real blob
1804 while (blob->cblob() == nullptr || blob->joined_to_prev());
1805
1806 if ((reduced_box.width () > 0) &&
1807 ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1808 < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1809#ifndef GRAPHICS_DISABLED
 // NOTE(review): listing lines 1810-1811 are missing here.
1812#endif
1813 }
1814 else
1815 reduced_box = full_box;
1816 head_blob->set_reduced_box (reduced_box);
1817 return reduced_box;
1818}
1819
1820
1821/*************************************************************************
1822 * reduced_box_for_blob()
1823 * Find box for blob which is the same height and y position as the whole blob,
1824 * but whose left limit is the left most position of the blob ABOVE the
1825 * baseline and whose right limit is the right most position of the blob BELOW
1826 * the xheight.
1827 *
1828 *
1829 * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1830 * "home". Perhaps we need something which say if the width ABOVE the
1831 * xht alone includes the whole of the reduced width, then use the full
1832 * blob box - Might still fail on italic F
1833 *
1834 * Alternatively we could be a little less severe and only reduce the
1835 * left and right edges by half the difference between the full box and
1836 * the reduced box.
1837 *
1838 * NOTE that we need to rotate all the coordinates as
1839 * find_blob_limits finds the y min and max within a specified x band
1840 *************************************************************************/
1841TBOX Textord::reduced_box_for_blob(
1842 BLOBNBOX *blob,
1843 TO_ROW *row,
1844 int16_t *left_above_xht) {
1845 float baseline;
1846 float blob_x_centre;
1847 float left_limit;
1848 float right_limit;
1849 float junk;
1850 TBOX blob_box;
1851
1852 /* Find baseline of centre of blob */
1853
1854 blob_box = blob->bounding_box ();
1855 blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1856 baseline = row->baseline.y (blob_x_centre);
1857
1858 /*
1859 Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1860 caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1861 */
1862 left_limit = static_cast<float>(INT32_MAX);
1863 junk = static_cast<float>(-INT32_MAX);
1864 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1865 static_cast<float>(INT16_MAX), left_limit, junk);
1866 if (left_limit > junk)
1867 *left_above_xht = INT16_MAX; //No area above xht
1868 else
1869 *left_above_xht = static_cast<int16_t>(floor (left_limit));
1870 /*
1871 Find reduced LH limit of blob - the left extent of the region ABOVE the
1872 baseline.
1873 */
1874 left_limit = static_cast<float>(INT32_MAX);
1875 junk = static_cast<float>(-INT32_MAX);
1876 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX),
1877 left_limit, junk);
1878
1879 if (left_limit > junk)
1880 return TBOX (); //no area within xht so return empty box
1881 /*
1882 Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1883 */
1884 junk = static_cast<float>(INT32_MAX);
1885 right_limit = static_cast<float>(-INT32_MAX);
1886 find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX),
1887 (baseline + row->xheight), junk, right_limit);
1888 if (junk > right_limit)
1889 return TBOX (); //no area within xht so return empty box
1890
1891 return TBOX (ICOORD (static_cast<int16_t>(floor (left_limit)), blob_box.bottom ()),
1892 ICOORD (static_cast<int16_t>(ceil (right_limit)), blob_box.top ()));
1893}
1894} // namespace tesseract
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:636
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:665
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:576
@ PITCH_DEF_PROP
Definition: blobbox.h:49
@ PITCH_CORR_PROP
Definition: blobbox.h:52
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_EOL
end of line
Definition: werd.h:33
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_BOL
start of line
Definition: werd.h:32
int IntCastRounded(double x)
Definition: helpers.h:175
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ baseline
Definition: mfoutline.h:63
ScrollView * to_win
Definition: drawtord.cpp:35
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:247
double gapmap_big_gaps
Definition: gap_map.cpp:18
#define MAXSPACING
Definition: tospace.cpp:41
bool textord_show_initial_words
Definition: tovars.cpp:23
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:249
const TBOX & bounding_box() const
Definition: blobbox.h:230
C_BLOB * cblob() const
Definition: blobbox.h:268
bool joined_to_prev() const
Definition: blobbox.h:256
bool red_box_set() const
Definition: blobbox.h:259
const TBOX & reduced_box() const
Definition: blobbox.h:246
float fixed_pitch
Definition: blobbox.h:651
QSPLINE baseline
Definition: blobbox.h:670
int32_t space_threshold
Definition: blobbox.h:665
float xheight
Definition: blobbox.h:657
PITCH_TYPE pitch_decision
Definition: blobbox.h:650
int32_t max_nonspace
Definition: blobbox.h:664
WERD_LIST rep_words
Definition: blobbox.h:668
float kern_size
Definition: blobbox.h:666
int32_t min_space
Definition: blobbox.h:663
float space_size
Definition: blobbox.h:667
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:600
TO_ROW_LIST * get_rows()
Definition: blobbox.h:704
Definition: ocrrow.h:37
WERD_LIST * word_list()
Definition: ocrrow.h:55
TBOX bounding_box() const
Definition: ocrrow.h:88
void recalc_bounding_box()
Definition: ocrrow.cpp:100
integer coordinate
Definition: points.h:32
Definition: points.h:189
double y(double x) const
Definition: quspline.cpp:209
Definition: rect.h:34
void plot(ScrollView *fd) const
Definition: rect.h:286
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
bool null_box() const
Definition: rect.h:50
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
int32_t pile_count(int32_t value) const
Definition: statistc.h:76
double mean() const
Definition: statistc.cpp:127
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
int32_t get_total() const
Definition: statistc.h:84
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
Definition: werd.h:56
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void set_blanks(uint8_t new_blanks)
Definition: werd.h:102
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
TBOX bounding_box() const
Definition: werd.cpp:148
Definition: gap_map.h:17
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:161
bool tosp_only_use_xht_gaps
Definition: textord.h:295
bool tosp_old_to_constrain_sp_kn
Definition: textord.h:266
double tosp_min_sane_kn_sp
Definition: textord.h:353
double tosp_wide_fraction
Definition: textord.h:323
double tosp_table_fuzzy_kn_sp_ratio
Definition: textord.h:349
bool tosp_only_small_gaps_for_kern
Definition: textord.h:286
bool tosp_block_use_cert_spaces
Definition: textord.h:277
double tosp_pass_wide_fuzz_sp_to_context
Definition: textord.h:371
double tosp_init_guess_xht_mult
Definition: textord.h:357
double tosp_fuzzy_kn_fraction
Definition: textord.h:350
double tosp_kern_gap_factor1
Definition: textord.h:334
double tosp_fuzzy_space_factor2
Definition: textord.h:331
double tosp_table_xht_sp_ratio
Definition: textord.h:347
double tosp_wide_aspect_ratio
Definition: textord.h:325
bool tosp_all_flips_fuzzy
Definition: textord.h:287
double tosp_large_kerning
Definition: textord.h:363
bool tosp_only_use_prop_rows
Definition: textord.h:268
bool tosp_old_to_method
Definition: textord.h:263
bool tosp_force_wordbreak_on_punct
Definition: textord.h:271
bool tosp_recovery_isolated_row_stats
Definition: textord.h:285
bool tosp_use_pre_chopping
Definition: textord.h:273
int tosp_sanity_method
Definition: textord.h:311
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:891
double tosp_kern_gap_factor2
Definition: textord.h:336
double tosp_threshold_bias1
Definition: textord.h:316
double tosp_dont_fool_with_small_kerns
Definition: textord.h:365
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:44
double tosp_threshold_bias2
Definition: textord.h:318
double tosp_rep_space
Definition: textord.h:341
bool tosp_rule_9_test_punct
Definition: textord.h:297
bool tosp_narrow_blobs_not_cert
Definition: textord.h:281
bool tosp_flip_fuzz_sp_to_kn
Definition: textord.h:299
bool tosp_row_use_cert_spaces1
Definition: textord.h:283
bool tosp_improve_thresh
Definition: textord.h:301
double tosp_silly_kn_sp_gap
Definition: textord.h:369
bool tosp_stats_use_xht_gaps
Definition: textord.h:291
bool tosp_flip_fuzz_kn_to_sp
Definition: textord.h:298
double tosp_max_sane_kn_thresh
Definition: textord.h:359
double tosp_fuzzy_space_factor
Definition: textord.h:327
double tosp_enough_small_gaps
Definition: textord.h:343
bool tosp_use_xht_gaps
Definition: textord.h:293
int tosp_redo_kern_limit
Definition: textord.h:306
int tosp_enough_space_samples_for_median
Definition: textord.h:304
double tosp_init_guess_kn_mult
Definition: textord.h:355
double tosp_table_kn_sp_ratio
Definition: textord.h:345
bool tosp_fuzzy_limit_all
Definition: textord.h:289
bool tosp_old_to_bug_fix
Definition: textord.h:275
double tosp_kern_gap_factor3
Definition: textord.h:338
double tosp_fuzzy_sp_fraction
Definition: textord.h:351
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1184
double tosp_fuzzy_space_factor1
Definition: textord.h:329
double tosp_ignore_very_big_gaps
Definition: textord.h:340
double tosp_gap_factor
Definition: textord.h:332
bool tosp_row_use_cert_spaces
Definition: textord.h:279
double tosp_flip_caution
Definition: textord.h:361
double tosp_ignore_big_gaps
Definition: textord.h:339
double tosp_near_lh_edge
Definition: textord.h:367
double tosp_old_sp_kn_th_factor
Definition: textord.h:314
double tosp_narrow_aspect_ratio
Definition: textord.h:322
double tosp_narrow_fraction
Definition: textord.h:320
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:609
void Pen(Color color)
Definition: scrollview.cpp:719