tesseract 4.1.1
Loading...
Searching...
No Matches
fixspace.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: fixspace.cpp (Formerly fixspace.c)
3 * Description: Implements a pass over the page res, exploring the alternative
4 * spacing possibilities, trying to use context to improve the
5 * word spacing
6 * Author: Phil Cheatle
7 *
8 * (C) Copyright 1993, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21#include "fixspace.h"
22#include <cstdint> // for INT16_MAX, int16_t, int32_t
23#include "blobs.h" // for TWERD, TBLOB, TESSLINE
24#include "boxword.h" // for BoxWord
25#include "errcode.h" // for ASSERT_HOST
26#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
27#include "ocrclass.h" // for ETEXT_DESC
28#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
29#include "params.h" // for IntParam, StringParam, BoolParam, Doub...
30#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
31#include "rect.h" // for TBOX
32#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
33#include "strngs.h" // for STRING
34#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
35#include "tessvars.h" // for debug_fp
36#include "tprintf.h" // for tprintf
37#include "unichar.h" // for UNICHAR_ID
38#include "unicharset.h" // for UNICHARSET
39#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
40
41class BLOCK;
42class ROW;
43
44#define PERFECT_WERDS 999
45
46namespace tesseract {
47
48/**********************************************************************
49 * c_blob_comparator()
50 *
51 * Blob comparator used to sort a blob list so that blobs are in increasing
52 * order of left edge.
53 **********************************************************************/
54
55static int c_blob_comparator( // sort blobs
56 const void *blob1p, // ptr to ptr to blob1
57 const void *blob2p // ptr to ptr to blob2
58 ) {
59 const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB* const*>(blob1p);
60 const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB* const*>(blob2p);
61
62 return blob1->bounding_box ().left () - blob2->bounding_box ().left ();
63}
64
// Remainder of Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
// int32_t word_count, PAGE_RES *page_res); the first line of the signature
// is not present in this listing.
//
// Walks every block, row and word of page_res. Words that are not followed
// by a fuzzy space/non-space are handed one at a time to fix_sp_fp_word();
// runs of words linked by W_FUZZY_NON / W_FUZZY_SP flags are cut out into a
// temporary list, passed to fix_fuzzy_space_list(), and spliced back in.
// When monitor is non-null its progress is updated after each word and the
// function returns early if the deadline is exceeded or the cancel callback
// returns true.
76 int32_t word_count,
77 PAGE_RES *page_res) {
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
82 WERD_RES *word_res;
83 WERD_RES_LIST fuzzy_space_words;
84 int16_t new_length;
85 bool prevent_null_wd_fixsp; // DON'T process blobless wds
86 int32_t word_index; // current word
87
88 block_res_it.set_to_list(&page_res->block_res_list);
89 word_index = 0;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91 block_res_it.forward()) {
92 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94 row_res_it.forward()) {
95 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96 while (!word_res_it_from.at_last()) {
97 word_res = word_res_it_from.data();
// Phase 1: step over words whose successor carries no fuzzy flag (and
// which are not combinations), fixing each one individually.
98 while (!word_res_it_from.at_last() &&
99 !(word_res->combination ||
100 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
101 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
102 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
103 block_res_it.data()->block);
104 word_res = word_res_it_from.forward();
105 word_index++;
106 if (monitor != nullptr) {
107 monitor->ocr_alive = true;
// NOTE(review): divides by word_count; assumes the caller never passes 0
// when the page has words - confirm.
108 monitor->progress = 90 + 5 * word_index / word_count;
109 if (monitor->deadline_exceeded() ||
110 (monitor->cancel != nullptr &&
111 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
112 return;
113 }
114 }
115
// Phase 2: word_res_it_from now sits on the first word of a fuzzy run.
// word_res_it_to is advanced to the last word of that run.
116 if (!word_res_it_from.at_last()) {
117 word_res_it_to = word_res_it_from;
118 prevent_null_wd_fixsp =
119 word_res->word->cblob_list()->empty();
120 if (check_debug_pt(word_res, 60))
121 debug_fix_space_level.set_value(10);
122 word_res_it_to.forward();
123 word_index++;
124 if (monitor != nullptr) {
125 monitor->ocr_alive = true;
126 monitor->progress = 90 + 5 * word_index / word_count;
127 if (monitor->deadline_exceeded() ||
128 (monitor->cancel != nullptr &&
129 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
130 return;
131 }
132 while (!word_res_it_to.at_last () &&
133 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
134 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
135 if (check_debug_pt(word_res, 60))
136 debug_fix_space_level.set_value(10);
137 if (word_res->word->cblob_list()->empty())
138 prevent_null_wd_fixsp = true;
139 word_res = word_res_it_to.forward();
140 }
141 if (check_debug_pt(word_res, 60))
142 debug_fix_space_level.set_value(10);
143 if (word_res->word->cblob_list()->empty())
144 prevent_null_wd_fixsp = true;
// A run containing any word with an empty blob list is skipped untouched.
145 if (prevent_null_wd_fixsp) {
146 word_res_it_from = word_res_it_to;
147 } else {
// Cut the run out, let fix_fuzzy_space_list() pick the best spacing, then
// splice the result back and step past the words that were re-inserted.
148 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
149 &word_res_it_to);
150 fix_fuzzy_space_list(fuzzy_space_words,
151 row_res_it.data()->row,
152 block_res_it.data()->block);
153 new_length = fuzzy_space_words.length();
154 word_res_it_from.add_list_before(&fuzzy_space_words);
155 for (;
156 !word_res_it_from.at_last() && new_length > 0;
157 new_length--) {
158 word_res_it_from.forward();
159 }
160 }
// NOTE(review): test_pt is not declared in this block; presumably a flag
// defined elsewhere that marks an active debug point - confirm.
161 if (test_pt)
162 debug_fix_space_level.set_value(0);
163 }
164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
165 block_res_it.data()->block);
166 // Last word in row
167 }
168 }
169 }
170}
171
172void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
173 ROW *row,
174 BLOCK* block) {
175 int16_t best_score;
176 WERD_RES_LIST current_perm;
177 int16_t current_score;
178 bool improved = false;
179
180 best_score = eval_word_spacing(best_perm); // default score
181 dump_words(best_perm, best_score, 1, improved);
182
183 if (best_score != PERFECT_WERDS)
184 initialise_search(best_perm, current_perm);
185
186 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
187 match_current_words(current_perm, row, block);
188 current_score = eval_word_spacing(current_perm);
189 dump_words(current_perm, current_score, 2, improved);
190 if (current_score > best_score) {
191 best_perm.clear();
192 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
193 best_score = current_score;
194 improved = true;
195 }
196 if (current_score < PERFECT_WERDS)
197 transform_to_next_perm(current_perm);
198 }
199 dump_words(best_perm, best_score, 3, improved);
200}
201
202} // namespace tesseract
203
204void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
205 WERD_RES_IT src_it(&src_list);
206 WERD_RES_IT new_it(&new_list);
207 WERD_RES *src_wd;
208 WERD_RES *new_wd;
209
210 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
211 src_wd = src_it.data();
212 if (!src_wd->combination) {
213 new_wd = WERD_RES::deep_copy(src_wd);
214 new_wd->combination = false;
215 new_wd->part_of_combo = false;
216 new_it.add_after_then_move(new_wd);
217 }
218 }
219}
220
221
222namespace tesseract {
// Runs pass-2 classification over the words in the list.
//
// prev_word_best_choice_ is reset to nullptr first. Every word that is not
// part of a combination and has no box_word yet is wrapped in a WordData,
// prepared with SetupWordPassN(2, ...) and passed to
// classify_word_and_language(2, nullptr, ...).
//
// @param words  list of words to classify in place
// @param row    row handed to each WordData
// @param block  block handed to each WordData
223void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
224 BLOCK* block) {
225 WERD_RES_IT word_it(&words);
226 WERD_RES *word;
227 // Since we are not using PAGE_RES to iterate over words, we need to update
228 // prev_word_best_choice_ before calling classify_word_pass2().
229 prev_word_best_choice_ = nullptr;
230 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231 word = word_it.data();
232 if ((!word->part_of_combo) && (word->box_word == nullptr)) {
233 WordData word_data(block, row, word);
234 SetupWordPassN(2, &word_data);
235 classify_word_and_language(2, nullptr, &word_data);
236 }
// NOTE(review): this listing jumps from line 236 to 238, so one statement
// at the end of the loop body is not shown here. Given the comment above,
// it presumably updates prev_word_best_choice_ - confirm against upstream.
238 }
239}
240
// Scores the spacing of a word list.
//
// Words are visited in order, skipping entries flagged part_of_combo, until
// the iterator wraps round to the first element. A word's score (its reject
// map length) is only banked once the following word has been examined: it
// is added to total_score unless the join between the two words looks like
// a split number (previous word ended in a "1"-like character and this one
// starts with a digit, or the reverse). Additional points are added inside
// each word for adjacent '1' characters and adjacent punctuation.
//
// @return PERFECT_WERDS when every counted word was judged done, otherwise
//         the accumulated total_score.
//
// NOTE(review): the do-loop reads word_res_it.data() before any emptiness
// check; assumes callers never pass an empty list - confirm.
266int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
267 WERD_RES_IT word_res_it(&word_res_list);
268 int16_t total_score = 0;
269 int16_t word_count = 0;
270 int16_t done_word_count = 0;
271 int16_t word_len;
272 int16_t i;
273 int16_t offset;
274 WERD_RES *word; // current word
275 int16_t prev_word_score = 0;
276 bool prev_word_done = false;
277 bool prev_char_1 = false; // prev ch a "1/I/l"?
278 bool prev_char_digit = false; // prev ch 2..9 or 0
279 bool current_char_1 = false;
280 bool current_word_ok_so_far;
281 STRING punct_chars = "!\"`',.:;";
282 bool prev_char_punct = false;
283 bool current_char_punct = false;
284 bool word_done = false;
285
286 do {
287 word = word_res_it.data();
288 word_done = fixspace_thinks_word_done(word);
289 word_count++;
// A failed word banks the previous word's pending score and resets all
// carried-over state.
290 if (word->tess_failed) {
291 total_score += prev_word_score;
292 if (prev_word_done)
293 done_word_count++;
294 prev_word_score = 0;
295 prev_char_1 = false;
296 prev_char_digit = false;
297 prev_word_done = false;
298 } else {
299 /*
300 Can we add the prev word score and potentially count this word?
301 Yes IF it didn't end in a 1 when the first char of this word is a digit
302 AND it didn't end in a digit when the first char of this word is a 1
303 */
304 word_len = word->reject_map.length();
305 current_word_ok_so_far = false;
306 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
307 (prev_char_digit && (
308 (word_done &&
309 word->best_choice->unichar_lengths().string()[0] == 1 &&
310 word->best_choice->unichar_string()[0] == '1') ||
311 (!word_done && STRING(conflict_set_I_l_1).contains(
312 word->best_choice->unichar_string()[0])))))) {
313 total_score += prev_word_score;
314 if (prev_word_done)
315 done_word_count++;
316 current_word_ok_so_far = word_done;
317 }
318
319 if (current_word_ok_so_far) {
320 prev_word_done = true;
321 prev_word_score = word_len;
322 } else {
323 prev_word_done = false;
324 prev_word_score = 0;
325 }
326
327 /* Add 1 to total score for every joined 1 regardless of context and
328 rejtn */
329 for (i = 0, prev_char_1 = false; i < word_len; i++) {
330 current_char_1 = word->best_choice->unichar_string()[i] == '1';
331 if (prev_char_1 || (current_char_1 && (i > 0)))
332 total_score++;
333 prev_char_1 = current_char_1;
334 }
335
336 /* Add 1 to total score for every joined punctuation regardless of context
337 and rejtn */
// NOTE(review): this listing jumps from line 337 to 339; the missing line
// presumably opens the scope that the extra closing brace at line 347
// ends. As shown, the braces do not balance - restore from upstream.
339 for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
340 offset += word->best_choice->unichar_lengths()[i++]) {
341 current_char_punct =
342 punct_chars.contains(word->best_choice->unichar_string()[offset]);
343 if (prev_char_punct || (current_char_punct && i > 0))
344 total_score++;
345 prev_char_punct = current_char_punct;
346 }
347 }
// Record what the last character of this word looks like, for the join
// test against the next word. The empty-bodied loop below only advances
// offset to the start of the final character.
348 prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
349 for (i = 0, offset = 0; i < word_len - 1;
350 offset += word->best_choice->unichar_lengths()[i++]);
351 prev_char_1 =
352 ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
353 || (!word_done && STRING(conflict_set_I_l_1).contains(
354 word->best_choice->unichar_string()[offset])));
355 }
356 /* Find next word */
357 do {
358 word_res_it.forward();
359 } while (word_res_it.data()->part_of_combo);
360 } while (!word_res_it.at_first());
// Bank the score of the final word, which has no successor to trigger it.
361 total_score += prev_word_score;
362 if (prev_word_done)
363 done_word_count++;
364 if (done_word_count == word_count)
365 return PERFECT_WERDS;
366 else
367 return total_score;
368}
369
370bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
371 int i;
372 int offset;
373
374 for (i = 0, offset = 0; i < char_position;
375 offset += word->best_choice->unichar_lengths()[i++]);
376 return (
377 word->uch_set->get_isdigit(
378 word->best_choice->unichar_string().string() + offset,
379 word->best_choice->unichar_lengths()[i]) ||
380 (word->best_choice->permuter() == NUMBER_PERM &&
381 STRING(numeric_punctuation).contains(
382 word->best_choice->unichar_string().string()[offset])));
383}
384
385} // namespace tesseract
386
387
// Advances a word list to its next spacing candidate by closing the
// narrowest gap(s).
//
// Pass 1 measures the horizontal gap between each pair of consecutive words
// that are not flagged part_of_combo and remembers the smallest one.
// Pass 2 joins every such pair whose gap is <= that minimum: the words are
// merged into a combination WERD_RES (created and inserted before the first
// word of the pair if one does not already exist) and the originals are
// marked part_of_combo. If no gap was measured at all, the list is cleared,
// which callers treat as the end of the search.
399void transform_to_next_perm(WERD_RES_LIST &words) {
400 WERD_RES_IT word_it(&words);
401 WERD_RES_IT prev_word_it(&words);
402 WERD_RES *word;
403 WERD_RES *prev_word;
404 WERD_RES *combo;
405 WERD *copy_word;
406 int16_t prev_right = -INT16_MAX;
407 TBOX box;
408 int16_t gap;
409 int16_t min_gap = INT16_MAX;
410
// Pass 1: find the smallest gap. prev_right == -INT16_MAX means "no
// previous word seen yet".
411 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
412 word = word_it.data();
413 if (!word->part_of_combo) {
414 box = word->word->bounding_box();
415 if (prev_right > -INT16_MAX) {
416 gap = box.left() - prev_right;
417 if (gap < min_gap)
418 min_gap = gap;
419 }
420 prev_right = box.right();
421 }
422 }
423 if (min_gap < INT16_MAX) {
424 prev_right = -INT16_MAX; // back to start
425 word_it.set_to_list(&words);
426 // Note: we can't use cycle_pt due to inserted combos at start of list.
// Pass 2: prev_word_it trails word_it and marks the word (or combination)
// that the current word would be joined onto.
427 for (; (prev_right == -INT16_MAX) || !word_it.at_first();
428 word_it.forward()) {
429 word = word_it.data();
430 if (!word->part_of_combo) {
431 box = word->word->bounding_box();
432 if (prev_right > -INT16_MAX) {
433 gap = box.left() - prev_right;
434 if (gap <= min_gap) {
435 prev_word = prev_word_it.data();
436 if (prev_word->combination) {
437 combo = prev_word;
438 } else {
439 /* Make a new combination and insert before
440 * the first word being joined. */
441 copy_word = new WERD;
442 *copy_word = *(prev_word->word);
443 // deep copy
444 combo = new WERD_RES(copy_word);
445 combo->combination = true;
446 combo->x_height = prev_word->x_height;
447 prev_word->part_of_combo = true;
448 prev_word_it.add_before_then_move(combo);
449 }
// The combination takes over the end-of-line flag of the word being
// appended to it.
450 combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
451 if (word->combination) {
452 combo->word->join_on(word->word);
453 // Move blobs to combo
454 // old combo no longer needed
455 delete word_it.extract();
456 } else {
457 // Copy current wd to combo
458 combo->copy_on(word);
459 word->part_of_combo = true;
460 }
// The merged word's contents changed, so any earlier results are dropped.
461 combo->done = false;
462 combo->ClearResults();
463 } else {
464 prev_word_it = word_it; // catch up
465 }
466 }
467 prev_right = box.right();
468 }
469 }
470 } else {
471 words.clear(); // signal termination
472 }
473}
474
475namespace tesseract {
476void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
477 int16_t mode, bool improved) {
478 WERD_RES_IT word_res_it(&perm);
479
480 if (debug_fix_space_level > 0) {
481 if (mode == 1) {
482 stats_.dump_words_str = "";
483 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484 word_res_it.forward()) {
485 if (!word_res_it.data()->part_of_combo) {
486 stats_.dump_words_str +=
487 word_res_it.data()->best_choice->unichar_string();
488 stats_.dump_words_str += ' ';
489 }
490 }
491 }
492
493 if (debug_fix_space_level > 1) {
494 switch (mode) {
495 case 1:
496 tprintf("EXTRACTED (%d): \"", score);
497 break;
498 case 2:
499 tprintf("TESTED (%d): \"", score);
500 break;
501 case 3:
502 tprintf("RETURNED (%d): \"", score);
503 break;
504 }
505
506 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507 word_res_it.forward()) {
508 if (!word_res_it.data()->part_of_combo) {
509 tprintf("%s/%1d ",
510 word_res_it.data()->best_choice->unichar_string().string(),
511 static_cast<int>(word_res_it.data()->best_choice->permuter()));
512 }
513 }
514 tprintf("\"\n");
515 } else if (improved) {
516 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
517 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518 word_res_it.forward()) {
519 if (!word_res_it.data()->part_of_combo) {
520 tprintf("%s/%1d ",
521 word_res_it.data()->best_choice->unichar_string().string(),
522 static_cast<int>(word_res_it.data()->best_choice->permuter()));
523 }
524 }
525 tprintf("\"\n");
526 }
527 }
528}
529
// Body of bool fixspace_thinks_word_done(WERD_RES *word); the signature line
// is not present in this listing.
//
// Returns true if the word is already marked done. Otherwise returns true
// only when all of the following hold:
//   - fixsp_done_mode is above 0, and the word is tess_accepted, or the mode
//     is 2 with no rejected characters, or the mode is 3;
//   - the best choice text contains no space;
//   - the best choice came from the system, frequent or user dictionary, or
//     from the number permuter.
531 if (word->done)
532 return true;
533
534 /*
535 Use all the standard pass 2 conditions for mode 5 in set_done() in
536 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
537 CARE WHETHER WE HAVE of/at on/an etc.
538 */
539 if (fixsp_done_mode > 0 &&
540 (word->tess_accepted ||
541 (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
542 fixsp_done_mode == 3) &&
543 (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr) &&
544 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
545 (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
546 (word->best_choice->permuter() == USER_DAWG_PERM) ||
547 (word->best_choice->permuter() == NUMBER_PERM))) {
548 return true;
549 } else {
550 return false;
551 }
552}
553
554
562void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
563 BLOCK* block) {
564 WERD_RES *word_res;
565 WERD_RES_LIST sub_word_list;
566 WERD_RES_IT sub_word_list_it(&sub_word_list);
567 int16_t blob_index;
568 int16_t new_length;
569 float junk;
570
571 word_res = word_res_it.data();
572 if (word_res->word->flag(W_REP_CHAR) ||
573 word_res->combination ||
574 word_res->part_of_combo ||
575 !word_res->word->flag(W_DONT_CHOP))
576 return;
577
578 blob_index = worst_noise_blob(word_res, &junk);
579 if (blob_index < 0)
580 return;
581
582 if (debug_fix_space_level > 1) {
583 tprintf("FP fixspace working on \"%s\"\n",
584 word_res->best_choice->unichar_string().string());
585 }
586 word_res->word->rej_cblob_list()->sort(c_blob_comparator);
587 sub_word_list_it.add_after_stay_put(word_res_it.extract());
588 fix_noisy_space_list(sub_word_list, row, block);
589 new_length = sub_word_list.length();
590 word_res_it.add_list_before(&sub_word_list);
591 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592 word_res_it.forward();
593 }
594}
595
596void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
597 BLOCK* block) {
598 int16_t best_score;
599 WERD_RES_IT best_perm_it(&best_perm);
600 WERD_RES_LIST current_perm;
601 WERD_RES_IT current_perm_it(&current_perm);
602 WERD_RES *old_word_res;
603 int16_t current_score;
604 bool improved = false;
605
606 best_score = fp_eval_word_spacing(best_perm); // default score
607
608 dump_words(best_perm, best_score, 1, improved);
609
610 old_word_res = best_perm_it.data();
611 // Even deep_copy doesn't copy the underlying WERD unless its combination
612 // flag is true!.
613 old_word_res->combination = true; // Kludge to force deep copy
614 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
615 old_word_res->combination = false; // Undo kludge
616
617 break_noisiest_blob_word(current_perm);
618
619 while (best_score != PERFECT_WERDS && !current_perm.empty()) {
620 match_current_words(current_perm, row, block);
621 current_score = fp_eval_word_spacing(current_perm);
622 dump_words(current_perm, current_score, 2, improved);
623 if (current_score > best_score) {
624 best_perm.clear();
625 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
626 best_score = current_score;
627 improved = true;
628 }
629 if (current_score < PERFECT_WERDS) {
630 break_noisiest_blob_word(current_perm);
631 }
632 }
633 dump_words(best_perm, best_score, 3, improved);
634}
635
636
// Splits one word of the list in two at its noisiest blob.
//
// Every word is asked for its worst noise blob; the word whose blob has the
// lowest noise score is chosen. If no word offers a candidate, the list is
// cleared, which callers treat as the end of the search. Otherwise the
// blobs to the left of the noise blob move to a new word inserted before
// the original, the noise blob itself is deleted, and the original word
// keeps the remaining blobs with its results cleared.
642void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
643 WERD_RES_IT word_it(&words);
644 WERD_RES_IT worst_word_it;
645 float worst_noise_score = 9999;
646 int worst_blob_index = -1; // Noisiest blob of noisiest wd
647 int blob_index; // of wds noisiest blob
648 float noise_score; // of wds noisiest blob
649 WERD_RES *word_res;
650 C_BLOB_IT blob_it;
651 C_BLOB_IT rej_cblob_it;
652 C_BLOB_LIST new_blob_list;
653 C_BLOB_IT new_blob_it;
654 C_BLOB_IT new_rej_cblob_it;
655 WERD *new_word;
656 int16_t start_of_noise_blob;
657 int16_t i;
658
// Pick the word holding the lowest-scoring candidate blob. noise_score is
// only read when worst_noise_blob() returned a valid index.
659 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
660 blob_index = worst_noise_blob(word_it.data(), &noise_score);
661 if (blob_index > -1 && worst_noise_score > noise_score) {
662 worst_noise_score = noise_score;
663 worst_blob_index = blob_index;
664 worst_word_it = word_it;
665 }
666 }
667 if (worst_blob_index < 0) {
668 words.clear(); // signal termination
669 return;
670 }
671
672 /* Now split the worst_word_it */
673
674 word_res = worst_word_it.data();
675
676 /* Move blobs before noise blob to a new bloblist */
677
678 new_blob_it.set_to_list(&new_blob_list);
679 blob_it.set_to_list(word_res->word->cblob_list());
680 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681 new_blob_it.add_after_then_move(blob_it.extract());
682 }
// Remember where the noise blob started; it is the dividing line for the
// rejected blobs handled below.
683 start_of_noise_blob = blob_it.data()->bounding_box().left();
684 delete blob_it.extract(); // throw out noise blob
685
// The new (left-hand) word cannot end the line and the remaining
// (right-hand) word cannot start it.
686 new_word = new WERD(&new_blob_list, word_res->word);
687 new_word->set_flag(W_EOL, false);
688 word_res->word->set_flag(W_BOL, false);
689 word_res->word->set_blanks(1); // After break
690
// Rejected blobs lying left of the noise blob follow the new word. The loop
// stops at the first one that is not to the left, so it relies on the list
// being ordered by left edge.
691 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
692 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
693 for (;
694 (!rej_cblob_it.empty() &&
695 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696 rej_cblob_it.forward()) {
697 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
698 }
699
// NOTE(review): combination is set on the new word presumably so that
// WERD_RES::deep_copy duplicates its WERD (see the kludge comment in
// fix_noisy_space_list) - confirm.
700 auto* new_word_res = new WERD_RES(new_word);
701 new_word_res->combination = true;
702 worst_word_it.add_before_then_move(new_word_res);
703
704 word_res->ClearResults();
705}
706
// Remainder of int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
// float *worst_noise_score); the first line of the signature is not present
// in this listing.
//
// Looks for the blob in the word that is most likely to be noise. Returns
// -1 when the word has no rebuild_word, has fewer than 5 blobs, or does not
// have fixsp_non_noise_limit non-noise blobs on each side of the candidate
// range. *worst_noise_score is only written once the candidate range has
// been established.
708 float *worst_noise_score) {
709 float noise_score[512];
710 int i;
711 int min_noise_blob; // 1st contender
712 int max_noise_blob; // last contender
713 int non_noise_count;
714 int worst_noise_blob; // Worst blob
715 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
716 float non_noise_limit = kBlnXHeight * 0.8;
717
718 if (word_res->rebuild_word == nullptr)
719 return -1; // Can't handle cube words.
720
721 // Normalised.
722 int blob_count = word_res->box_word->length();
723 ASSERT_HOST(blob_count <= 512);
724 if (blob_count < 5)
725 return -1; // too short to split
726
727 /* Get the noise scores for all blobs */
728
729 #ifndef SECURE_NAMES
730 if (debug_fix_space_level > 5)
731 tprintf("FP fixspace Noise metrics for \"%s\": ",
732 word_res->best_choice->unichar_string().string());
733 #endif
734
// Accepted characters are pinned at the non-noise limit so that they can
// never be chosen; the rest are scored from their outlines.
// NOTE(review): this loop stops at NumBlobs() if that is smaller than
// blob_count, but the loops below read noise_score[] up to blob_count - 1.
// If the two counts can differ, those reads hit uninitialised entries -
// confirm.
735 for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
736 TBLOB* blob = word_res->rebuild_word->blobs[i];
737 if (word_res->reject_map[i].accepted())
738 noise_score[i] = non_noise_limit;
739 else
740 noise_score[i] = blob_noise_score(blob);
741
742 if (debug_fix_space_level > 5)
743 tprintf("%1.1f ", noise_score[i]);
744 }
745 if (debug_fix_space_level > 5)
746 tprintf("\n");
747
748 /* Now find the worst one which is far enough away from the end of the word */
749
// Scan from the left until fixsp_non_noise_limit non-noise blobs have been
// passed; i then marks the first contender.
750 non_noise_count = 0;
751 for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
752 if (noise_score[i] >= non_noise_limit) {
753 non_noise_count++;
754 }
755 }
756 if (non_noise_count < fixsp_non_noise_limit)
757 return -1;
758
759 min_noise_blob = i;
760
// Same scan from the right; i then marks the last contender.
761 non_noise_count = 0;
762 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
763 i--) {
764 if (noise_score[i] >= non_noise_limit) {
765 non_noise_count++;
766 }
767 }
768 if (non_noise_count < fixsp_non_noise_limit)
769 return -1;
770
771 max_noise_blob = i;
772
773 if (min_noise_blob > max_noise_blob)
774 return -1;
775
// Only blobs scoring below small_limit qualify; the lowest score wins.
776 *worst_noise_score = small_limit;
777 worst_noise_blob = -1;
778 for (i = min_noise_blob; i <= max_noise_blob; i++) {
779 if (noise_score[i] < *worst_noise_score) {
// NOTE(review): this listing jumps from line 779 to 781; the missing line
// presumably records i as the worst blob, since nothing else shown here
// ever moves worst_noise_blob off -1 - restore from upstream.
781 *worst_noise_score = noise_score[i];
782 }
783 }
784 return worst_noise_blob;
785}
786
// Body of float Tesseract::blob_noise_score(TBLOB *blob); the signature line
// is not present in this listing.
//
// Scores a blob by the larger bounding-box dimension of its largest
// outline; a lower score means the blob looks more like noise. The score is
// doubled when the blob has more than 5 outlines, and halved when the whole
// blob sits above 4 * kBlnBaselineOffset or below kBlnBaselineOffset / 2.
// The arithmetic is done in int16_t and converted on return.
788 TBOX box; // BB of outline
789 int16_t outline_count = 0;
790 int16_t max_dimension;
791 int16_t largest_outline_dimension = 0;
792
793 for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
794 outline_count++;
795 box = ol->bounding_box();
796 if (box.height() > box.width()) {
797 max_dimension = box.height();
798 } else {
799 max_dimension = box.width();
800 }
801
802 if (largest_outline_dimension < max_dimension)
803 largest_outline_dimension = max_dimension;
804 }
805
806 if (outline_count > 5) {
807 // penalise LOTS of blobs
808 largest_outline_dimension *= 2;
809 }
810
811 box = blob->bounding_box();
812 if (box.bottom() > kBlnBaselineOffset * 4 ||
813 box.top() < kBlnBaselineOffset / 2) {
814 // Lax blob is if high or low
815 largest_outline_dimension /= 2;
816 }
817
818 return largest_outline_dimension;
819}
820} // namespace tesseract
821
// Body of void fixspace_dbg(WERD_RES *word); the signature line is not
// present in this listing.
//
// Debug dump of one word: bounding box, best-choice text, blob counts
// (WERD blob list, rebuild word, box word) and the tess_accepted / done
// flags. The per-character section is compiled in but disabled by
// show_map_detail being false.
// NOTE(review): this listing also skips lines 833 and 839, so one statement
// in the main body and one inside the per-character loop are not shown.
823 TBOX box = word->word->bounding_box();
824 const bool show_map_detail = false;
825 int16_t i;
826
827 box.print();
828 tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
829 tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
830 word->word->cblob_list()->length(),
831 word->rebuild_word->NumBlobs(),
832 word->box_word->length());
834 tprintf("\n");
835 if (show_map_detail) {
836 tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
837 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
838 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
840 }
841 }
842
843 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
844 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
845}
846
847
856namespace tesseract {
// Scores a word list for the noise-splitting search.
//
// Words without a rebuild_word are ignored. For each word that passes the
// acceptance test below, every character (up to the number of rebuilt
// blobs) contributes -1 if it is a space or its blob scores below
// small_limit, +1 if its reject-map entry is accepted, and 0 otherwise.
//
// @return the summed score, clamped so that it is never negative
857int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
858 WERD_RES_IT word_it(&word_res_list);
859 WERD_RES *word;
860 int16_t score = 0;
861 int16_t i;
862 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
863
864 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865 word = word_it.data();
866 if (word->rebuild_word == nullptr)
867 continue; // Can't handle cube words.
// NOTE(review): this listing jumps from line 869 to 871, so one operand of
// this condition is not shown here - restore from upstream.
868 if (word->done ||
869 word->tess_accepted ||
871 word->best_choice->permuter() == FREQ_DAWG_PERM ||
872 word->best_choice->permuter() == USER_DAWG_PERM ||
873 safe_dict_word(word) > 0) {
874 int num_blobs = word->rebuild_word->NumBlobs();
875 UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
876 for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
877 TBLOB* blob = word->rebuild_word->blobs[i];
878 if (word->best_choice->unichar_id(i) == space ||
879 blob_noise_score(blob) < small_limit) {
880 score -= 1; // penalise possibly erroneous non-space
881 } else if (word->reject_map[i].accepted()) {
882 score++;
883 }
884 }
885 }
886 }
887 if (score < 0)
888 score = 0;
889 return score;
890}
891
892} // namespace tesseract
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:822
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:399
#define PERFECT_WERDS
Definition: fixspace.cpp:44
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:204
FILE * debug_fp
Definition: tessvars.cpp:24
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NUMBER_PERM
Definition: ratngs.h:239
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_EOL
end of line
Definition: werd.h:33
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_REP_CHAR
repeated character
Definition: werd.h:38
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:37
@ W_BOL
start of line
Definition: werd.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:530
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1319
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:596
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:707
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:562
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:787
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:172
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:476
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:370
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:642
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:857
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:266
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:223
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:75
TESSLINE * next
Definition: blobs.h:281
Definition: blobs.h:284
TESSLINE * outlines
Definition: blobs.h:400
TBOX bounding_box() const
Definition: blobs.cpp:468
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int length() const
Definition: boxword.h:83
Definition: ocrblock.h:31
Definition: ocrrow.h:37
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
const UNICHARSET * uch_set
Definition: pageres.h:203
TWERD * rebuild_word
Definition: pageres.h:266
bool done
Definition: pageres.h:305
tesseract::BoxWord * box_word
Definition: pageres.h:272
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
bool combination
Definition: pageres.h:339
WERD_CHOICE * best_choice
Definition: pageres.h:241
float x_height
Definition: pageres.h:316
bool part_of_combo
Definition: pageres.h:340
void ClearResults()
Definition: pageres.cpp:1104
bool tess_failed
Definition: pageres.h:295
bool tess_accepted
Definition: pageres.h:303
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
REJMAP reject_map
Definition: pageres.h:294
WERD * word
Definition: pageres.h:186
const STRING & unichar_string() const
Definition: ratngs.h:531
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
int length() const
Definition: ratngs.h:293
const STRING & unichar_lengths() const
Definition: ratngs.h:538
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
void print(FILE *fp)
Definition: rejctmap.cpp:321
int16_t reject_count()
Definition: rejctmap.h:229
void full_print(FILE *fp)
Definition: rejctmap.cpp:333
int32_t length() const
Definition: rejctmap.h:223
TBOX bounding_box() const
Definition: stepblob.cpp:253
Definition: werd.h:56
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void set_blanks(uint8_t new_blanks)
Definition: werd.h:102
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
TBOX bounding_box() const
Definition: werd.cpp:148
void join_on(WERD *other)
Definition: werd.cpp:199
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
bool deadline_exceeded() const
Definition: ocrclass.h:138
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
Definition: strngs.h:45
bool contains(char c) const
Definition: strngs.cpp:185
const char * string() const
Definition: strngs.cpp:194
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476