tesseract 4.1.1
Loading...
Searching...
No Matches
control.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: control.cpp (Formerly control.c)
3 * Description: Module-independent matcher controller.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19// Include automatically generated configuration file if running autoconf.
20#ifdef HAVE_CONFIG_H
21#include "config_auto.h"
22#endif
23
24#include <cmath>
25#include <cstdint> // for int16_t, int32_t
26#include <cstdio> // for fclose, fopen, FILE
27#include <ctime> // for clock
28#include <cctype>
29#include "callcpp.h"
30#include "control.h"
31#ifndef DISABLED_LEGACY_ENGINE
32#include "docqual.h"
33#include "drawfx.h"
34#include "fixspace.h"
35#endif
36#include "lstmrecognizer.h"
37#include "ocrclass.h"
38#include "output.h"
39#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
40#ifndef DISABLED_LEGACY_ENGINE
41#include "reject.h"
42#endif
43#include "sorthelper.h"
44#include "tesseractclass.h"
45#include "tessvars.h"
46#include "werdit.h"
47
48const char* const kBackUpConfigFile = "tempconfigdata.config";
49// Min believable x-height for any text when refitting as a fraction of
50// original x-height
51const double kMinRefitXHeightFraction = 0.5;
52
53
60namespace tesseract {
61
63 TBOX &selection_box) {
64 PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
65 if (it != nullptr) {
68 delete it;
69 }
70}
71
78 int16_t char_qual;
79 int16_t good_char_qual;
80
81 WordData word_data(*pr_it);
82 SetupWordPassN(2, &word_data);
83 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
84 if (lstm_recognizer_ == nullptr) {
85#ifndef DISABLED_LEGACY_ENGINE
86 classify_word_and_language(2, pr_it, &word_data);
87#endif // ndef DISABLED_LEGACY_ENGINE
88 } else {
89 classify_word_and_language(1, pr_it, &word_data);
90 }
91#ifndef DISABLED_LEGACY_ENGINE
93 WERD_RES* word_res = pr_it->word();
94 word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
95 tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96 "char_quality: %d; good_char_quality: %d\n",
97 word_res->reject_map.length(),
98 word_blob_quality(word_res, pr_it->row()->row),
99 word_outline_errs(word_res), char_qual, good_char_qual);
100 }
101#endif // ndef DISABLED_LEGACY_ENGINE
102 return true;
103}
104
105// Helper function to check for a target word and handle it appropriately.
106// Inspired by Jetsoft's requirement to process only single words on pass2
107// and beyond.
108// If word_config is not null:
109// If the word_box and target_word_box overlap, read the word_config file
110// else reset to previous config data.
111// return true.
112// else
113// If the word_box and target_word_box overlap or pass <= 1, return true.
114// Note that this function uses a fixed temporary file for storing the previous
115// configs, so it is neither thread-safe, nor process-safe, but the assumption
116// is that it will only be used for one debug window at a time.
117//
118// Since this function is used for debugging (and not to change OCR results)
119// set only debug params from the word config file.
121 const TBOX& target_word_box,
122 const char* word_config,
123 int pass) {
124 if (word_config != nullptr) {
125 if (word_box.major_overlap(target_word_box)) {
126 if (backup_config_file_ == nullptr) {
127 backup_config_file_ = kBackUpConfigFile;
128 FILE* config_fp = fopen(backup_config_file_, "wb");
129 if (config_fp == nullptr) {
130 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
131 } else {
132 ParamUtils::PrintParams(config_fp, params());
133 fclose(config_fp);
134 }
135 ParamUtils::ReadParamsFile(word_config,
137 params());
138 }
139 } else {
140 if (backup_config_file_ != nullptr) {
141 ParamUtils::ReadParamsFile(backup_config_file_,
143 params());
144 backup_config_file_ = nullptr;
145 }
146 }
147 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
148 return false;
149 }
150 return true;
151}
152
155 const TBOX* target_word_box,
156 const char* word_config,
157 PAGE_RES* page_res,
159 // Prepare all the words.
160 PAGE_RES_IT page_res_it(page_res);
161 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
162 page_res_it.forward()) {
163 if (target_word_box == nullptr ||
164 ProcessTargetWord(page_res_it.word()->word->bounding_box(),
165 *target_word_box, word_config, 1)) {
166 words->push_back(WordData(page_res_it));
167 }
168 }
169 // Setup all the words for recognition with polygonal approximation.
170 for (int w = 0; w < words->size(); ++w) {
171 SetupWordPassN(pass_n, &(*words)[w]);
172 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
173 }
174}
175
176// Sets up the single word ready for whichever engine is to be run.
// pass_n is the number of the recognition pass being prepared and word is
// the per-word bundle to (re)initialise. Work is done on pass 1
// unconditionally, and on any other pass only while word->word->done is
// false. word->lang_words is emptied and refilled with one freshly allocated
// WERD_RES per entry of sub_langs_, followed by one for this instance.
177void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
178 if (pass_n == 1 || !word->word->done) {
179 if (pass_n == 1) {
185 word->row, word->block);
186 } else if (pass_n == 2) {
187 // TODO(rays) Should we do this on pass1 too?
// Pass 2 only: clear the caps height and, when the word has no x-height of
// its own (exactly 0), fall back to the x-height of its row.
188 word->word->caps_height = 0.0;
189 if (word->word->x_height == 0.0f)
190 word->word->x_height = word->row->x_height();
191 }
// Start from an empty per-language result list; note the loop bound is
// inclusive so that the extra final iteration handles `this`.
192 word->lang_words.truncate(0);
193 for (int s = 0; s <= sub_langs_.size(); ++s) {
194 // The sub_langs_.size() entry is for the master language.
195 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
// The new WERD_RES is handed to lang_words straight after initialisation.
// NOTE(review): presumably lang_words owns and frees it - confirm against
// the WordData declaration.
196 auto* word_res = new WERD_RES;
197 word_res->InitForRetryRecognition(*word->word);
198 word->lang_words.push_back(word_res);
199 // LSTM doesn't get setup for pass2.
// i.e. SetupForRecognition is skipped only when this is not pass 1 and the
// language's engine mode is OEM_LSTM_ONLY.
200 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
201 word_res->SetupForRecognition(
202 lang_t->unicharset, lang_t, BestPix(),
203 lang_t->tessedit_ocr_engine_mode, nullptr,
206 lang_t->poly_allow_detailed_fx, word->row, word->block);
207 }
208 }
209 }
210}
211
212// Runs word recognition on all the words.
// Walks `words` in order, keeping pr_it in step with each entry, and calls
// classify_word_and_language on every word. Returns false as soon as the
// monitor reports an exceeded deadline or its cancel callback returns true
// (the remaining words are given fake results first); returns true once the
// whole list has been processed.
214 PAGE_RES_IT* pr_it,
216 // TODO(rays) Before this loop can be parallelized (it would yield a massive
217 // speed-up) all remaining member globals need to be converted to local/heap
218 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
219 // added. The results will be significantly different with adaption on, and
220 // deterioration will need investigation.
221 pr_it->restart_page();
222 for (int w = 0; w < words->size(); ++w) {
223 WordData* word = &(*words)[w];
224 if (w > 0) word->prev_word = &(*words)[w - 1];
225 if (monitor != nullptr) {
226 monitor->ocr_alive = true;
// Progress is reported as 0..70 during pass 1 and 70..100 on other passes,
// using integer arithmetic on the word index.
227 if (pass_n == 1) {
228 monitor->progress = 70 * w / words->size();
229 } else {
230 monitor->progress = 70 + 30 * w / words->size();
231 }
232 if (monitor->progress_callback2 != nullptr) {
// NOTE(review): this box comes from pr_it's current word, which is only
// synchronised with words[w] further down - confirm that reporting the
// iterator's position rather than words[w] is intended.
233 TBOX box = pr_it->word()->word->bounding_box();
234 (*monitor->progress_callback2)(monitor, box.left(),
235 box.right(), box.top(), box.bottom());
236 }
237 if (monitor->deadline_exceeded() ||
238 (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
239 words->size()))) {
240 // Timeout. Fake out the rest of the words.
241 for (; w < words->size(); ++w) {
242 (*words)[w].word->SetupFake(unicharset);
243 }
244 return false;
245 }
246 }
247 if (word->word->tess_failed) {
248 int s;
// Find the first language result that has not failed; s ends up equal to
// lang_words.size() when every one of them failed.
249 for (s = 0; s < word->lang_words.size() &&
250 word->lang_words[s]->tess_failed; ++s) {}
251 // If all are failed, skip it. Image words are skipped by this test.
// NOTE(review): the loop above stops with s <= lang_words.size(), so the
// `>` comparison below can never be true and the `continue` is unreachable.
// The comment above suggests `==` was intended - confirm before changing,
// since enabling the skip alters which words reach the classifier.
252 if (s > word->lang_words.size()) continue;
253 }
254 // Sync pr_it with the wth WordData.
255 while (pr_it->word() != nullptr && pr_it->word() != word->word)
256 pr_it->forward();
257 ASSERT_HOST(pr_it->word() != nullptr);
258 bool make_next_word_fuzzy = false;
// Legacy engine only: when no LSTM language is loaded, try to pull noise
// outlines back into the word before classifying it.
259 #ifndef DISABLED_LEGACY_ENGINE
260 if (!AnyLSTMLang() &&
261 ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
262 // Needs to be setup again to see the new outlines in the chopped_word.
263 SetupWordPassN(pass_n, word);
264 }
265 #endif // ndef DISABLED_LEGACY_ENGINE
266
267 classify_word_and_language(pass_n, pr_it, word);
269 tprintf("Pass%d: %s [%s]\n", pass_n,
271 word->word->best_choice->debug_string().string());
272 }
// Step past the word just classified; the fuzzy flag requested by
// ReassignDiacritics applies to the word that follows it, if any.
273 pr_it->forward();
274 if (make_next_word_fuzzy && pr_it->word() != nullptr) {
275 pr_it->MakeCurrentWordFuzzy();
276 }
277 }
278 return true;
279}
280
303 ETEXT_DESC* monitor,
304 const TBOX* target_word_box,
305 const char* word_config,
306 int dopasses) {
307 PAGE_RES_IT page_res_it(page_res);
308
310 tessedit_test_adaption.set_value (true);
311 tessedit_minimal_rejection.set_value (true);
312 }
313
314 if (dopasses==0 || dopasses==1) {
315 page_res_it.restart_page();
316 // ****************** Pass 1 *******************
317
318 #ifndef DISABLED_LEGACY_ENGINE
319 // If the adaptive classifier is full switch to one we prepared earlier,
320 // ie on the previous page. If the current adaptive classifier is non-empty,
321 // prepare a backup starting at this page, in case it fills up. Do all this
322 // independently for each language.
325 } else if (!AdaptiveClassifierIsEmpty()) {
327 }
328 // Now check the sub-langs as well.
329 for (int i = 0; i < sub_langs_.size(); ++i) {
330 if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
331 sub_langs_[i]->SwitchAdaptiveClassifier();
332 } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
333 sub_langs_[i]->StartBackupAdaptiveClassifier();
334 }
335 }
336
337 #endif // ndef DISABLED_LEGACY_ENGINE
338
339 // Set up all words ready for recognition, so that if parallelism is on
340 // all the input and output classes are ready to run the classifier.
342 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
343 #ifndef DISABLED_LEGACY_ENGINE
345 PrerecAllWordsPar(words);
346 }
347 #endif // ndef DISABLED_LEGACY_ENGINE
348
349 stats_.word_count = words.size();
350
351 stats_.dict_words = 0;
352 stats_.doc_blob_quality = 0;
353 stats_.doc_outline_errs = 0;
354 stats_.doc_char_quality = 0;
355 stats_.good_char_count = 0;
356 stats_.doc_good_char_quality = 0;
357
358 most_recently_used_ = this;
359 // Run pass 1 word recognition.
360 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
361 // Pass 1 post-processing.
362 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
363 page_res_it.forward()) {
364 if (page_res_it.word()->word->flag(W_REP_CHAR)) {
365 fix_rep_char(&page_res_it);
366 continue;
367 }
368
369 // Count dict words.
370 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
371 ++(stats_.dict_words);
372
373 // Update misadaption log (we only need to do it on pass 1, since
374 // adaption only happens on this pass).
375 if (page_res_it.word()->blamer_bundle != nullptr &&
376 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
377 page_res->misadaption_log.push_back(
378 page_res_it.word()->blamer_bundle->misadaption_debug());
379 }
380 }
381 }
382
383 if (dopasses == 1) return true;
384
385 #ifndef DISABLED_LEGACY_ENGINE
386
387 // ****************** Pass 2 *******************
389 AnyTessLang()) {
390 page_res_it.restart_page();
392 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
394 PrerecAllWordsPar(words);
395 }
396 most_recently_used_ = this;
397 // Run pass 2 word recognition.
398 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
399 }
400
401 // The next passes are only required for Tess-only.
402 if (AnyTessLang() && !AnyLSTMLang()) {
403 // ****************** Pass 3 *******************
404 // Fix fuzzy spaces.
406
409 fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
410
411 // ****************** Pass 4 *******************
414
415 // ****************** Pass 5,6 *******************
416 rejection_passes(page_res, monitor, target_word_box, word_config);
417
418 // ****************** Pass 8 *******************
419 font_recognition_pass(page_res);
420
421 // ****************** Pass 9 *******************
422 // Check the correctness of the final results.
423 blamer_pass(page_res);
424 script_pos_pass(page_res);
425 }
426
427 #endif // ndef DISABLED_LEGACY_ENGINE
428
429 // Write results pass.
431 // This is now redundant, but retained commented so show how to obtain
432 // bounding boxes and style information.
433
434 #ifndef DISABLED_LEGACY_ENGINE
435 // changed by jetsoft
436 // needed for dll to output memory structure
437 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
438 output_pass(page_res_it, target_word_box);
439 // end jetsoft
440 #endif //ndef DISABLED_LEGACY_ENGINE
441
442 const auto pageseg_mode = static_cast<PageSegMode>(
443 static_cast<int>(tessedit_pageseg_mode));
444 textord_.CleanupSingleRowResult(pageseg_mode, page_res);
445
446 // Remove empty words, as these mess up the result iterators.
447 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
448 page_res_it.forward()) {
449 const WERD_RES* word = page_res_it.word();
450 const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
451 ? page_res_it.block()->block->pdblk.poly_block()
452 : nullptr;
453 if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
454 (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
455 page_res_it.DeleteCurrentWord();
456 }
457 }
458
459 if (monitor != nullptr) {
460 monitor->progress = 100;
461 }
462 return true;
463}
464
465#ifndef DISABLED_LEGACY_ENGINE
466
468 PAGE_RES_IT word_it(page_res);
469
470 WERD_RES *w_prev = nullptr;
471 WERD_RES *w = word_it.word();
472 while (true) {
473 w_prev = w;
474 while (word_it.forward() != nullptr &&
475 (!word_it.word() || word_it.word()->part_of_combo)) {
476 // advance word_it, skipping over parts of combos
477 }
478 if (!word_it.word()) break;
479 w = word_it.word();
480 if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
481 continue;
482 }
483 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
485 tprintf("Skipping because one of the words is W_REP_CHAR\n");
486 }
487 continue;
488 }
489 // Two words sharing the same language model, excellent!
490 GenericVector<WERD_CHOICE *> overrides_word1;
491 GenericVector<WERD_CHOICE *> overrides_word2;
492
493 const STRING orig_w1_str = w_prev->best_choice->unichar_string();
494 const STRING orig_w2_str = w->best_choice->unichar_string();
495 WERD_CHOICE prev_best(w->uch_set);
496 {
497 int w1start, w1end;
498 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
499 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
500 }
501 WERD_CHOICE this_best(w->uch_set);
502 {
503 int w2start, w2end;
504 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
505 this_best = w->best_choice->shallow_copy(w2start, w2end);
506 }
507
508 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
510 tprintf("Top choice \"%s %s\" verified by bigram model.\n",
511 orig_w1_str.string(), orig_w2_str.string());
512 }
513 continue;
514 }
515 if (tessedit_bigram_debug > 2) {
516 tprintf("Examining alt choices for \"%s %s\".\n",
517 orig_w1_str.string(), orig_w2_str.string());
518 }
519 if (tessedit_bigram_debug > 1) {
520 if (!w_prev->best_choices.singleton()) {
521 w_prev->PrintBestChoices();
522 }
523 if (!w->best_choices.singleton()) {
524 w->PrintBestChoices();
525 }
526 }
527 float best_rating = 0.0;
528 int best_idx = 0;
529 WERD_CHOICE_IT prev_it(&w_prev->best_choices);
530 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
531 WERD_CHOICE *p1 = prev_it.data();
532 WERD_CHOICE strip1(w->uch_set);
533 {
534 int p1start, p1end;
535 p1->GetNonSuperscriptSpan(&p1start, &p1end);
536 strip1 = p1->shallow_copy(p1start, p1end);
537 }
538 WERD_CHOICE_IT w_it(&w->best_choices);
539 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
540 WERD_CHOICE *p2 = w_it.data();
541 WERD_CHOICE strip2(w->uch_set);
542 {
543 int p2start, p2end;
544 p2->GetNonSuperscriptSpan(&p2start, &p2end);
545 strip2 = p2->shallow_copy(p2start, p2end);
546 }
547 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
548 overrides_word1.push_back(p1);
549 overrides_word2.push_back(p2);
550 if (overrides_word1.size() == 1 ||
551 p1->rating() + p2->rating() < best_rating) {
552 best_rating = p1->rating() + p2->rating();
553 best_idx = overrides_word1.size() - 1;
554 }
555 }
556 }
557 }
558 if (!overrides_word1.empty()) {
559 // Excellent, we have some bigram matches.
561 *overrides_word1[best_idx]) &&
563 *overrides_word2[best_idx])) {
564 if (tessedit_bigram_debug > 1) {
565 tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
566 "model.\n", orig_w1_str.string(), orig_w2_str.string());
567 }
568 continue;
569 }
570 const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571 const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572 if (new_w1_str != orig_w1_str) {
573 w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
574 }
575 if (new_w2_str != orig_w2_str) {
576 w->ReplaceBestChoice(overrides_word2[best_idx]);
577 }
578 if (tessedit_bigram_debug > 0) {
579 STRING choices_description;
580 int num_bigram_choices
581 = overrides_word1.size() * overrides_word2.size();
582 if (num_bigram_choices == 1) {
583 choices_description = "This was the unique bigram choice.";
584 } else {
585 if (tessedit_bigram_debug > 1) {
586 STRING bigrams_list;
587 const int kMaxChoicesToPrint = 20;
588 for (int i = 0; i < overrides_word1.size() &&
589 i < kMaxChoicesToPrint; i++) {
590 if (i > 0) { bigrams_list += ", "; }
591 WERD_CHOICE *p1 = overrides_word1[i];
592 WERD_CHOICE *p2 = overrides_word2[i];
593 bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
594 }
595 choices_description = "There were many choices: {";
596 choices_description += bigrams_list;
597 choices_description += "}";
598 } else {
599 choices_description.add_str_int("There were ", num_bigram_choices);
600 choices_description += " compatible bigrams.";
601 }
602 }
603 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
604 orig_w1_str.string(), orig_w2_str.string(),
605 new_w1_str.string(), new_w2_str.string(),
606 choices_description.string());
607 }
608 }
609 }
610}
611
613 ETEXT_DESC* monitor,
614 const TBOX* target_word_box,
615 const char* word_config) {
616 PAGE_RES_IT page_res_it(page_res);
617 // ****************** Pass 5 *******************
618 // Gather statistics on rejects.
619 int word_index = 0;
620 while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
622 WERD_RES* word = page_res_it.word();
623 word_index++;
624 if (monitor != nullptr) {
625 monitor->ocr_alive = true;
626 monitor->progress = 95 + 5 * word_index / stats_.word_count;
627 }
628 if (word->rebuild_word == nullptr) {
629 // Word was not processed by tesseract.
630 page_res_it.forward();
631 continue;
632 }
633 check_debug_pt(word, 70);
634
635 // changed by jetsoft
636 // specific to its needs to extract one word when need
637 if (target_word_box &&
639 *target_word_box, word_config, 4)) {
640 page_res_it.forward();
641 continue;
642 }
643 // end jetsoft
644
645 page_res_it.rej_stat_word();
646 const int chars_in_word = word->reject_map.length();
647 const int rejects_in_word = word->reject_map.reject_count();
648
649 const int blob_quality = word_blob_quality(word, page_res_it.row()->row);
650 stats_.doc_blob_quality += blob_quality;
651 const int outline_errs = word_outline_errs(word);
652 stats_.doc_outline_errs += outline_errs;
653 int16_t all_char_quality;
654 int16_t accepted_all_char_quality;
655 word_char_quality(word, page_res_it.row()->row,
656 &all_char_quality, &accepted_all_char_quality);
657 stats_.doc_char_quality += all_char_quality;
658 const uint8_t permuter_type = word->best_choice->permuter();
659 if ((permuter_type == SYSTEM_DAWG_PERM) ||
660 (permuter_type == FREQ_DAWG_PERM) ||
661 (permuter_type == USER_DAWG_PERM)) {
662 stats_.good_char_count += chars_in_word - rejects_in_word;
663 stats_.doc_good_char_quality += accepted_all_char_quality;
664 }
665 check_debug_pt(word, 80);
667 (blob_quality == 0) && (outline_errs >= chars_in_word))
669 check_debug_pt(word, 90);
670 page_res_it.forward();
671 }
672
674 tprintf
675 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
676 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
677 page_res->char_count, page_res->rej_count,
678 page_res->rej_count / static_cast<float>(page_res->char_count),
679 stats_.doc_blob_quality,
680 stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
681 stats_.doc_outline_errs,
682 stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
683 stats_.doc_char_quality,
684 stats_.doc_char_quality / static_cast<float>(page_res->char_count),
686 (stats_.good_char_count > 0) ?
687 (stats_.doc_good_char_quality /
688 static_cast<float>(stats_.good_char_count)) : 0.0);
689 }
690 bool good_quality_doc =
691 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
693 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
695 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
697 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
699
700 // ****************** Pass 6 *******************
701 // Do whole document or whole block rejection pass
704 quality_based_rejection(page_res_it, good_quality_doc);
705 }
706}
707
708#endif // ndef DISABLED_LEGACY_ENGINE
709
711 if (!wordrec_run_blamer) return;
712 PAGE_RES_IT page_res_it(page_res);
713 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
714 page_res_it.forward()) {
715 WERD_RES *word = page_res_it.word();
718 }
719 tprintf("Blame reasons:\n");
720 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
722 static_cast<IncorrectResultReason>(bl)),
723 page_res->blame_reasons[bl]);
724 }
725 if (page_res->misadaption_log.length() > 0) {
726 tprintf("Misadaption log:\n");
727 for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
728 tprintf("%s\n", page_res->misadaption_log[i].string());
729 }
730 }
731}
732
733// Sets script positions and detects smallcaps on all output words.
// Iterates every word of page_res. Words flagged W_REP_CHAR are left
// untouched; for all others small_caps may be set and SetScriptPositions()
// is called.
735 PAGE_RES_IT page_res_it(page_res);
736 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
737 page_res_it.forward()) {
738 WERD_RES* word = page_res_it.word();
739 if (word->word->flag(W_REP_CHAR)) {
// NOTE(review): `continue` still runs the for-loop's own forward(), so the
// iterator advances twice here and the word following a W_REP_CHAR word is
// never processed. If the W_REP_CHAR word is the last one, the second
// forward() is issued on an iterator whose word() is already nullptr -
// confirm PAGE_RES_IT::forward() tolerates that. Dropping the explicit
// forward() below looks like the intended behaviour.
740 page_res_it.forward();
741 continue;
742 }
743 const float x_height = page_res_it.block()->block->x_height();
// Use the word's own x-height unless it lies outside the range the best
// choice allows, in which case take the midpoint of that range.
744 float word_x_height = word->x_height;
745 if (word_x_height < word->best_choice->min_x_height() ||
746 word_x_height > word->best_choice->max_x_height()) {
747 word_x_height = (word->best_choice->min_x_height() +
748 word->best_choice->max_x_height()) / 2.0f;
749 }
750 // Test for small caps. Word capheight must be close to block xheight,
751 // and word must contain no lower case letters, and at least one upper case.
// The accepted window is small_cap_xheight +/- half the difference between
// the block x-height and small_cap_xheight.
752 const double small_cap_xheight = x_height * kXHeightCapRatio;
753 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
754 if (word->uch_set->script_has_xheight() &&
755 small_cap_xheight - small_cap_delta <= word_x_height &&
756 word_x_height <= small_cap_xheight + small_cap_delta) {
757 // Scan for upper/lower.
758 int num_upper = 0;
759 int num_lower = 0;
760 for (int i = 0; i < word->best_choice->length(); ++i) {
761 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
762 ++num_upper;
763 else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
764 ++num_lower;
765 }
766 if (num_upper > 0 && num_lower == 0)
767 word->small_caps = true;
768 }
769 word->SetScriptPositions();
770 }
771}
772
773// Helper finds the gap between the index word and the next.
774static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
775 int* next_left) {
776 *right = -INT32_MAX;
777 *next_left = INT32_MAX;
778 if (index < words.size()) {
779 *right = words[index]->word->bounding_box().right();
780 if (index + 1 < words.size())
781 *next_left = words[index + 1]->word->bounding_box().left();
782 }
783}
784
785// Factored helper computes the rating, certainty, badness and validity of
786// the permuter of the words in [first_index, end_index).
787static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
788 int first_index, int end_index, float* rating,
789 float* certainty, bool* bad,
790 bool* valid_permuter) {
791 if (end_index <= first_index) {
792 *bad = true;
793 *valid_permuter = false;
794 }
795 for (int index = first_index; index < end_index && index < words.size();
796 ++index) {
797 WERD_CHOICE* choice = words[index]->best_choice;
798 if (choice == nullptr) {
799 *bad = true;
800 } else {
801 *rating += choice->rating();
802 *certainty = std::min(*certainty, choice->certainty());
803 if (!Dict::valid_word_permuter(choice->permuter(), false))
804 *valid_permuter = false;
805 }
806 }
807}
808
809// Helper chooses the best combination of words, transferring good ones from
810// new_words to best_words. To win, a new word must have (better rating and
811// certainty) or (better permuter status and rating within rating ratio and
812// certainty within certainty margin) than current best.
813// All the new_words are consumed (moved to best_words or deleted.)
814// The return value is the number of new_words used minus the number of
815// best_words that remain in the output.
816static int SelectBestWords(double rating_ratio,
817 double certainty_margin,
818 bool debug,
819 PointerVector<WERD_RES>* new_words,
820 PointerVector<WERD_RES>* best_words) {
821 // Process the smallest groups of words that have an overlapping word
822 // boundary at the end.
823 GenericVector<WERD_RES*> out_words;
824 // Index into each word vector (best, new).
825 int b = 0, n = 0;
826 int num_best = 0, num_new = 0;
827 while (b < best_words->size() || n < new_words->size()) {
828 // Start of the current run in each.
829 int start_b = b, start_n = n;
830 while (b < best_words->size() || n < new_words->size()) {
831 int b_right = -INT32_MAX;
832 int next_b_left = INT32_MAX;
833 WordGap(*best_words, b, &b_right, &next_b_left);
834 int n_right = -INT32_MAX;
835 int next_n_left = INT32_MAX;
836 WordGap(*new_words, n, &n_right, &next_n_left);
837 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
838 // The word breaks overlap. [start_b,b] and [start_n, n] match.
839 break;
840 }
841 // Keep searching for the matching word break.
842 if ((b_right < n_right && b < best_words->size()) ||
843 n == new_words->size())
844 ++b;
845 else
846 ++n;
847 }
848 // Rating of the current run in each.
849 float b_rating = 0.0f, n_rating = 0.0f;
850 // Certainty of the current run in each.
851 float b_certainty = 0.0f, n_certainty = 0.0f;
852 // True if any word is missing its best choice.
853 bool b_bad = false, n_bad = false;
854 // True if all words have a valid permuter.
855 bool b_valid_permuter = true, n_valid_permuter = true;
856 const int end_b = b < best_words->size() ? b + 1 : b;
857 const int end_n = n < new_words->size() ? n + 1 : n;
858 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
859 &b_bad, &b_valid_permuter);
860 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
861 &n_bad, &n_valid_permuter);
862 bool new_better = false;
863 if (!n_bad && (b_bad || (n_certainty > b_certainty &&
864 n_rating < b_rating) ||
865 (!b_valid_permuter && n_valid_permuter &&
866 n_rating < b_rating * rating_ratio &&
867 n_certainty > b_certainty - certainty_margin))) {
868 // New is better.
869 for (int i = start_n; i < end_n; ++i) {
870 out_words.push_back((*new_words)[i]);
871 (*new_words)[i] = nullptr;
872 ++num_new;
873 }
874 new_better = true;
875 } else if (!b_bad) {
876 // Current best is better.
877 for (int i = start_b; i < end_b; ++i) {
878 out_words.push_back((*best_words)[i]);
879 (*best_words)[i] = nullptr;
880 ++num_best;
881 }
882 }
883 if (debug) {
884 tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
885 " valid dict: %d v %d\n",
886 end_n - start_n, new_better ? "better" : "worse",
887 end_b - start_b, n_rating, b_rating,
888 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
889 }
890 // Move on to the next group.
891 b = end_b;
892 n = end_n;
893 }
894 // Transfer from out_words to best_words.
895 best_words->clear();
896 for (int i = 0; i < out_words.size(); ++i)
897 best_words->push_back(out_words[i]);
898 return num_new - num_best;
899}
900
901// Helper to recognize the word using the given (language-specific) tesseract.
902// Returns positive if this recognizer found more new best words than the
903// number kept from best_words.
905 WordRecognizer recognizer, bool debug,
906 WERD_RES** in_word,
907 PointerVector<WERD_RES>* best_words) {
908 if (debug) {
909 tprintf("Trying word using lang %s, oem %d\n",
910 lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
911 }
912 // Run the recognizer on the word.
913 PointerVector<WERD_RES> new_words;
914 (this->*recognizer)(word_data, in_word, &new_words);
915 if (new_words.empty()) {
916 // Transfer input word to new_words, as the classifier must have put
917 // the result back in the input.
918 new_words.push_back(*in_word);
919 *in_word = nullptr;
920 }
921 if (debug) {
922 for (int i = 0; i < new_words.size(); ++i)
923 new_words[i]->DebugTopChoice("Lang result");
924 }
925 // Initial version is a bit of a hack based on better certainty and rating
926 // or a dictionary vs non-dictionary word.
927 return SelectBestWords(classify_max_rating_ratio,
929 debug, &new_words, best_words);
930}
931
932// Helper returns true if all the words are acceptable.
933static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
934 for (int w = 0; w < words.size(); ++w) {
935 if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
936 }
937 return true;
938}
939
940#ifndef DISABLED_LEGACY_ENGINE
941
942// Moves good-looking "noise"/diacritics from the reject list to the main
943// blob list on the current word. Returns true if anything was done, and
944// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
946 bool* make_next_word_fuzzy) {
947 *make_next_word_fuzzy = false;
948 WERD* real_word = pr_it->word()->word;
// Nothing to do when there are no rejected blobs, no real blobs to attach
// them to, or more rejected blobs than noise_maxperword allows.
949 if (real_word->rej_cblob_list()->empty() ||
950 real_word->cblob_list()->empty() ||
951 real_word->rej_cblob_list()->length() > noise_maxperword)
952 return false;
953 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
954 // Get the noise outlines into a vector with matching bool map.
956 real_word->GetNoiseOutlines(&outlines);
957 GenericVector<bool> word_wanted;
958 GenericVector<bool> overlapped_any_blob;
959 GenericVector<C_BLOB*> target_blobs;
960 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
961 &word_wanted, &overlapped_any_blob,
962 &target_blobs);
963 // Filter the outlines that overlapped any blob and put them into the word
964 // now. This simplifies the remaining task and also makes it more accurate
965 // as it has more completed blobs to work on.
966 GenericVector<bool> wanted;
967 GenericVector<C_BLOB*> wanted_blobs;
968 GenericVector<C_OUTLINE*> wanted_outlines;
969 int num_overlapped = 0;
970 int num_overlapped_used = 0;
971 for (int i = 0; i < overlapped_any_blob.size(); ++i) {
972 if (overlapped_any_blob[i]) {
973 ++num_overlapped;
974 if (word_wanted[i]) ++num_overlapped_used;
975 wanted.push_back(word_wanted[i]);
976 wanted_blobs.push_back(target_blobs[i]);
977 wanted_outlines.push_back(outlines[i]);
// Clearing the slot marks this outline as already handled for the
// second (non-overlapping) stage below.
978 outlines[i] = nullptr;
979 }
980 }
981 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
982 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
983 &target_blobs);
984 int non_overlapped = 0;
985 int non_overlapped_used = 0;
// NOTE(review): both statements in this loop increment non_overlapped_used,
// so non_overlapped stays 0 (the debug line below always prints "/0") and
// non_overlapped_used also counts every outline slot that is still non-null,
// whether or not it is wanted. By analogy with the overlapped tally above,
// the second statement presumably should be `++non_overlapped`. The inflated
// count also feeds the return value, so the function can report work done
// when no non-overlapping outline was actually wanted - confirm and fix.
986 for (int i = 0; i < word_wanted.size(); ++i) {
987 if (word_wanted[i]) ++non_overlapped_used;
988 if (outlines[i] != nullptr) ++non_overlapped_used;
989 }
991 tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
992 num_overlapped_used, num_overlapped, non_overlapped_used,
993 non_overlapped);
994 real_word->bounding_box().print();
995 }
996 // Now we have decided which outlines we want, put them into the real_word.
997 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
998 make_next_word_fuzzy)) {
999 pr_it->MakeCurrentWordFuzzy();
1000 }
1001 // TODO(rays) Parts of combos have a deep copy of the real word, and need
1002 // to have their noise outlines moved/assigned in the same way!!
1003 return num_overlapped_used != 0 || non_overlapped_used != 0;
1004}
1005
1006// Attempts to put noise/diacritic outlines into the blobs that they overlap.
1007// Input: a set of noisy outlines that probably belong to the real_word.
1008// Output: word_wanted indicates which outlines are to be assigned to a blob,
1009// target_blobs indicates which to assign to, and overlapped_any_blob is
1010// true for all outlines that overlapped a blob.
// All three output vectors are resized to outlines.size() and reset
// (false / nullptr) before any blob is examined.
// NOTE(review): listing line 1011, which should carry the return type and
// function name, is absent from this listing; confirm against the original.
1012 const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1013 PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1014 GenericVector<bool>* overlapped_any_blob,
1015 GenericVector<C_BLOB*>* target_blobs) {
// Per-blob scratch mask; re-initialized for every blob in the loop below.
1016 GenericVector<bool> blob_wanted;
1017 word_wanted->init_to_size(outlines.size(), false);
1018 overlapped_any_blob->init_to_size(outlines.size(), false);
1019 target_blobs->init_to_size(outlines.size(), nullptr);
1020 // For each real blob, find the outlines that seriously overlap it.
1021 // A single blob could be several merged characters, so there can be quite
1022 // a few outlines overlapping, and the full engine needs to be used to chop
1023 // and join to get a sensible result.
1024 C_BLOB_IT blob_it(real_word->cblob_list());
1025 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1026 C_BLOB* blob = blob_it.data();
1027 const TBOX blob_box = blob->bounding_box();
1028 blob_wanted.init_to_size(outlines.size(), false);
1029 int num_blob_outlines = 0;
// An outline already claimed by an earlier blob (word_wanted set) is not
// offered to this blob, and does not count towards num_blob_outlines.
1030 for (int i = 0; i < outlines.size(); ++i) {
1031 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1032 !(*word_wanted)[i]) {
1033 blob_wanted[i] = true;
1034 (*overlapped_any_blob)[i] = true;
1035 ++num_blob_outlines;
1036 }
1037 }
1038 if (debug_noise_removal) {
1039 tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1040 blob_box.print();
1041 }
1042 // If any outlines overlap the blob, and not too many, classify the blob
1043 // (using the full engine, languages and all), and choose the maximal
1044 // combination of outlines that doesn't hurt the end-result classification
1045 // by too much. Mark them as wanted.
// The upper bound is strict: a blob with noise_maxperblob or more
// candidate outlines is left untouched.
1046 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1047 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1048 outlines, num_blob_outlines,
1049 &blob_wanted)) {
1050 for (int i = 0; i < blob_wanted.size(); ++i) {
1051 if (blob_wanted[i]) {
1052 // Claim the outline and record where it is going.
1053 (*word_wanted)[i] = true;
1054 (*target_blobs)[i] = blob;
1055 }
1056 }
1057 }
1058 }
1059 }
1060}
1061
1062// Attempts to assign non-overlapping outlines to their nearest blobs or
1063// make new blobs out of them.
// Entries of outlines that are nullptr are skipped; each maximal run of
// consecutive non-null entries is treated as one group. word_wanted and
// target_blobs are reset to outlines.size() entries on entry and are filled
// in only for groups that one of the three attempts below accepts.
// NOTE(review): listing lines 1064 (return type and function name), 1093
// and 1113 are absent from this listing; confirm against the original.
1065 const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1066 PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1067 GenericVector<C_BLOB*>* target_blobs) {
1068 GenericVector<bool> blob_wanted;
1069 word_wanted->init_to_size(outlines.size(), false);
1070 target_blobs->init_to_size(outlines.size(), nullptr);
1071 // Check for outlines that need to be turned into stand-alone blobs.
1072 for (int i = 0; i < outlines.size(); ++i) {
1073 if (outlines[i] == nullptr) continue;
1074 // Get a set of adjacent outlines that don't overlap any existing blob.
1075 blob_wanted.init_to_size(outlines.size(), false);
1076 int num_blob_outlines = 0;
1077 TBOX total_ol_box(outlines[i]->bounding_box());
// This inner loop advances the outer index i past the whole run; the outer
// ++i then steps over the nullptr entry (or the end) that terminated it.
1078 while (i < outlines.size() && outlines[i] != nullptr) {
1079 blob_wanted[i] = true;
1080 total_ol_box += outlines[i]->bounding_box();
1081 ++i;
1082 ++num_blob_outlines;
1083 }
1084 // Find the insertion point.
// Stops on the last blob whose successor starts to the right of the
// group's left edge (or on the final blob of the list).
1085 C_BLOB_IT blob_it(real_word->cblob_list());
1086 while (!blob_it.at_last() &&
1087 blob_it.data_relative(1)->bounding_box().left() <=
1088 total_ol_box.left()) {
1089 blob_it.forward();
1090 }
1091 // Choose which combination of them we actually want and where to put
1092 // them.
// NOTE(review): with listing line 1093 missing, the tprintf below looks
// unconditional here; it is probably guarded in the original -- confirm.
1094 tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1095 C_BLOB* left_blob = blob_it.data();
1096 TBOX left_box = left_blob->bounding_box();
1097 C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
// First choice: merge into the left neighbour. This is tried when the
// group overlaps it in x, there is no right neighbour, or the right
// neighbour does not overlap the group.
1098 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1099 !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1100 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1101 outlines, num_blob_outlines,
1102 &blob_wanted)) {
1103 if (debug_noise_removal) tprintf("Added to left blob\n");
1104 for (int j = 0; j < blob_wanted.size(); ++j) {
1105 if (blob_wanted[j]) {
1106 (*word_wanted)[j] = true;
1107 (*target_blobs)[j] = left_blob;
1108 }
1109 }
// Second choice: merge into the right neighbour, if there is one.
// NOTE(review): listing line 1113 (start of the call whose arguments
// follow on 1114-1115) is absent from this listing.
1110 } else if (right_blob != nullptr &&
1111 (!left_box.x_overlap(total_ol_box) ||
1112 right_blob->bounding_box().x_overlap(total_ol_box)) &&
1114 right_blob, outlines,
1115 num_blob_outlines, &blob_wanted)) {
1116 if (debug_noise_removal) tprintf("Added to right blob\n");
1117 for (int j = 0; j < blob_wanted.size(); ++j) {
1118 if (blob_wanted[j]) {
1119 (*word_wanted)[j] = true;
1120 (*target_blobs)[j] = right_blob;
1121 }
1122 }
// Last choice: keep the group on its own (blob argument nullptr); a
// nullptr target is recorded for each wanted outline.
1123 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1124 outlines, num_blob_outlines,
1125 &blob_wanted)) {
1126 if (debug_noise_removal) tprintf("Fitted between blobs\n");
1127 for (int j = 0; j < blob_wanted.size(); ++j) {
1128 if (blob_wanted[j]) {
1129 (*word_wanted)[j] = true;
1130 (*target_blobs)[j] = nullptr;
1131 }
1132 }
1133 }
1134 }
1135}
1136
1137// Starting with ok_outlines set to indicate which outlines overlap the blob,
1138// chooses the optimal set (approximately) and returns true if any outlines
1139// are desired, in which case ok_outlines indicates which ones.
// pass, pr_it:         forwarded to the classification helpers.
// certainty_threshold: certainty target when blob is nullptr; otherwise the
//                      value the blob's own certainty is blended towards.
// blob:                blob the outlines would join, or nullptr.
// outlines:            candidate outlines, indexed like *ok_outlines.
// num_outlines:        number of flags initially set in *ok_outlines.
// ok_outlines:         in/out mask; only overwritten when true is returned.
// NOTE(review): listing line 1140 (return type and function name) is absent
// from this listing; confirm against the original.
1141 int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
1142 const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
1143 GenericVector<bool>* ok_outlines) {
1144 STRING best_str;
1145 float target_cert = certainty_threshold;
1146 if (blob != nullptr) {
1147 float target_c2;
1148 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1149 if (debug_noise_removal) {
1150 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1151 target_cert, target_c2);
1152 blob->bounding_box().print();
1153 }
// Move the target from the bare blob's certainty towards the threshold
// by the fraction noise_cert_factor.
1154 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1155 }
1156 GenericVector<bool> test_outlines = *ok_outlines;
1157 // Start with all the outlines in.
1158 STRING all_str;
1159 GenericVector<bool> best_outlines = *ok_outlines;
1160 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1161 pr_it, blob, &all_str);
1162 if (debug_noise_removal) {
1163 TBOX ol_box;
1164 for (int i = 0; i < test_outlines.size(); ++i) {
1165 if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1166 }
1167 tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1168 all_str.string(), best_cert, best_cert - target_cert);
1169 ol_box.print();
1170 }
1171 // Iteratively zero out the bit that improves the certainty the most, until
1172 // we get past the threshold, have zero bits, or fail to improve.
1173 int best_index = 0; // To zero out.
// NOTE(review): the parenthesized clause below is always true (it contains
// both blob == nullptr and blob != nullptr), so the loop is bounded only by
// num_outlines > 1 and by a round that finds no improvement.
1174 while (num_outlines > 1 && best_index >= 0 &&
1175 (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1176 // Find the best bit to zero out.
1177 best_index = -1;
1178 for (int i = 0; i < outlines.size(); ++i) {
1179 if (test_outlines[i]) {
// Trial removal of outline i; the flag is restored at the end of this
// iteration whatever the result.
1180 test_outlines[i] = false;
1181 STRING str;
1182 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1183 pr_it, blob, &str);
1184 if (debug_noise_removal) {
1185 TBOX ol_box;
1186 for (int j = 0; j < outlines.size(); ++j) {
1187 if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1188 tprintf("%d", test_outlines[j]);
1189 }
1190 tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1191 cert, cert - target_cert);
1192 ol_box.print();
1193 }
// Only a strict improvement replaces the best combination so far.
1194 if (cert > best_cert) {
1195 best_cert = cert;
1196 best_index = i;
1197 best_outlines = test_outlines;
1198 }
1199 test_outlines[i] = true;
1200 }
1201 }
// Commit this round's best removal before the next round.
1202 if (best_index >= 0) {
1203 test_outlines[best_index] = false;
1204 --num_outlines;
1205 }
1206 }
1207 if (best_cert >= target_cert) {
1208 // Save the best combination.
1209 *ok_outlines = best_outlines;
1210 if (debug_noise_removal) {
1211 tprintf("%s noise combination ", blob ? "Adding" : "New");
1212 for (int i = 0; i < best_outlines.size(); ++i) {
1213 tprintf("%d", best_outlines[i]);
1214 }
1215 tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1216 target_cert);
1217 }
1218 return true;
1219 }
1220
1221 return false;
1222}
1223
1224// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1225// the inclusion of the outlines, and returns the certainty of the raw choice.
// When blob is nullptr a temporary C_BLOB is built from the flagged outlines
// instead, and the value returned is the negated c2 figure produced by
// ClassifyBlobAsWord rather than the certainty itself.
// NOTE(review): listing line 1226 (return type and function name) is absent
// from this listing; confirm against the original.
1227 const GenericVector<bool>& ok_outlines,
1228 const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
1229 C_BLOB* blob, STRING* best_str) {
1230 C_OUTLINE_IT ol_it;
// first_to_keep marks where the blob's own outlines begin, so that the
// ones added below can be removed again afterwards.
1231 C_OUTLINE* first_to_keep = nullptr;
1232 C_BLOB* local_blob = nullptr;
1233 if (blob != nullptr) {
1234 // Add the required outlines to the blob.
1235 ol_it.set_to_list(blob->out_list());
1236 first_to_keep = ol_it.data();
1237 }
// NOTE(review): if blob is nullptr and no flag in ok_outlines is set, blob
// is still nullptr at the ClassifyBlobAsWord call and ol_it was never
// attached to a list; callers presumably always flag at least one -- confirm.
1238 for (int i = 0; i < ok_outlines.size(); ++i) {
1239 if (ok_outlines[i]) {
1240 // This outline is to be added.
1241 if (blob == nullptr) {
1242 local_blob = new C_BLOB(outlines[i]);
1243 blob = local_blob;
1244 ol_it.set_to_list(blob->out_list());
1245 } else {
1246 ol_it.add_before_stay_put(outlines[i]);
1247 }
1248 }
1249 }
1250 float c2;
1251 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1252 ol_it.move_to_first();
1253 if (first_to_keep == nullptr) {
1254 // We created blob. Empty its outlines and delete it.
// Extracting first presumably keeps the caller's outlines alive when the
// temporary blob is deleted -- TODO confirm C_BLOB ownership rules.
1255 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1256 delete local_blob;
1257 cert = -c2;
1258 } else {
1259 // Remove the outlines that we put in.
1260 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1261 ol_it.extract();
1262 }
1263 }
1264 return cert;
1265}
1266
1267// Classifies the given blob (part of word_data->word->word) as an individual
1268// word, using languages, chopper etc, returning only the certainty of the
1269// best raw choice, and undoing all the work done to fake out the word.
// Outputs: *best_str receives the raw choice's string ("" if there is no raw
// choice) and *c2 receives certainty^2 / rating (0 if the rating is not
// positive or there is no raw choice). The return value is 0 when there is
// no raw choice.
// NOTE(review): listing lines 1270 (start of the declaration) and 1288 (end
// of the first debug tprintf) are absent from this listing.
1271 C_BLOB* blob, STRING* best_str, float* c2) {
1272 WERD* real_word = pr_it->word()->word;
// Build a one-blob word from a deep copy of blob, inheriting the BOL/EOL
// flags of the real word, and insert it as a clone next to the current word.
1273 WERD* word = real_word->ConstructFromSingleBlob(
1274 real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1275 WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1276 // Get a new iterator that points to the new word.
1277 PAGE_RES_IT it(pr_it->page_res);
1278 while (it.word() != word_res && it.word() != nullptr) it.forward();
1279 ASSERT_HOST(it.word() == word_res);
1280 WordData wd(it);
1281 // Force full initialization.
1282 SetupWordPassN(1, &wd);
1283 classify_word_and_language(pass_n, &it, &wd);
1284 if (debug_noise_removal) {
1285 if (wd.word->raw_choice != nullptr) {
1286 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1287 wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1289 } else {
1290 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1291 wd.row->x_height());
1292 }
1293 }
1294 float cert = 0.0f;
1295 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1296 cert = wd.word->raw_choice->certainty();
1297 float rat = wd.word->raw_choice->rating();
1298 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1299 *best_str = wd.word->raw_choice->unichar_string();
1300 } else {
1301 *c2 = 0.0f;
1302 *best_str = "";
1303 }
// Remove the temporary word again and reset the caller's iterator, which
// the insertion/deletion above has presumably left stale.
1304 it.DeleteCurrentWord();
1305 pr_it->ResetWordIterator();
1306 return cert;
1307}
1308
1309#endif // ndef DISABLED_LEGACY_ENGINE
1310
1311// Generic function for classifying a word. Can be used either for pass1 or
1312// pass2 according to the function passed to recognizer.
1313// word_data holds the word to be recognized, and its block and row, and
1314// pr_it points to the word as well, in case we are running LSTM and it wants
1315// to output multiple words.
1316// Recognizes in the current language, and if successful that is all.
1317// If recognition was not successful, tries all available languages until
1318// it gets a successful result or runs out of languages. Keeps the best result.
// Side effects: updates most_recently_used_, may replace word_data->word and
// the current word of pr_it, and prints diagnostics when debugging is on.
// NOTE(review): listing lines 1319 (start of the declaration), 1322, 1325
// (parts of the recognizer selection) and 1388 (condition guarding the final
// timing printout) are absent from this listing.
1320 WordData* word_data) {
1321#ifdef DISABLED_LEGACY_ENGINE
1323#else
1324 WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1326#endif // def DISABLED_LEGACY_ENGINE
1327
1328 // Best result so far.
1329 PointerVector<WERD_RES> best_words;
1330 // Points to the best result. May be word or in lang_words.
1331 const WERD_RES* word = word_data->word;
1332 clock_t start_t = clock();
1333 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1334 if (debug) {
1335 tprintf("%s word with lang %s at:",
1336 word->done ? "Already done" : "Processing",
1337 most_recently_used_->lang.string());
1338 word->word->bounding_box().print();
1339 }
1340 if (word->done) {
1341 // If done on pass1, leave it as-is.
1342 if (!word->tess_failed)
1343 most_recently_used_ = word->tesseract;
1344 return;
1345 }
// sub indexes word_data->lang_words. The slot at sub_langs_.size() is the
// one used for this object itself (see the this->RetryWithLanguage call
// below); slots [0, size) correspond to sub_langs_.
1346 int sub = sub_langs_.size();
1347 if (most_recently_used_ != this) {
1348 // Get the index of the most_recently_used_.
1349 for (sub = 0; sub < sub_langs_.size() &&
1350 most_recently_used_ != sub_langs_[sub]; ++sub) {}
1351 }
1352 most_recently_used_->RetryWithLanguage(
1353 *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1354 Tesseract* best_lang_tess = most_recently_used_;
1355 if (!WordsAcceptable(best_words)) {
1356 // Try all the other languages to see if they are any better.
// A language becomes best_lang_tess only when its retry returns > 0.
1357 if (most_recently_used_ != this &&
1358 this->RetryWithLanguage(*word_data, recognizer, debug,
1359 &word_data->lang_words[sub_langs_.size()],
1360 &best_words) > 0) {
1361 best_lang_tess = this;
1362 }
// Stops as soon as the accumulated best_words become acceptable.
1363 for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1364 ++i) {
1365 if (most_recently_used_ != sub_langs_[i] &&
1366 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1367 &word_data->lang_words[i],
1368 &best_words) > 0) {
1369 best_lang_tess = sub_langs_[i];
1370 }
1371 }
1372 }
1373 most_recently_used_ = best_lang_tess;
1374 if (!best_words.empty()) {
1375 if (best_words.size() == 1 && !best_words[0]->combination) {
1376 // Move the best single result to the main word.
1377 word_data->word->ConsumeWordResults(best_words[0]);
1378 } else {
1379 // Words came from LSTM, and must be moved to the PAGE_RES properly.
1380 word_data->word = best_words.back();
1381 pr_it->ReplaceCurrentWord(&best_words);
1382 }
1383 ASSERT_HOST(word_data->word->box_word != nullptr);
1384 } else {
1385 tprintf("no best words!!\n");
1386 }
1387 clock_t ocr_t = clock();
// NOTE(review): the printout below dereferences word_data->word->best_choice
// unconditionally; whether it can be null after the "no best words" branch
// is not visible here -- confirm.
1389 tprintf("%s (ocr took %.2f sec)\n",
1390 word_data->word->best_choice->unichar_string().string(),
1391 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1392 }
1393}
1394
// First-pass word recognizer (referenced above as
// &Tesseract::classify_word_pass1). Visible behavior: unless ANDROID_BUILD
// is defined, tries LSTMRecognizeWord and returns early when it produced
// output words or when the engine mode is OEM_LSTM_ONLY; otherwise (legacy
// engine enabled) runs match_word_pass_n(1, ...) on *in_word and, for words
// that did not fail and are not W_REP_CHAR, feeds adaptable words to
// LearnWord.
// NOTE(review): many lines of this function are absent from this listing
// (1395-1401 including the declaration start, 1410, 1412-1413, 1420,
// 1430-1431, 1446, 1450-1451, 1456), so the brace structure below does not
// balance as shown; confirm details against the original.
1402 WERD_RES** in_word,
1403 PointerVector<WERD_RES>* out_words) {
1404 ROW* row = word_data.row;
1405 BLOCK* block = word_data.block;
// Remember the previous word's best choice (nullptr for the first word).
1406 prev_word_best_choice_ = word_data.prev_word != nullptr
1407 ? word_data.prev_word->word->best_choice : nullptr;
1408#ifndef ANDROID_BUILD
1409#ifdef DISABLED_LEGACY_ENGINE
1411#else
1414#endif // def DISABLED_LEGACY_ENGINE
// Odd-sized words skip LSTM unless LSTM is the only engine allowed.
1415 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1416 LSTMRecognizeWord(*block, row, *in_word, out_words);
1417 if (!out_words->empty())
1418 return; // Successful lstm recognition.
1419 }
1421 // No fallback allowed, so use a fake.
1422 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1423 return;
1424 }
1425
1426 #ifndef DISABLED_LEGACY_ENGINE
1427 // Fall back to tesseract for failed words or odd words.
1428 (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1429 OEM_TESSERACT_ONLY, nullptr,
1432 poly_allow_detailed_fx, row, block);
1433#endif // ndef DISABLED_LEGACY_ENGINE
1434 }
1435#endif // ndef ANDROID_BUILD
1436
1437#ifndef DISABLED_LEGACY_ENGINE
1438 WERD_RES* word = *in_word;
1439 match_word_pass_n(1, word, row, block);
1440 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1441 word->tess_would_adapt = AdaptableWord(word);
1442 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1443
1444 if (adapt_ok) {
1445 // Send word to adaptive classifier for training.
1447 LearnWord(nullptr, word);
1448 // Mark misadaptions if running blamer.
1449 if (word->blamer_bundle != nullptr) {
1452 }
1453 }
1454
1455 if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1457 }
1458#endif // ndef DISABLED_LEGACY_ENGINE
1459}
1460
1461// Helper to report the result of the xheight fix.
1462void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
1463 WERD_RES* word, WERD_RES* new_word) {
1464 tprintf("New XHT Match:%s = %s ",
1466 word->best_choice->debug_string().string());
1467 word->reject_map.print(debug_fp);
1468 tprintf(" -> %s = %s ",
1469 new_word->best_choice->unichar_string().string(),
1470 new_word->best_choice->debug_string().string());
1471 new_word->reject_map.print(debug_fp);
1472 tprintf(" %s->%s %s %s\n",
1473 word->guessed_x_ht ? "GUESS" : "CERT",
1474 new_word->guessed_x_ht ? "GUESS" : "CERT",
1475 new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1476 accept_new_word ? "ACCEPTED" : "");
1477}
1478
1479#ifndef DISABLED_LEGACY_ENGINE
1480
1481// Run the x-height fix-up, based on min/max top/bottom information in
1482// unicharset.
1483// Returns true if the word was changed.
1484// See the comment in fixxht.cpp for a description of the overall process.
// Returns false immediately when the word has no misfit tops. A candidate
// x-height is only tried when it is at least kMinRefitXHeightFraction of the
// word's current x_height.
// NOTE(review): listing line 1485 (the declaration) is absent from this
// listing; the call site passes (word, block, row).
1486 int original_misfits = CountMisfitTops(word);
1487 if (original_misfits == 0)
1488 return false;
1489 float baseline_shift = 0.0f;
1490 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1491 if (baseline_shift != 0.0f) {
1492 // Try the shift on its own first.
1493 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1494 word, block, row))
1495 return false;
// The shift was accepted, so word now holds the shifted result; recount.
1496 original_misfits = CountMisfitTops(word);
1497 if (original_misfits > 0) {
// new_baseline_shift only receives the helper's output; it is not read
// afterwards, and the original baseline_shift is reused below.
1498 float new_baseline_shift;
1499 // Now recompute the new x_height.
1500 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1501 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1502 // No test of return value here, as we are definitely making a change
1503 // to the word by shifting the baseline.
1504 TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1505 word, block, row);
1506 }
1507 }
1508 return true;
1509 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1510 return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1511 word, block, row);
1512 } else {
1513 return false;
1514 }
1515}
1516
1517// Runs recognition with the test baseline shift and x-height and returns true
1518// if there was an improvement in recognition result.
// original_misfits: misfit count of word before the change.
// baseline_shift, new_x_ht: normalization parameters to try.
// word: current result; overwritten with the trial result only on success.
// block, row: context passed to the recognizer.
// NOTE(review): listing line 1533 (part of the SetupForRecognition argument
// list) is absent from this listing.
1519bool Tesseract::TestNewNormalization(int original_misfits,
1520 float baseline_shift, float new_x_ht,
1521 WERD_RES *word, BLOCK* block, ROW *row) {
1522 bool accept_new_x_ht = false;
// Trial result lives on the stack and is built from the same WERD.
1523 WERD_RES new_x_ht_word(word->word);
1524 if (word->blamer_bundle != nullptr) {
// Presumably owned and released by new_x_ht_word -- TODO confirm.
1525 new_x_ht_word.blamer_bundle = new BlamerBundle();
1526 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1527 }
1528 new_x_ht_word.x_height = new_x_ht;
1529 new_x_ht_word.baseline_shift = baseline_shift;
1530 new_x_ht_word.caps_height = 0.0;
1531 new_x_ht_word.SetupForRecognition(
1532 unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1534 poly_allow_detailed_fx, row, block);
1535 match_word_pass_n(2, &new_x_ht_word, row, block);
1536 if (!new_x_ht_word.tess_failed) {
1537 int new_misfits = CountMisfitTops(&new_x_ht_word);
1538 if (debug_x_ht_level >= 1) {
1539 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1540 original_misfits, word->x_height,
1541 new_misfits, new_x_ht);
1542 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1543 word->best_choice->rating(), word->best_choice->certainty(),
1544 new_x_ht_word.best_choice->rating(),
1545 new_x_ht_word.best_choice->certainty());
1546 }
1547 // The misfits must improve and either the rating or certainty.
// "Improve" here means: strictly fewer misfits, and a strictly higher
// certainty or a strictly lower rating.
1548 accept_new_x_ht = new_misfits < original_misfits &&
1549 (new_x_ht_word.best_choice->certainty() >
1550 word->best_choice->certainty() ||
1551 new_x_ht_word.best_choice->rating() <
1552 word->best_choice->rating());
1553 if (debug_x_ht_level >= 1) {
1554 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1555 }
1556 }
1557 if (accept_new_x_ht) {
1558 word->ConsumeWordResults(&new_x_ht_word);
1559 return true;
1560 }
1561 return false;
1562}
1563
1564#endif // ndef DISABLED_LEGACY_ENGINE
1565
// Second-pass word recognizer (the debug strings in check_debug_pt name it
// classify_word_pass2). Visible behavior with the legacy engine enabled:
// words not already done get caps_height cleared, a zero x_height replaced
// by the row's x_height, and are re-matched with match_word_pass_n(2, ...);
// words that did not fail and are not W_REP_CHAR may then go through
// TrainedXheightFix. out_words is not touched by the visible lines.
// NOTE(review): many lines are absent from this listing (1566-1572 including
// the declaration start, 1576, 1586, 1596, 1599, 1605, 1608, 1616, 1619), so
// several conditions and the brace structure are incomplete as shown.
1573 WERD_RES** in_word,
1574 PointerVector<WERD_RES>* out_words) {
1575 // Return if we do not want to run Tesseract.
1577 return;
1578 }
1579#ifndef DISABLED_LEGACY_ENGINE
1580 ROW* row = word_data.row;
1581 BLOCK* block = word_data.block;
1582 WERD_RES* word = *in_word;
1583 prev_word_best_choice_ = word_data.prev_word != nullptr
1584 ? word_data.prev_word->word->best_choice : nullptr;
1585
1587 check_debug_pt(word, 30);
1588 if (!word->done) {
1589 word->caps_height = 0.0;
1590 if (word->x_height == 0.0f)
1591 word->x_height = row->x_height();
1592 match_word_pass_n(2, word, row, block);
1593 check_debug_pt(word, 40);
1594 }
1595
1597
1598 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
// Only the tail of this condition is visible: the block's classify
// rotation must have a zero y component.
1600 block->classify_rotation().y() == 0.0f) {
1601 // Use the tops and bottoms since they are available.
1602 TrainedXheightFix(word, block, row);
1603 }
1604
1606 }
1607#ifndef GRAPHICS_DISABLED
// Debug display: plots the rebuilt word in fx_win and zooms to its box.
1609 if (fx_win == nullptr)
1610 create_fx_win();
1611 clear_fx_win();
1612 word->rebuild_word->plot(fx_win);
1613 TBOX wbox = word->rebuild_word->bounding_box();
1614 fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1615 wbox.right(), wbox.bottom());
1617 }
1618#endif
1620 check_debug_pt(word, 50);
1621#endif // ndef DISABLED_LEGACY_ENGINE
1622}
1623
1624#ifndef DISABLED_LEGACY_ENGINE
// Runs segmentation/recognition for the given pass on word (call sites use
// match_word_pass_n(pass_n, word, row, block)), then post-processes the
// result: quote and hyphen fix-ups, acceptability flag and reject map for
// non-repeated-character words, and font assignment. Does nothing if the
// word has already failed. Asserts that a raw choice exists on exit.
// NOTE(review): listing lines 1625-1630 (comment and declaration start) and
// 1638 (presumably a condition guarding fix_hyphens) are absent here.
1631 ROW *row, BLOCK* block) {
1632 if (word->tess_failed) return;
1633 tess_segment_pass_n(pass_n, word);
1634
// tess_segment_pass_n may itself set tess_failed, hence the re-check.
1635 if (!word->tess_failed) {
1636 if (!word->word->flag (W_REP_CHAR)) {
1637 word->fix_quotes();
1639 word->fix_hyphens();
1640 /* Don't trust fix_quotes! - though I think I've fixed the bug */
// A length mismatch is only reported; processing continues regardless.
1641 if (word->best_choice->length() != word->box_word->length()) {
1642 tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1643 " #Blobs=%d\n",
1644 word->best_choice->debug_string().string(),
1645 word->best_choice->length(),
1646 word->box_word->length());
1647
1648 }
1649 word->tess_accepted = tess_acceptable_word(word);
1650
1651 // Also sets word->done flag
1652 make_reject_map(word, row, pass_n);
1653 }
1654 }
1655 set_word_fonts(word);
1656
1657 ASSERT_HOST(word->raw_choice != nullptr);
1658}
1659#endif // ndef DISABLED_LEGACY_ENGINE
1660
1661// Helper to return the best rated BLOB_CHOICE in the whole word that matches
1662// the given char_id, or nullptr if none can be found.
1663static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1664 WERD_RES* word_res) {
1665 // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1666 BLOB_CHOICE* best_choice = nullptr;
1667 for (int i = 0; i < word_res->best_choice->length(); ++i) {
1668 BLOB_CHOICE* choice = FindMatchingChoice(char_id,
1669 word_res->GetBlobChoices(i));
1670 if (choice != nullptr) {
1671 if (best_choice == nullptr || choice->rating() < best_choice->rating())
1672 best_choice = choice;
1673 }
1674 }
1675 return best_choice;
1676}
1677
1678// Helper to insert blob_choice in each location in the leader word if there is
1679// no matching BLOB_CHOICE there already, and correct any incorrect results
1680// in the best_choice.
1681static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1682 WERD_RES* word_res) {
1683 WERD_CHOICE* word = word_res->best_choice;
1684 for (int i = 0; i < word_res->best_choice->length(); ++i) {
1685 BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1686 word_res->GetBlobChoices(i));
1687 if (choice == nullptr) {
1688 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1689 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1690 }
1691 }
1692 // Correct any incorrect results in word.
1693 for (int i = 0; i < word->length(); ++i) {
1694 if (word->unichar_id(i) != blob_choice->unichar_id())
1695 word->set_unichar_id(blob_choice->unichar_id(), i);
1696 }
1697}
1698
// Repairs a repeated-character (leader) word: finds the most frequent
// unichar id in the word's best choice, picks the best matching BLOB_CHOICE
// for it, marks the word done, rewrites every position to that character
// via CorrectRepcharChoices and re-initialises the reject map. If no
// matching choice exists, a message is printed and the word is left as is.
// NOTE(review): listing lines 1699-1706 (comment and declaration) are absent
// from this listing; the body reads a parameter named page_res_it.
1707 WERD_RES *word_res = page_res_it->word();
1708 const WERD_CHOICE &word = *(word_res->best_choice);
1709
1710 // Find the frequency of each unique character in the word.
1711 SortHelper<UNICHAR_ID> rep_ch(word.length());
1712 for (int i = 0; i < word.length(); ++i) {
1713 rep_ch.Add(word.unichar_id(i), 1);
1714 }
1715
1716 // Find the most frequent result.
1717 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1718 int max_count = rep_ch.MaxCount(&maxch_id);
1719 // Find the best exemplar of a classifier result for maxch_id.
1720 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1721 if (best_choice == nullptr) {
1722 tprintf("Failed to find a choice for %s, occurring %d times\n",
1723 word_res->uch_set->debug_str(maxch_id).string(), max_count);
1724 return;
1725 }
1726 word_res->done = true;
1727
1728 // Measure the mean space.
// NOTE(review): gap and gap_count are computed here but never read again
// in this function, so the loop has no visible effect on the result.
1729 int gap_count = 0;
1730 WERD* werd = word_res->word;
1731 C_BLOB_IT blob_it(werd->cblob_list());
1732 C_BLOB* prev_blob = blob_it.data();
1733 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1734 C_BLOB* blob = blob_it.data();
1735 int gap = blob->bounding_box().left();
1736 gap -= prev_blob->bounding_box().right();
1737 ++gap_count;
1738 prev_blob = blob;
1739 }
1740 // Just correct existing classification.
1741 CorrectRepcharChoices(best_choice, word_res);
1742 word_res->reject_map.initialise(word.length());
1743}
1744
// Classifies the string s as one of the AC_* word types by matching it
// against a fixed shape: optional single leading punctuation, then either an
// all-upper-case run or a lower-case word (optionally with an initial cap,
// one hyphen, or a trailing "'s"), then up to two trailing punctuation
// characters. If that fails, an abbreviation shape (letter '.' pairs) is
// tried. Anything left over in s makes the result AC_UNACCEPTABLE.
//
// char_set: used for the upper/lower-case tests.
// s:        NUL-terminated string to examine.
// lengths:  lengths[i] is added to the offset into s for the i-th character,
//           i.e. it appears to hold per-character byte lengths and to be
//           NUL-terminated itself (strlen is applied to it).
// NOTE(review): listing lines 1745 (return type and function name) and 1752
// (declaration/initial value of word_type) are absent from this listing.
1746 const UNICHARSET& char_set, const char *s, const char *lengths) {
1747 int i = 0;
1748 int offset = 0;
1749 int leading_punct_count;
1750 int upper_count = 0;
1751 int hyphen_pos = -1;
1753
// Words of more than 20 characters are not examined at all.
1754 if (strlen (lengths) > 20)
1755 return word_type;
1756
1757 /* Single Leading punctuation char*/
1758
1759 if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1760 offset += lengths[i++];
1761 leading_punct_count = i;
1762
1763 /* Initial cap */
1764 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1765 offset += lengths[i++];
1766 upper_count++;
1767 }
1768 if (upper_count > 1) {
1769 word_type = AC_UPPER_CASE;
1770 } else {
1771 /* Lower case word, possibly with an initial cap */
1772 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1773 offset += lengths[i++];
1774 }
// Too few leading letters: skip straight to the abbreviation check.
1775 if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1776 goto not_a_word;
1777 /*
1778 Allow a single hyphen in a lower case word
1779 - don't trust upper case - I've seen several cases of "H" -> "I-I"
1780 */
1781 if (lengths[i] == 1 && s[offset] == '-') {
1782 hyphen_pos = i;
1783 offset += lengths[i++];
1784 if (s[offset] != '\0') {
1785 while ((s[offset] != '\0') &&
1786 char_set.get_islower(s + offset, lengths[i])) {
1787 offset += lengths[i++];
1788 }
// Require at least two lower-case characters after the hyphen.
1789 if (i < hyphen_pos + 3)
1790 goto not_a_word;
1791 }
1792 } else {
1793 /* Allow "'s" in NON hyphenated lower case words */
1794 if (lengths[i] == 1 && (s[offset] == '\'') &&
1795 lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1796 offset += lengths[i++];
1797 offset += lengths[i++];
1798 }
1799 }
1800 if (upper_count > 0)
1801 word_type = AC_INITIAL_CAP;
1802 else
1803 word_type = AC_LOWER_CASE;
1804 }
1805
1806 /* Up to two different, constrained trailing punctuation chars */
1807 if (lengths[i] == 1 && s[offset] != '\0' &&
1808 STRING(chs_trailing_punct1).contains(s[offset]))
1809 offset += lengths[i++];
// The second trailing character must differ from the character before it.
1810 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1811 s[offset - lengths[i - 1]] != s[offset] &&
1812 STRING(chs_trailing_punct2).contains (s[offset]))
1813 offset += lengths[i++];
1814
// Unconsumed characters mean the word did not fit the shape above.
1815 if (s[offset] != '\0')
1816 word_type = AC_UNACCEPTABLE;
1817
1818 not_a_word:
1819
1820 if (word_type == AC_UNACCEPTABLE) {
1821 /* Look for abbreviation string */
// Restart from the beginning of s and consume (letter, '.') pairs.
1822 i = 0;
1823 offset = 0;
1824 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1825 word_type = AC_UC_ABBREV;
1826 while (s[offset] != '\0' &&
1827 char_set.get_isupper(s + offset, lengths[i]) &&
1828 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1829 offset += lengths[i++];
1830 offset += lengths[i++];
1831 }
1832 }
1833 else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1834 word_type = AC_LC_ABBREV;
1835 while (s[offset] != '\0' &&
1836 char_set.get_islower(s + offset, lengths[i]) &&
1837 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1838 offset += lengths[i++];
1839 offset += lengths[i++];
1840 }
1841 }
1842 if (s[offset] != '\0')
1843 word_type = AC_UNACCEPTABLE;
1844 }
1845
1846 return word_type;
1847}
1848
// Debug hook called at numbered stages of word processing. Returns false at
// once when test_pt is not set. Otherwise it first switches
// tessedit_rejection_debug off and debug_x_ht_level to 0; when the guarded
// branch below is taken it switches them back on (true / 2), prints a banner
// naming the stage given by location together with the word's current
// state, and returns true. A negative location returns true without
// printing.
// NOTE(review): listing line 1859, the `if` that opens the guarded branch
// closed near the end of the function, is absent from this listing, so the
// condition that selects the word of interest is not visible.
1849bool Tesseract::check_debug_pt(WERD_RES* word, int location) {
1850 bool show_map_detail = false;
1851 int16_t i;
1852
1853 if (!test_pt)
1854 return false;
1855
1856 tessedit_rejection_debug.set_value (false);
1857 debug_x_ht_level.set_value(0);
1858
1860 if (location < 0)
1861 return true; // For breakpoint use
1862 tessedit_rejection_debug.set_value(true);
1863 debug_x_ht_level.set_value(2);
1864 tprintf ("\n\nTESTWD::");
// Locations not listed below print no stage name.
1865 switch (location) {
1866 case 0:
1867 tprintf ("classify_word_pass1 start\n");
1868 word->word->print();
1869 break;
1870 case 10:
1871 tprintf ("make_reject_map: initial map");
1872 break;
1873 case 20:
1874 tprintf ("make_reject_map: after NN");
1875 break;
1876 case 30:
1877 tprintf ("classify_word_pass2 - START");
1878 break;
1879 case 40:
1880 tprintf ("classify_word_pass2 - Pre Xht");
1881 break;
1882 case 50:
1883 tprintf ("classify_word_pass2 - END");
1884 show_map_detail = true;
1885 break;
1886 case 60:
1887 tprintf ("fixspace");
1888 break;
1889 case 70:
1890 tprintf ("MM pass START");
1891 break;
1892 case 80:
1893 tprintf ("MM pass END");
1894 break;
1895 case 90:
1896 tprintf ("After Poor quality rejection");
1897 break;
1898 case 100:
1899 tprintf ("unrej_good_quality_words - START");
1900 break;
1901 case 110:
1902 tprintf ("unrej_good_quality_words - END");
1903 break;
1904 case 120:
1905 tprintf ("Write results pass");
1906 show_map_detail = true;
1907 break;
1908 }
1909 if (word->best_choice != nullptr) {
1910 tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1911 word->reject_map.print(debug_fp);
1912 tprintf("\n");
1913 if (show_map_detail) {
1914 tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
// NOTE(review): reject_map is indexed by position i in the string's
// char storage; whether that always matches the reject map's indexing
// (e.g. for multi-byte characters) is not visible here -- confirm.
1915 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1916 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1917 word->reject_map[i].full_print(debug_fp);
1918 }
1919 }
1920 } else {
1921 tprintf("null best choice\n");
1922 }
1923 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1924 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1925 return true;
1926 } else {
1927 return false;
1928 }
1929}
1930
1936static void find_modal_font( // good chars in word
1937 STATS* fonts, // font stats
1938 int16_t* font_out, // output font
1939 int8_t* font_count // output count
1940) {
1941 int16_t font; //font index
1942 int32_t count; //pile count
1943
1944 if (fonts->get_total () > 0) {
1945 font = static_cast<int16_t>(fonts->mode ());
1946 *font_out = font;
1947 count = fonts->pile_count (font);
1948 *font_count = count < INT8_MAX ? count : INT8_MAX;
1949 fonts->add (font, -*font_count);
1950 }
1951 else {
1952 *font_out = -1;
1953 *font_count = 0;
1954 }
1955}
1956
// Body of Tesseract::set_word_fonts(WERD_RES *word).
// NOTE(review): the signature line (1962) is absent from this listing; the
// name and parameter come from the cross-reference index later in this file.
//
// Sums the per-blob font scores over the word's best choice, then stores the
// two highest-scoring fonts in word->fontinfo / word->fontinfo2 and their
// vote counts in word->fontinfo_id_count / word->fontinfo_id2_count.
// Returns early, changing nothing, when the word has no chopped_word or the
// font table is empty.
1963 // Don't try to set the word fonts for an lstm word, as the configs
1964 // will be meaningless.
1965 if (word->chopped_word == nullptr) return;
1966 ASSERT_HOST(word->best_choice != nullptr);
1967
1968#ifndef DISABLED_LEGACY_ENGINE
1969 const int fontinfo_size = get_fontinfo_table().size();
1970 if (fontinfo_size == 0) return;
     // One accumulator per font table entry, all starting at 0.
1971 GenericVector<int> font_total_score;
1972 font_total_score.init_to_size(fontinfo_size, 0);
1973
1974 // Compute the font scores for the word
     // NOTE(review): line 1975 is missing from this listing; the "}" at 1978
     // closes a block whose opener is not visible (presumably a debug guard
     // around this tprintf -- confirm against the real file).
1976 tprintf("Examining fonts in %s\n",
1977 word->best_choice->debug_string().string());
1978 }
1979 for (int b = 0; b < word->best_choice->length(); ++b) {
1980 const BLOB_CHOICE* choice = word->GetBlobChoice(b);
1981 if (choice == nullptr) continue;
1982 const GenericVector<ScoredFont>& fonts = choice->fonts();
1983 for (int f = 0; f < fonts.size(); ++f) {
1984 const int fontinfo_id = fonts[f].fontinfo_id;
     // Ids outside the font table are ignored rather than accumulated.
1985 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1986 font_total_score[fontinfo_id] += fonts[f].score;
1987 }
1988 }
1989 }
1990 // Find the top and 2nd choice for the word.
     // Both running scores start at 0 and are only replaced by strictly larger
     // totals, so a font with a total of 0 is never selected and the ids stay
     // at -1 when nothing scored.
1991 int score1 = 0, score2 = 0;
1992 int16_t font_id1 = -1, font_id2 = -1;
1993 for (int f = 0; f < fontinfo_size; ++f) {
1994 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1995 tprintf("Font %s, total score = %d\n",
1996 fontinfo_table_.get(f).name, font_total_score[f]);
1997 }
1998 if (font_total_score[f] > score1) {
     // New leader: the previous leader drops to second place.
1999 score2 = score1;
2000 font_id2 = font_id1;
2001 score1 = font_total_score[f];
2002 font_id1 = f;
2003 } else if (font_total_score[f] > score2) {
2004 score2 = font_total_score[f];
2005 font_id2 = f;
2006 }
2007 }
     // An id of -1 (no font selected) maps to a null fontinfo pointer.
2008 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2009 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2010 // Each score has a limit of UINT16_MAX, so divide by that to get the number
2011 // of "votes" for that font, ie number of perfect scores.
     // The first count is clipped to at least 1, the second to at least 0.
2012 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2013 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2014 if (score1 > 0) {
     // fi is declared by value, so it is a copy of the table entry.
2015 const FontInfo fi = fontinfo_table_.get(font_id1);
     // NOTE(review): line 2016 is missing from this listing; the "}" at 2026
     // closes a block whose opener is not visible (presumably a debug guard
     // around the tprintf calls below -- confirm against the real file).
2017 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2018 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2019 fi.name, word->fontinfo_id_count,
2020 fontinfo_table_.get(font_id2).name,
2021 word->fontinfo_id2_count);
2022 } else {
2023 tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2024 fi.name, word->fontinfo_id_count);
2025 }
2026 }
2027 }
2028#endif // ndef DISABLED_LEGACY_ENGINE
2029}
2030
2031#ifndef DISABLED_LEGACY_ENGINE
// Body of Tesseract::font_recognition_pass(PAGE_RES *page_res).
// NOTE(review): the signature line (2037) is absent from this listing; the
// name and parameter come from the cross-reference index later in this file.
//
// Three sweeps over every word of the page:
//   1. accumulate each word's fontinfo / fontinfo2 counts into a histogram
//      keyed by FontInfo::universal_id;
//   2. after find_modal_font() picks the modal id, locate a FontInfo pointer
//      carrying that id;
//   3. overwrite the font of every word whose own font count is too low
//      relative to its length.
// Returns without changing any word when the modal count is 0.
2038 PAGE_RES_IT page_res_it(page_res);
2039 WERD_RES *word; // current word
2040 STATS doc_fonts(0, font_table_size_); // font counters
2041
2042 // Gather font id statistics.
2043 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2044 page_res_it.forward()) {
2045 word = page_res_it.word();
2046 if (word->fontinfo != nullptr) {
2047 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2048 }
2049 if (word->fontinfo2 != nullptr) {
2050 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2051 }
2052 }
2053 int16_t doc_font; // modal font
2054 int8_t doc_font_count; // modal font count
2055 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
     // A count of 0 means no modal font was reported, so nothing is reassigned.
2056 if (doc_font_count == 0)
2057 return;
2058 // Get the modal font pointer.
     // The histogram only holds ids, so take the pointer from the first word
     // whose first or second font carries the modal id.
2059 const FontInfo* modal_font = nullptr;
2060 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2061 page_res_it.forward()) {
2062 word = page_res_it.word();
2063 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2064 modal_font = word->fontinfo;
2065 break;
2066 }
2067 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2068 modal_font = word->fontinfo2;
2069 break;
2070 }
2071 }
2072 ASSERT_HOST(modal_font != nullptr);
2073
2074 // Assign modal font to weak words.
2075 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2076 page_res_it.forward()) {
2077 word = page_res_it.word();
     // NOTE(review): best_choice is dereferenced without a null check here;
     // presumably every word has one by the time this pass runs -- confirm.
2078 const int length = word->best_choice->length();
2079
2080 const int count = word->fontinfo_id_count;
     // A word keeps its own font only when its count equals its length, or
     // when it is longer than 3 and the count reaches length * 3 / 4
     // (integer arithmetic).
2081 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2082 word->fontinfo = modal_font;
2083 // Counts only get 1 as it came from the doc.
2084 word->fontinfo_id_count = 1;
2085 }
2086 }
2087}
2088#endif // ndef DISABLED_LEGACY_ENGINE
2089
2090// If a word has multiple alternates check if the best choice is in the
2091// dictionary. If not, replace it with an alternate that exists in the
2092// dictionary.
// NOTE(review): the signature line (2093) is absent from this listing; per
// the cross-reference index later in this file, this is the body of
// Tesseract::dictionary_correction_pass(PAGE_RES *page_res).
2094 PAGE_RES_IT word_it(page_res);
2095 for (WERD_RES* word = word_it.word(); word != nullptr;
2096 word = word_it.forward()) {
2097 if (word->best_choices.singleton())
2098 continue; // There are no alternates.
2099
     // Validity is checked with the dictionary of the Tesseract instance
     // attached to the word (word->tesseract), not with this object's own.
2100 const WERD_CHOICE* best = word->best_choice;
2101 if (word->tesseract->getDict().valid_word(*best) != 0)
2102 continue; // The best choice is in the dictionary.
2103
     // Walk best_choices in list order; the first entry that passes
     // valid_word() replaces the best choice and ends the search for this word.
2104 WERD_CHOICE_IT choice_it(&word->best_choices);
2105 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2106 choice_it.forward()) {
2107 WERD_CHOICE* alternate = choice_it.data();
2108 if (word->tesseract->getDict().valid_word(*alternate)) {
2109 // The alternate choice is in the dictionary.
     // NOTE(review): line 2110 is missing from this listing; the "}" at 2114
     // closes a block whose opener is not visible (presumably a debug guard
     // around this tprintf -- confirm against the real file).
2111 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2112 best->unichar_string().string(),
2113 alternate->unichar_string().string());
2114 }
2115 // Replace the 'best' choice with a better choice.
2116 word->ReplaceBestChoice(alternate);
2117 break;
2118 }
2119 }
2120 }
2121}
2122
2123} // namespace tesseract
const char *const kBackUpConfigFile
Definition: control.cpp:48
const double kMinRefitXHeightFraction
Definition: control.cpp:51
ACCEPTABLE_WERD_TYPE
Definition: control.h:29
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:35
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:33
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:34
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:32
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:31
FILE * debug_fp
Definition: tessvars.cpp:24
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
IncorrectResultReason
Definition: blamer.h:51
@ IRR_NUM_REASONS
Definition: blamer.h:98
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:809
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ W_EOL
end of line
Definition: werd.h:33
@ W_REP_CHAR
repeated character
Definition: werd.h:38
@ W_BOL
start of line
Definition: werd.h:32
#define LOC_DOC_BLK_REJ
Definition: errcode.h:52
#define SUBLOC_NORM
Definition: errcode.h:58
#define LOC_MM_ADAPT
Definition: errcode.h:51
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:30
#define LOC_FUZZY_SPACE
Definition: errcode.h:49
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define LOC_WRITE_RESULTS
Definition: errcode.h:53
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
int count(LIST var_list)
Definition: oldlist.cpp:95
ScrollView * fx_win
Definition: drawfx.cpp:40
void clear_fx_win()
Definition: drawfx.cpp:62
void create_fx_win()
Definition: drawfx.cpp:49
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:271
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:269
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:37
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void init_to_size(int size, const T &t)
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
T & back() const
int length() const
Definition: genericvector.h:86
PointerVector< WERD_RES > lang_words
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:229
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1319
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1630
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:467
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:154
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1706
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:612
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1140
bool tessedit_enable_bigram_correction
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
Pix * BestPix() const
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:36
Dict & getDict() override
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1519
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:138
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1401
bool SubAndSuperscriptFix(WERD_RES *word_res)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1270
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:904
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2093
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2037
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:945
bool AnyTessLang() const
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1011
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1485
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1962
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:75
bool AnyLSTMLang() const
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:734
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:213
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:38
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:710
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1572
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1064
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:302
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1226
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1462
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:560
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:587
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:203
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:64
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:120
const STRING & misadaption_debug() const
Definition: blamer.h:133
TBOX bounding_box() const
Definition: blobs.cpp:861
void plot(ScrollView *window)
Definition: blobs.cpp:897
int length() const
Definition: boxword.h:83
static const double kXHeightCapRatio
Definition: ccstruct.h:37
int32_t universal_id
Definition: fontinfo.h:123
Definition: ocrblock.h:31
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
FCOORD classify_rotation() const
Definition: ocrblock.h:140
int32_t x_height() const
return xheight
Definition: ocrblock.h:106
Definition: ocrrow.h:37
float x_height() const
Definition: ocrrow.h:64
GenericVector< int > blame_reasons
Definition: pageres.h:86
int32_t rej_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:78
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
BLOCK * block
Definition: pageres.h:116
ROW * row
Definition: pageres.h:140
const UNICHARSET * uch_set
Definition: pageres.h:203
bool tess_would_adapt
Definition: pageres.h:304
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
bool guessed_x_ht
Definition: pageres.h:313
BlamerBundle * blamer_bundle
Definition: pageres.h:252
bool done
Definition: pageres.h:305
tesseract::Tesseract * tesseract
Definition: pageres.h:280
int8_t fontinfo_id2_count
Definition: pageres.h:312
const FontInfo * fontinfo
Definition: pageres.h:309
tesseract::BoxWord * box_word
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:795
const FontInfo * fontinfo2
Definition: pageres.h:310
float x_height
Definition: pageres.h:316
bool part_of_combo
Definition: pageres.h:340
void PrintBestChoices() const
Definition: pageres.cpp:717
bool IsAmbiguous()
Definition: pageres.cpp:452
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:765
void BestChoiceToCorrectText()
Definition: pageres.cpp:923
bool tess_failed
Definition: pageres.h:295
void SetScriptPositions()
Definition: pageres.cpp:858
bool tess_accepted
Definition: pageres.h:303
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:759
WERD_CHOICE * raw_choice
Definition: pageres.h:246
void fix_hyphens()
Definition: pageres.cpp:1047
bool small_caps
Definition: pageres.h:306
int8_t fontinfo_id_count
Definition: pageres.h:311
TWERD * chopped_word
Definition: pageres.h:212
void fix_quotes()
Definition: pageres.cpp:1018
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
REJMAP reject_map
Definition: pageres.h:294
float caps_height
Definition: pageres.h:317
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:277
WERD * word
Definition: pageres.h:186
float baseline_shift
Definition: pageres.h:318
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:750
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1473
void rej_stat_word()
Definition: pageres.cpp:1667
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1333
void ResetWordIterator()
Definition: pageres.cpp:1523
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * forward()
Definition: pageres.h:734
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1213
void DeleteCurrentWord()
Definition: pageres.cpp:1440
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
Definition: points.h:189
float y() const
Definition: points.h:210
bool IsText() const
Definition: polyblk.h:49
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:93
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:418
const STRING debug_string() const
Definition: ratngs.h:495
const STRING & unichar_string() const
Definition: ratngs.h:531
bool IsAllSpaces() const
Definition: ratngs.h:511
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
float min_x_height() const
Definition: ratngs.h:326
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
float max_x_height() const
Definition: ratngs.h:329
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:401
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
bool x_overlap(const TBOX &box) const
Definition: rect.h:401
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
int16_t right() const
Definition: rect.h:79
void print(FILE *fp)
Definition: rejctmap.cpp:321
int16_t reject_count()
Definition: rejctmap.h:229
void full_print(FILE *fp)
Definition: rejctmap.cpp:333
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void rej_word_bad_quality()
Definition: rejctmap.cpp:415
int32_t length() const
Definition: rejctmap.h:223
Definition: statistc.h:31
int32_t pile_count(int32_t value) const
Definition: statistc.h:76
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
int32_t get_total() const
Definition: statistc.h:84
int32_t mode() const
Definition: statistc.cpp:107
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
TBOX bounding_box() const
Definition: stepblob.cpp:253
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
Definition: werd.h:56
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:125
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:524
void print()
Definition: werd.cpp:253
TBOX bounding_box() const
Definition: werd.cpp:148
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:506
ParamsVectors * params()
Definition: ccutil.h:67
UNICHARSET unicharset
Definition: ccutil.h:73
STRING lang
Definition: ccutil.h:71
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:115
bool deadline_exceeded() const
Definition: ocrclass.h:138
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:168
void Add(T value, int count)
Definition: sorthelper.h:65
int MaxCount(T *max_value) const
Definition: sorthelper.h:80
Definition: strngs.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
bool top_bottom_useful() const
Definition: unicharset.h:537
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
bool script_has_xheight() const
Definition: unicharset.h:904
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
double classify_max_rating_ratio
Definition: classify.h:438
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
bool classify_bln_numeric_mode
Definition: classify.h:508
double classify_max_certainty_margin
Definition: classify.h:440
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
const UNICHARSET & GetUnicharset() const
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:318
static void Update()
Definition: scrollview.cpp:709
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476
bool wordrec_run_blamer
Definition: wordrec.h:232
bool wordrec_debug_blamer
Definition: wordrec.h:231