tesseract 4.1.1
Loading...
Searching...
No Matches
pageres.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: pageres.cpp (Formerly page_res.c)
3 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4 * and an iterator class to iterate over the words.
5 * Main purposes:
6 * Easy way to iterate over the words without a 3-nested loop.
7 * Holds data used during word recognition.
8 * Holds information about alternative spacing paths.
9 * Author: Phil Cheatle
10 *
11 * (C) Copyright 1992, Hewlett-Packard Ltd.
12 ** Licensed under the Apache License, Version 2.0 (the "License");
13 ** you may not use this file except in compliance with the License.
14 ** You may obtain a copy of the License at
15 ** http://www.apache.org/licenses/LICENSE-2.0
16 ** Unless required by applicable law or agreed to in writing, software
17 ** distributed under the License is distributed on an "AS IS" BASIS,
18 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 ** See the License for the specific language governing permissions and
20 ** limitations under the License.
21 *
22 **********************************************************************/
23
24#include "pageres.h"
25#include <cassert> // for assert
26#include <cstdint> // for INT32_MAX
27#include <cstring> // for strlen
28#include "blamer.h" // for BlamerBundle
29#include "blobs.h" // for TWERD, TBLOB
30#include "boxword.h" // for BoxWord
31#include "errcode.h" // for ASSERT_HOST
32#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
33#include "ocrrow.h" // for ROW, ROW_IT
34#include "pdblock.h" // for PDBLK
35#include "polyblk.h" // for POLY_BLOCK
36#include "publictypes.h" // for OcrEngineMode, OEM_LSTM_ONLY
37#include "seam.h" // for SEAM, start_seam_list
38#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
39#include "tesscallback.h" // for NewPermanentTessCallback, TessResultCallback2
40#include "tprintf.h" // for tprintf
41
42struct Pix;
43
46
47// Gain factor for computing thresholds that determine the ambiguity of a word.
48static const double kStopperAmbiguityThresholdGain = 8.0;
49// Constant offset for computing thresholds that determine the ambiguity of a
50// word.
51static const double kStopperAmbiguityThresholdOffset = 1.5;
52// Max number of broken pieces to associate.
54// Max ratio of word box height to line size to allow it to be processed as
55// a line with other words.
56const double kMaxWordSizeRatio = 1.25;
57// Max ratio of line box height to line size to allow a new word to be added.
58const double kMaxLineSizeRatio = 1.25;
59// Max ratio of word gap to line size to allow a new word to be added.
60const double kMaxWordGapRatio = 2.0;
61
62// Computes and returns a threshold of certainty difference used to determine
63// which words to keep, based on the adjustment factors of the two words.
64// TODO(rays) This is horrible. Replace with an enhance params training model.
65static double StopperAmbigThreshold(double f1, double f2) {
66 return (f2 - f1) * kStopperAmbiguityThresholdGain -
67 kStopperAmbiguityThresholdOffset;
68}
69
70/*************************************************************************
71 * PAGE_RES::PAGE_RES
72 *
73 * Constructor for page results
74 *************************************************************************/
76 bool merge_similar_words,
77 BLOCK_LIST *the_block_list,
78 WERD_CHOICE **prev_word_best_choice_ptr) {
79 Init();
80 BLOCK_IT block_it(the_block_list);
81 BLOCK_RES_IT block_res_it(&block_res_list);
82 for (block_it.mark_cycle_pt();
83 !block_it.cycled_list(); block_it.forward()) {
84 block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
85 block_it.data()));
86 }
87 prev_word_best_choice = prev_word_best_choice_ptr;
88}
89
90/*************************************************************************
91 * BLOCK_RES::BLOCK_RES
92 *
93 * Constructor for BLOCK results
94 *************************************************************************/
95
96BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
97 ROW_IT row_it (the_block->row_list ());
98 ROW_RES_IT row_res_it(&row_res_list);
99
100 char_count = 0;
101 rej_count = 0;
102 font_class = -1; //not assigned
103 x_height = -1.0;
104 font_assigned = false;
105 row_count = 0;
106
107 block = the_block;
108
109 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
110 row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
111 }
112}
113
114/*************************************************************************
115 * ROW_RES::ROW_RES
116 *
117 * Constructor for ROW results
118 *************************************************************************/
119
120ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
121 WERD_IT word_it(the_row->word_list());
122 WERD_RES_IT word_res_it(&word_res_list);
123 WERD_RES *combo = nullptr; // current combination of fuzzies
124 WERD *copy_word;
125
126 char_count = 0;
127 rej_count = 0;
129
130 row = the_row;
131 bool add_next_word = false;
132 TBOX union_box;
133 float line_height = the_row->x_height() + the_row->ascenders() -
134 the_row->descenders();
135 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
136 auto* word_res = new WERD_RES(word_it.data());
137 word_res->x_height = the_row->x_height();
138 if (add_next_word) {
139 ASSERT_HOST(combo != nullptr);
140 // We are adding this word to the combination.
141 word_res->part_of_combo = true;
142 combo->copy_on(word_res);
143 } else if (merge_similar_words) {
144 union_box = word_res->word->bounding_box();
145 add_next_word = !word_res->word->flag(W_REP_CHAR) &&
146 union_box.height() <= line_height * kMaxWordSizeRatio;
147 word_res->odd_size = !add_next_word;
148 }
149 WERD* next_word = word_it.data_relative(1);
150 if (merge_similar_words) {
151 if (add_next_word && !next_word->flag(W_REP_CHAR)) {
152 // Next word will be added on if all of the following are true:
153 // Not a rep char.
154 // Box height small enough.
155 // Union box height small enough.
156 // Horizontal gap small enough.
157 TBOX next_box = next_word->bounding_box();
158 int prev_right = union_box.right();
159 union_box += next_box;
160 if (next_box.height() > line_height * kMaxWordSizeRatio ||
161 union_box.height() > line_height * kMaxLineSizeRatio ||
162 next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
163 add_next_word = false;
164 }
165 }
166 next_word->set_flag(W_FUZZY_NON, add_next_word);
167 } else {
168 add_next_word = next_word->flag(W_FUZZY_NON);
169 }
170 if (add_next_word) {
171 if (combo == nullptr) {
172 copy_word = new WERD;
173 *copy_word = *(word_it.data()); // deep copy
174 combo = new WERD_RES(copy_word);
175 combo->x_height = the_row->x_height();
176 combo->combination = true;
177 word_res_it.add_to_end(combo);
178 }
179 word_res->part_of_combo = true;
180 } else {
181 combo = nullptr;
182 }
183 word_res_it.add_to_end(word_res);
184 }
185}
186
187
189 this->ELIST_LINK::operator=(source);
190 Clear();
191 if (source.combination) {
192 word = new WERD;
193 *word = *(source.word); // deep copy
194 } else {
195 word = source.word; // pt to same word
196 }
197 if (source.bln_boxes != nullptr)
199 if (source.chopped_word != nullptr)
200 chopped_word = new TWERD(*source.chopped_word);
201 if (source.rebuild_word != nullptr)
202 rebuild_word = new TWERD(*source.rebuild_word);
203 // TODO(rays) Do we ever need to copy the seam_array?
204 blob_row = source.blob_row;
205 denorm = source.denorm;
206 if (source.box_word != nullptr)
207 box_word = new tesseract::BoxWord(*source.box_word);
208 best_state = source.best_state;
209 correct_text = source.correct_text;
210 blob_widths = source.blob_widths;
211 blob_gaps = source.blob_gaps;
212 // None of the uses of operator= require the ratings matrix to be copied,
213 // so don't as it would be really slow.
214
215 // Copy the cooked choices.
216 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
217 WERD_CHOICE_IT wc_dest_it(&best_choices);
218 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
219 const WERD_CHOICE *choice = wc_it.data();
220 wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
221 }
222 if (!wc_dest_it.empty()) {
223 wc_dest_it.move_to_first();
224 best_choice = wc_dest_it.data();
225 } else {
226 best_choice = nullptr;
227 }
228
229 if (source.raw_choice != nullptr) {
230 raw_choice = new WERD_CHOICE(*source.raw_choice);
231 } else {
232 raw_choice = nullptr;
233 }
234 if (source.ep_choice != nullptr) {
235 ep_choice = new WERD_CHOICE(*source.ep_choice);
236 } else {
237 ep_choice = nullptr;
238 }
239 reject_map = source.reject_map;
240 combination = source.combination;
242 CopySimpleFields(source);
243 if (source.blamer_bundle != nullptr) {
245 }
246 return *this;
247}
248
249// Copies basic fields that don't involve pointers that might be useful
250// to copy when making one WERD_RES from another.
252 tess_failed = source.tess_failed;
255 done = source.done;
257 small_caps = source.small_caps;
258 odd_size = source.odd_size;
259 fontinfo = source.fontinfo;
260 fontinfo2 = source.fontinfo2;
263 x_height = source.x_height;
264 caps_height = source.caps_height;
266 guessed_x_ht = source.guessed_x_ht;
269 uch_set = source.uch_set;
270 tesseract = source.tesseract;
271}
272
273// Initializes a blank (default constructed) WERD_RES from one that has
274// already been recognized.
275// Use SetupFor*Recognition afterwards to complete the setup and make
276// it ready for a retry recognition.
278 word = source.word;
279 CopySimpleFields(source);
280 if (source.blamer_bundle != nullptr) {
283 }
284}
285
286// Sets up the members used in recognition: bln_boxes, chopped_word,
287// seam_array, denorm. Returns false if
288// the word is empty and sets up fake results. If use_body_size is
289// true and row->body_size is set, then body_size will be used for
290// blob normalization instead of xheight + ascrise. This flag is for
291// those languages that are using CJK pitch model and thus it has to
292// be true if and only if tesseract->textord_use_cjk_fp_model is
293// true.
294// If allow_detailed_fx is true, the feature extractor will receive fine
295// precision outline information, allowing smoother features and better
296// features on low resolution images.
297// The norm_mode_hint sets the default mode for normalization in absence
298// of any of the above flags.
299// norm_box is used to override the word bounding box to determine the
300// normalization scale and offset.
301// Returns false if the word is empty and sets up fake results.
303 tesseract::Tesseract* tess, Pix* pix,
304 int norm_mode,
305 const TBOX* norm_box,
306 bool numeric_mode,
307 bool use_body_size,
308 bool allow_detailed_fx,
309 ROW *row, const BLOCK* block) {
310 auto norm_mode_hint =
311 static_cast<tesseract::OcrEngineMode>(norm_mode);
312 tesseract = tess;
313 POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
314 if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
315 word->cblob_list()->empty()) ||
316 (pb != nullptr && !pb->IsText())) {
317 // Empty words occur when all the blobs have been moved to the rej_blobs
318 // list, which seems to occur frequently in junk.
319 SetupFake(unicharset_in);
320 word->set_flag(W_REP_CHAR, false);
321 return false;
322 }
323 ClearResults();
324 SetupWordScript(unicharset_in);
325 chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
326 float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
327 ? row->body_size() : x_height;
328 chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
329 word_xheight, baseline_shift, numeric_mode,
330 norm_mode_hint, norm_box, &denorm);
331 blob_row = row;
332 SetupBasicsFromChoppedWord(unicharset_in);
334 int num_blobs = chopped_word->NumBlobs();
335 ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
336 tess_failed = false;
337 return true;
338}
339
340// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
341// accumulators from a made chopped word. We presume the fields are already
342// empty.
348}
349
350// Sets up the members used in recognition for an empty recognition result:
351// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
352void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
353 ClearResults();
354 SetupWordScript(unicharset_in);
355 chopped_word = new TWERD;
356 rebuild_word = new TWERD;
359 int blob_count = word->cblob_list()->length();
360 if (blob_count > 0) {
361 auto** fake_choices = new BLOB_CHOICE*[blob_count];
362 // For non-text blocks, just pass any blobs through to the box_word
363 // and call the word failed with a fake classification.
364 C_BLOB_IT b_it(word->cblob_list());
365 int blob_id = 0;
366 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
367 TBOX box = b_it.data()->bounding_box();
369 fake_choices[blob_id++] = new BLOB_CHOICE;
370 }
371 FakeClassifyWord(blob_count, fake_choices);
372 delete [] fake_choices;
373 } else {
374 auto* word = new WERD_CHOICE(&unicharset_in);
375 word->make_bad();
377 // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
378 LogNewCookedChoice(1, false, word);
379 }
380 tess_failed = true;
381 done = true;
382}
383
385 uch_set = &uch;
386 int script = uch.default_sid();
387 word->set_script_id(script);
389 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
390}
391
392// Sets up the blamer_bundle if it is not null, using the initialized denorm.
394 if (blamer_bundle != nullptr) {
396 }
397}
398
399// Computes the blob_widths and blob_gaps from the chopped_word.
403 int num_blobs = chopped_word->NumBlobs();
404 for (int b = 0; b < num_blobs; ++b) {
405 TBLOB *blob = chopped_word->blobs[b];
406 TBOX box = blob->bounding_box();
408 if (b + 1 < num_blobs) {
410 chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
411 }
412 }
413}
414
415// Updates internal data to account for a new SEAM (chop) at the given
416// blob_number. Fixes the ratings matrix and states in the choices, as well
417// as the blob widths and gaps.
418void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
419 // Insert the seam into the SEAMS array.
420 seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
421 seam_array.insert(seam, blob_number);
422 if (ratings != nullptr) {
423 // Expand the ratings matrix.
424 ratings = ratings->ConsumeAndMakeBigger(blob_number);
425 // Fix all the segmentation states.
426 if (raw_choice != nullptr)
427 raw_choice->UpdateStateForSplit(blob_number);
428 WERD_CHOICE_IT wc_it(&best_choices);
429 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
430 WERD_CHOICE* choice = wc_it.data();
431 choice->UpdateStateForSplit(blob_number);
432 }
434 }
435}
436
437// Returns true if all the word choices except the first have adjust_factors
438// worse than the given threshold.
440 // The choices are not changed by this iteration.
441 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
442 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
443 WERD_CHOICE* choice = wc_it.data();
444 if (choice->adjust_factor() <= threshold)
445 return false;
446 }
447 return true;
448}
449
450// Returns true if the current word is ambiguous (by number of answers or
451// by dangerous ambigs.)
453 return !best_choices.singleton() || best_choice->dangerous_ambig_found();
454}
455
456// Returns true if the ratings matrix size matches the sum of each of the
457// segmentation states.
459 int ratings_dim = ratings->dimension();
460 if (raw_choice->TotalOfStates() != ratings_dim) {
461 tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
462 raw_choice->TotalOfStates(), ratings_dim);
463 return false;
464 }
465 WERD_CHOICE_IT it(&best_choices);
466 int index = 0;
467 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
468 WERD_CHOICE* choice = it.data();
469 if (choice->TotalOfStates() != ratings_dim) {
470 tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
471 index, choice->TotalOfStates(), ratings_dim);
472 return false;
473 }
474 }
475 return true;
476}
477
478// Prints a list of words found if debug is true or the word result matches
479// the word_to_debug.
480void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
481 if (debug ||
482 (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
483 best_choice->unichar_string() == STRING(word_to_debug))) {
484 if (raw_choice != nullptr)
485 raw_choice->print("\nBest Raw Choice");
486
487 WERD_CHOICE_IT it(&best_choices);
488 int index = 0;
489 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
490 WERD_CHOICE* choice = it.data();
491 STRING label;
492 label.add_str_int("\nCooked Choice #", index);
493 choice->print(label.string());
494 }
495 }
496}
497
498// Prints the top choice along with the accepted/done flags.
499void WERD_RES::DebugTopChoice(const char* msg) const {
500 tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
502 if (best_choice == nullptr)
503 tprintf("<Null choice>\n");
504 else
505 best_choice->print(msg);
506}
507
508// Removes from best_choices all choices which are not within a reasonable
509// range of the best choice.
510// TODO(rays) incorporate the information used here into the params training
511// re-ranker, in place of this heuristic that is based on the previous
512// adjustment factor.
513void WERD_RES::FilterWordChoices(int debug_level) {
514 if (best_choice == nullptr || best_choices.singleton())
515 return;
516
517 if (debug_level >= 2)
518 best_choice->print("\nFiltering against best choice");
519 WERD_CHOICE_IT it(&best_choices);
520 int index = 0;
521 for (it.forward(); !it.at_first(); it.forward(), ++index) {
522 WERD_CHOICE* choice = it.data();
523 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
524 choice->adjust_factor());
525 // i, j index the blob choice in choice, best_choice.
526 // chunk is an index into the chopped_word blobs (AKA chunks).
527 // Since the two words may use different segmentations of the chunks, we
528 // iterate over the chunks to find out whether a comparable blob
529 // classification is much worse than the best result.
530 int i = 0, j = 0, chunk = 0;
531 // Each iteration of the while deals with 1 chunk. On entry choice_chunk
532 // and best_chunk are the indices of the first chunk in the NEXT blob,
533 // i.e. we don't have to increment i, j while chunk < choice_chunk and
534 // best_chunk respectively.
535 int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
536 while (i < choice->length() && j < best_choice->length()) {
537 if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
538 choice->certainty(i) - best_choice->certainty(j) < threshold) {
539 if (debug_level >= 2) {
540 choice->print("WorstCertaintyDiffWorseThan");
541 tprintf(
542 "i %d j %d Choice->Blob[i].Certainty %.4g"
543 " WorstOtherChoiceCertainty %g Threshold %g\n",
544 i, j, choice->certainty(i), best_choice->certainty(j), threshold);
545 tprintf("Discarding bad choice #%d\n", index);
546 }
547 delete it.extract();
548 break;
549 }
550 ++chunk;
551 // If needed, advance choice_chunk to keep up with chunk.
552 while (choice_chunk < chunk && ++i < choice->length())
553 choice_chunk += choice->state(i);
554 // If needed, advance best_chunk to keep up with chunk.
555 while (best_chunk < chunk && ++j < best_choice->length())
556 best_chunk += best_choice->state(j);
557 }
558 }
559}
560
561void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
562 float min_rating,
563 float max_rating,
564 float rating_margin,
565 float* thresholds) {
566 int chunk = 0;
567 int end_chunk = best_choice->state(0);
568 int end_raw_chunk = raw_choice->state(0);
569 int raw_blob = 0;
570 for (int i = 0; i < best_choice->length(); i++, thresholds++) {
571 float avg_rating = 0.0f;
572 int num_error_chunks = 0;
573
574 // For each chunk in best choice blob i, count non-matching raw results.
575 while (chunk < end_chunk) {
576 if (chunk >= end_raw_chunk) {
577 ++raw_blob;
578 end_raw_chunk += raw_choice->state(raw_blob);
579 }
580 if (best_choice->unichar_id(i) !=
581 raw_choice->unichar_id(raw_blob)) {
582 avg_rating += raw_choice->certainty(raw_blob);
583 ++num_error_chunks;
584 }
585 ++chunk;
586 }
587
588 if (num_error_chunks > 0) {
589 avg_rating /= num_error_chunks;
590 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
591 } else {
592 *thresholds = max_rating;
593 }
594
595 if (*thresholds > max_rating)
596 *thresholds = max_rating;
597 if (*thresholds < min_rating)
598 *thresholds = min_rating;
599 }
600}
601
602// Saves a copy of the word_choice if it has the best unadjusted rating.
603// Returns true if the word_choice was the new best.
605 if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
606 delete raw_choice;
607 raw_choice = new WERD_CHOICE(*word_choice);
609 return true;
610 }
611 return false;
612}
613
614// Consumes word_choice by adding it to best_choices, (taking ownership) if
615// the certainty for word_choice is within some distance of the best choice in
616// best_choices, or by deleting the word_choice and returning false.
617// The best_choices list is kept in sorted order by rating. Duplicates are
618// removed, and the list is kept no longer than max_num_choices in length.
619// Returns true if the word_choice is still a valid pointer.
620bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
621 WERD_CHOICE* word_choice) {
622 if (best_choice != nullptr) {
623 // Throw out obviously bad choices to save some work.
624 // TODO(rays) Get rid of this! This piece of code produces different
625 // results according to the order in which words are found, which is an
626 // undesirable behavior. It would be better to keep all the choices and
627 // prune them later when more information is available.
628 float max_certainty_delta =
629 StopperAmbigThreshold(best_choice->adjust_factor(),
630 word_choice->adjust_factor());
631 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
632 max_certainty_delta = -kStopperAmbiguityThresholdOffset;
633 if (word_choice->certainty() - best_choice->certainty() <
634 max_certainty_delta) {
635 if (debug) {
636 STRING bad_string;
637 word_choice->string_and_lengths(&bad_string, nullptr);
638 tprintf("Discarding choice \"%s\" with an overly low certainty"
639 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
640 bad_string.string(), word_choice->certainty(),
642 max_certainty_delta + best_choice->certainty());
643 }
644 delete word_choice;
645 return false;
646 }
647 }
648
649 // Insert in the list in order of increasing rating, but knock out worse
650 // string duplicates.
651 WERD_CHOICE_IT it(&best_choices);
652 const STRING& new_str = word_choice->unichar_string();
653 bool inserted = false;
654 int num_choices = 0;
655 if (!it.empty()) {
656 do {
657 WERD_CHOICE* choice = it.data();
658 if (choice->rating() > word_choice->rating() && !inserted) {
659 // Time to insert.
660 it.add_before_stay_put(word_choice);
661 inserted = true;
662 if (num_choices == 0)
663 best_choice = word_choice; // This is the new best.
664 ++num_choices;
665 }
666 if (choice->unichar_string() == new_str) {
667 if (inserted) {
668 // New is better.
669 delete it.extract();
670 } else {
671 // Old is better.
672 if (debug) {
673 tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
674 new_str.string(), word_choice->rating(), choice->rating());
675 }
676 delete word_choice;
677 return false;
678 }
679 } else {
680 ++num_choices;
681 if (num_choices > max_num_choices)
682 delete it.extract();
683 }
684 it.forward();
685 } while (!it.at_first());
686 }
687 if (!inserted && num_choices < max_num_choices) {
688 it.add_to_end(word_choice);
689 inserted = true;
690 if (num_choices == 0)
691 best_choice = word_choice; // This is the new best.
692 }
693 if (debug) {
694 if (inserted)
695 tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
696 else
697 tprintf("Poor");
698 word_choice->print(" Word Choice");
699 }
700 if (!inserted) {
701 delete word_choice;
702 return false;
703 }
704 return true;
705}
706
707
// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
// dest and src must both be non-null addresses of pointer slots.
// Aliasing is tolerated: if dest and src are the same slot nothing happens,
// and if two distinct slots hold the same object it is kept (not deleted)
// and ends up owned by dest only.
template<class T> static void MovePointerData(T** dest, T** src) {
  // Moving a slot onto itself must not destroy the object it holds.
  if (dest == src) return;
  // Only delete the old dest object when it is not the one being moved in;
  // otherwise dest would be left pointing at freed memory.
  if (*dest != *src) delete *dest;
  *dest = *src;
  *src = nullptr;
}
715
716// Prints a brief list of all the best choices.
718 STRING alternates_str;
719 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
720 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
721 if (!it.at_first()) alternates_str += "\", \"";
722 alternates_str += it.data()->unichar_string();
723 }
724 tprintf("Alternates for \"%s\": {\"%s\"}\n",
725 best_choice->unichar_string().string(), alternates_str.string());
726}
727
728// Returns the sum of the widths of the blob between start_blob and last_blob
729// inclusive.
730int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
731 int result = 0;
732 for (int b = start_blob; b <= last_blob; ++b) {
733 result += blob_widths[b];
734 if (b < last_blob)
735 result += blob_gaps[b];
736 }
737 return result;
738}
739// Returns the width of a gap between the specified blob and the next one.
740int WERD_RES::GetBlobsGap(int blob_index) {
741 if (blob_index < 0 || blob_index >= blob_gaps.size())
742 return 0;
743 return blob_gaps[blob_index];
744}
745
746// Returns the BLOB_CHOICE corresponding to the given index in the
747// best choice word taken from the appropriate cell in the ratings MATRIX.
748// Borrowed pointer, so do not delete. May return nullptr if there is no
749// BLOB_CHOICE matching the unichar_id at the given index.
751 if (index < 0 || index >= best_choice->length()) return nullptr;
752 BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
753 return FindMatchingChoice(best_choice->unichar_id(index), choices);
754}
755
756// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
757// best choice word taken from the appropriate cell in the ratings MATRIX.
758// Borrowed pointer, so do not delete.
759BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
760 return best_choice->blob_choices(index, ratings);
761}
762
763// Moves the results fields from word to this. This takes ownership of all
764// the data, so word can be destructed.
766 denorm = word->denorm;
767 blob_row = word->blob_row;
768 MovePointerData(&chopped_word, &word->chopped_word);
769 MovePointerData(&rebuild_word, &word->rebuild_word);
770 MovePointerData(&box_word, &word->box_word);
772 seam_array = word->seam_array;
773 word->seam_array.clear();
774 best_state.move(&word->best_state);
775 correct_text.move(&word->correct_text);
776 blob_widths.move(&word->blob_widths);
777 blob_gaps.move(&word->blob_gaps);
778 if (ratings != nullptr) ratings->delete_matrix_pointers();
779 MovePointerData(&ratings, &word->ratings);
780 best_choice = word->best_choice;
781 MovePointerData(&raw_choice, &word->raw_choice);
782 best_choices.clear();
783 WERD_CHOICE_IT wc_it(&best_choices);
784 wc_it.add_list_after(&word->best_choices);
785 reject_map = word->reject_map;
786 if (word->blamer_bundle != nullptr) {
787 assert(blamer_bundle != nullptr);
788 blamer_bundle->CopyResults(*(word->blamer_bundle));
789 }
791}
792
793// Replace the best choice and rebuild box word.
794// choice must be from the current best_choices list.
796 best_choice = choice;
798 SetupBoxWord();
799 // Make up a fake reject map of the right length to keep the
800 // rejection pass happy.
804}
805
806// Builds the rebuild_word and sets the best_state from the chopped_word and
807// the best_choice->state.
809 ASSERT_HOST(best_choice != nullptr);
810 delete rebuild_word;
811 rebuild_word = new TWERD;
812 if (seam_array.empty())
815 int start = 0;
816 for (int i = 0; i < best_choice->length(); ++i) {
817 int length = best_choice->state(i);
818 best_state.push_back(length);
819 if (length > 1) {
821 start + length - 1);
822 }
823 TBLOB* blob = chopped_word->blobs[start];
824 rebuild_word->blobs.push_back(new TBLOB(*blob));
825 if (length > 1) {
827 start + length - 1);
828 }
829 start += length;
830 }
831}
832
833// Copies the chopped_word to the rebuild_word, faking a best_state as well.
834// Also sets up the output box_word.
836 delete rebuild_word;
838 SetupBoxWord();
839 int word_len = box_word->length();
840 best_state.reserve(word_len);
841 correct_text.reserve(word_len);
842 for (int i = 0; i < word_len; ++i) {
845 }
846}
847
848// Sets/replaces the box_word with one made from the rebuild_word.
850 delete box_word;
854}
855
856// Sets up the script positions in the output best_choice using the best_choice
857// to get the unichars, and the unicharset to get the target positions.
860}
861// Sets all the blobs in all the words (raw choice and best choices) to be
862// the given position. (When a sub/superscript is recognized as a separate
863// word, it falls victim to the rule that a whole word cannot be sub or
864// superscript, so this function overrides that problem.)
867 WERD_CHOICE_IT wc_it(&best_choices);
868 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
869 wc_it.data()->SetAllScriptPositions(position);
870}
871
872// Classifies the word with some already-calculated BLOB_CHOICEs.
873// The choices are an array of blob_count pointers to BLOB_CHOICE,
874// providing a single classifier result for each blob.
875// The BLOB_CHOICEs are consumed and the word takes ownership.
876// The number of blobs in the box_word must match blob_count.
877void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
878 // Setup the WERD_RES.
879 ASSERT_HOST(box_word != nullptr);
880 ASSERT_HOST(blob_count == box_word->length());
882 ClearRatings();
883 ratings = new MATRIX(blob_count, 1);
884 for (int c = 0; c < blob_count; ++c) {
885 auto* choice_list = new BLOB_CHOICE_LIST;
886 BLOB_CHOICE_IT choice_it(choice_list);
887 choice_it.add_after_then_move(choices[c]);
888 ratings->put(c, c, choice_list);
889 }
891 reject_map.initialise(blob_count);
892 best_state.init_to_size(blob_count, 1);
893 done = true;
894}
895
896// Creates a WERD_CHOICE for the word using the top choices from the leading
897// diagonal of the ratings matrix.
899 int num_blobs = ratings->dimension();
900 auto* word_choice = new WERD_CHOICE(uch_set, num_blobs);
901 word_choice->set_permuter(permuter);
902 for (int b = 0; b < num_blobs; ++b) {
903 UNICHAR_ID unichar_id = UNICHAR_SPACE;
904 float rating = INT32_MAX;
905 float certainty = -INT32_MAX;
906 BLOB_CHOICE_LIST* choices = ratings->get(b, b);
907 if (choices != nullptr && !choices->empty()) {
908 BLOB_CHOICE_IT bc_it(choices);
909 BLOB_CHOICE* choice = bc_it.data();
910 unichar_id = choice->unichar_id();
911 rating = choice->rating();
912 certainty = choice->certainty();
913 }
914 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
915 certainty);
916 }
917 LogNewRawChoice(word_choice);
918 // Ownership of word_choice taken by word here.
919 LogNewCookedChoice(1, false, word_choice);
920}
921
922// Copies the best_choice strings to the correct_text for adaption/training.
925 ASSERT_HOST(best_choice != nullptr);
926 for (int i = 0; i < best_choice->length(); ++i) {
927 UNICHAR_ID choice_id = best_choice->unichar_id(i);
928 const char* blob_choice = uch_set->id_to_unichar(choice_id);
929 correct_text.push_back(STRING(blob_choice));
930 }
931}
932
933// Merges 2 adjacent blobs in the result if the permanent callback
934// class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
935// callback box_cb is nullptr or returns true, setting the merged blob
936// result to the class returned from class_cb.
937// Returns true if anything was merged.
941 ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
942 bool modified = false;
943 for (int i = 0; i + 1 < best_choice->length(); ++i) {
944 UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
945 best_choice->unichar_id(i+1));
946 if (new_id != INVALID_UNICHAR_ID &&
947 (box_cb == nullptr || box_cb->Run(box_word->BlobBox(i),
948 box_word->BlobBox(i + 1)))) {
949 // Raw choice should not be fixed.
950 best_choice->set_unichar_id(new_id, i);
951 modified = true;
953 const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
954 if (!coord.Valid(*ratings)) {
955 ratings->IncreaseBandSize(coord.row + 1 - coord.col);
956 }
957 BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
958 if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
959 // Insert a fake result.
960 auto* blob_choice = new BLOB_CHOICE;
961 blob_choice->set_unichar_id(new_id);
962 BLOB_CHOICE_IT bc_it(blob_choices);
963 bc_it.add_before_then_move(blob_choice);
964 }
965 }
966 }
967 delete class_cb;
968 delete box_cb;
969 return modified;
970}
971
972// Merges 2 adjacent blobs in the result (index and index+1) and corrects
973// all the data to account for the change.
976 reject_map.remove_pos(index);
977 best_choice->remove_unichar_id(index + 1);
978 rebuild_word->MergeBlobs(index, index + 2);
979 box_word->MergeBoxes(index, index + 2);
980 if (index + 1 < best_state.length()) {
981 best_state[index] += best_state[index + 1];
982 best_state.remove(index + 1);
983 }
984}
985
986// TODO(tkielbus) Decide between keeping this behavior here or modifying the
987// training data.
988
// Utility function for fix_quotes.
// Returns non-zero if the UTF-8 character starting at signed_str, whose
// encoded size in bytes is given by length, is a single-quote character:
// either the 1-byte ASCII apostrophe or back-tick, or one of the 3-byte
// sequences E2 80 98 / E2 80 99. Any other length yields 0 without reading
// the string.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned so that bytes >= 0x80 match the hex constants below.
  const auto* bytes = reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes: both share the E2 80 prefix and differ
    // only in the final byte.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
1005
1006// Callback helper for fix_quotes returns a double quote if both
1007// arguments are quote, otherwise INVALID_UNICHAR_ID.
1009 const char *ch = uch_set->id_to_unichar(id1);
1010 const char *next_ch = uch_set->id_to_unichar(id2);
1011 if (is_simple_quote(ch, strlen(ch)) &&
1012 is_simple_quote(next_ch, strlen(next_ch)))
1013 return uch_set->unichar_to_id("\"");
1014 return INVALID_UNICHAR_ID;
1015}
1016
1017// Change pairs of quotes to double quotes.
1019 if (!uch_set->contains_unichar("\"") ||
1021 return; // Don't create it if it is disallowed.
1022
1025 nullptr);
1026}
1027
1028// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1029// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
1031 const char *ch = uch_set->id_to_unichar(id1);
1032 const char *next_ch = uch_set->id_to_unichar(id2);
1033 if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1034 (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
1035 return uch_set->unichar_to_id("-");
1036 return INVALID_UNICHAR_ID;
1037}
1038
1039// Callback helper for fix_hyphens returns true if box1 and box2 overlap
1040// (assuming both on the same textline, are in order and a chopped em dash.)
1041bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
1042 return box1.right() >= box2.left();
1043}
1044
1045// Change pairs of hyphens to a single hyphen if the bounding boxes touch
1046// Typically a long dash which has been segmented.
1048 if (!uch_set->contains_unichar("-") ||
1050 return; // Don't create it if it is disallowed.
1051
1055}
1056
1057// Callback helper for merge_tess_fails returns a space if both
1058// arguments are space, otherwise INVALID_UNICHAR_ID.
1060 if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
1061 return id1;
1062 else
1063 return INVALID_UNICHAR_ID;
1064}
1065
1066// Change pairs of tess failures to a single one
1070 int len = best_choice->length();
1071 ASSERT_HOST(reject_map.length() == len);
1072 ASSERT_HOST(box_word->length() == len);
1073 }
1074}
1075
1076// Returns true if the collection of count pieces, starting at start, are all
1077// natural connected components, ie there are no real chops involved.
1078bool WERD_RES::PiecesAllNatural(int start, int count) const {
1079 // all seams must have no splits.
1080 for (int index = start; index < start + count - 1; ++index) {
1081 if (index >= 0 && index < seam_array.size()) {
1082 SEAM* seam = seam_array[index];
1083 if (seam != nullptr && seam->HasAnySplits()) return false;
1084 }
1085 }
1086 return true;
1087}
1088
1089
1091 Clear();
1092}
1093
1095 if (combination) {
1096 delete word;
1097 }
1098 word = nullptr;
1099 delete blamer_bundle;
1100 blamer_bundle = nullptr;
1101 ClearResults();
1102}
1103
1105 done = false;
1106 fontinfo = nullptr;
1107 fontinfo2 = nullptr;
1110 delete bln_boxes;
1111 bln_boxes = nullptr;
1112 blob_row = nullptr;
1113 delete chopped_word;
1114 chopped_word = nullptr;
1115 delete rebuild_word;
1116 rebuild_word = nullptr;
1117 delete box_word;
1118 box_word = nullptr;
1119 best_state.clear();
1122 seam_array.clear();
1124 blob_gaps.clear();
1125 ClearRatings();
1127 if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
1128}
1130 best_choice = nullptr;
1131 delete raw_choice;
1132 raw_choice = nullptr;
1133 best_choices.clear();
1134 delete ep_choice;
1135 ep_choice = nullptr;
1136}
1138 if (ratings != nullptr) {
1140 delete ratings;
1141 ratings = nullptr;
1142 }
1143}
1144
1145int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1146 ASSERT_HOST(page_res == other.page_res);
1147 if (other.block_res == nullptr) {
1148 // other points to the end of the page.
1149 if (block_res == nullptr)
1150 return 0;
1151 return -1;
1152 }
1153 if (block_res == nullptr) {
1154 return 1; // we point to the end of the page.
1155 }
1156 if (block_res == other.block_res) {
1157 if (other.row_res == nullptr || row_res == nullptr) {
1158 // this should only happen if we hit an image block.
1159 return 0;
1160 }
1161 if (row_res == other.row_res) {
1162 // we point to the same block and row.
1163 ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1164 if (word_res == other.word_res) {
1165 // we point to the same word!
1166 return 0;
1167 }
1168
1169 WERD_RES_IT word_res_it(&row_res->word_res_list);
1170 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1171 word_res_it.forward()) {
1172 if (word_res_it.data() == word_res) {
1173 return -1;
1174 } else if (word_res_it.data() == other.word_res) {
1175 return 1;
1176 }
1177 }
1178 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1179 }
1180
1181 // we both point to the same block, but different rows.
1182 ROW_RES_IT row_res_it(&block_res->row_res_list);
1183 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1184 row_res_it.forward()) {
1185 if (row_res_it.data() == row_res) {
1186 return -1;
1187 } else if (row_res_it.data() == other.row_res) {
1188 return 1;
1189 }
1190 }
1191 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1192 }
1193
1194 // We point to different blocks.
1195 BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1196 for (block_res_it.mark_cycle_pt();
1197 !block_res_it.cycled_list(); block_res_it.forward()) {
1198 if (block_res_it.data() == block_res) {
1199 return -1;
1200 } else if (block_res_it.data() == other.block_res) {
1201 return 1;
1202 }
1203 }
1204 // Shouldn't happen...
1205 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1206 return 0;
1207}
1208
1209// Inserts the new_word as a combination owned by a corresponding WERD_RES
1210// before the current position. The simple fields of the WERD_RES are copied
1211// from clone_res and the resulting WERD_RES is returned for further setup
1212// with best_choice etc.
1214 WERD* new_word) {
1215 // Make a WERD_RES for the new_word.
1216 auto* new_res = new WERD_RES(new_word);
1217 new_res->CopySimpleFields(clone_res);
1218 new_res->combination = true;
1219 // Insert into the appropriate place in the ROW_RES.
1220 WERD_RES_IT wr_it(&row()->word_res_list);
1221 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1222 WERD_RES* word = wr_it.data();
1223 if (word == word_res)
1224 break;
1225 }
1226 ASSERT_HOST(!wr_it.cycled_list());
1227 wr_it.add_before_then_move(new_res);
1228 if (wr_it.at_first()) {
1229 // This is the new first word, so reset the member iterator so it
1230 // detects the cycled_list state correctly.
1232 }
1233 return new_res;
1234}
1235
1236// Helper computes the boundaries between blobs in the word. The blob bounds
1237// are likely very poor, if they come from LSTM, where it only outputs the
1238// character at one pixel within it, so we find the midpoints between them.
1239static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
1240 C_BLOB_LIST* next_word_blobs,
1241 GenericVector<int>* blob_ends) {
1242 C_BLOB_IT blob_it(word.word->cblob_list());
1243 for (int i = 0; i < word.best_state.size(); ++i) {
1244 int length = word.best_state[i];
1245 // Get the bounding box of the fake blobs
1246 TBOX blob_box = blob_it.data()->bounding_box();
1247 blob_it.forward();
1248 for (int b = 1; b < length; ++b) {
1249 blob_box += blob_it.data()->bounding_box();
1250 blob_it.forward();
1251 }
1252 // This blob_box is crap, so for now we are only looking for the
1253 // boundaries between them.
1254 int blob_end = INT32_MAX;
1255 if (!blob_it.at_first() || next_word_blobs != nullptr) {
1256 if (blob_it.at_first())
1257 blob_it.set_to_list(next_word_blobs);
1258 blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1259 }
1260 blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1261 blob_ends->push_back(blob_end);
1262 }
1263 blob_ends->back() = clip_box.right();
1264}
1265
1266// Helper computes the bounds of a word by restricting it to existing words
1267// that significantly overlap.
1268static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES>& words,
1269 int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1270 constexpr int kSignificantOverlapFraction = 4;
1271 TBOX clipped_box;
1272 TBOX current_box = words[w_index]->word->bounding_box();
1273 TBOX next_box;
1274 if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
1275 words[w_index + 1]->word != nullptr)
1276 next_box = words[w_index + 1]->word->bounding_box();
1277 for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1278 w_it.forward()) {
1279 if (w_it.data() == nullptr || w_it.data()->word == nullptr) continue;
1280 TBOX w_box = w_it.data()->word->bounding_box();
1281 int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1282 int width_limit = w_box.width() / kSignificantOverlapFraction;
1283 int min_significant_overlap = std::max(height_limit, width_limit);
1284 int overlap = w_box.intersection(current_box).width();
1285 int prev_overlap = w_box.intersection(prev_box).width();
1286 int next_overlap = w_box.intersection(next_box).width();
1287 if (overlap > min_significant_overlap) {
1288 if (prev_overlap > min_significant_overlap) {
1289 // We have no choice but to use the LSTM word edge.
1290 clipped_box.set_left(current_box.left());
1291 } else if (next_overlap > min_significant_overlap) {
1292 // We have no choice but to use the LSTM word edge.
1293 clipped_box.set_right(current_box.right());
1294 } else {
1295 clipped_box += w_box;
1296 }
1297 }
1298 }
1299 if (clipped_box.height() <= 0) {
1300 clipped_box.set_top(current_box.top());
1301 clipped_box.set_bottom(current_box.bottom());
1302 }
1303 if (clipped_box.width() <= 0) clipped_box = current_box;
1304 return clipped_box;
1305}
1306
1307// Helper moves the blob from src to dest. If it isn't contained by clip_box,
1308// the blob is replaced by a fake that is contained.
1309static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1310 const TBOX& clip_box) {
1311 C_BLOB* src_blob = src_it->extract();
1312 TBOX box = src_blob->bounding_box();
1313 if (!clip_box.contains(box)) {
1314 int left =
1315 ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1316 int right =
1317 ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1318 int top =
1319 ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1320 int bottom =
1321 ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1322 box = TBOX(left, bottom, right, top);
1323 delete src_blob;
1324 src_blob = C_BLOB::FakeBlob(box);
1325 }
1326 dest_it->add_after_then_move(src_blob);
1327 return box;
1328}
1329
1330// Replaces the current WERD/WERD_RES with the given words. The given words
1331// contain fake blobs that indicate the position of the characters. These are
1332// replaced with real blobs from the current word as much as possible.
1335 if (words->empty()) {
1337 return;
1338 }
1339 WERD_RES* input_word = word();
1340 // Set the BOL/EOL flags on the words from the input word.
1341 if (input_word->word->flag(W_BOL)) {
1342 (*words)[0]->word->set_flag(W_BOL, true);
1343 } else {
1344 (*words)[0]->word->set_blanks(input_word->word->space());
1345 }
1346 words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1347
1348 // Move the blobs from the input word to the new set of words.
1349 // If the input word_res is a combination, then the replacements will also be
1350 // combinations, and will own their own words. If the input word_res is not a
1351 // combination, then the final replacements will not be either, (although it
1352 // is allowed for the input words to be combinations) and their words
1353 // will get put on the row list. This maintains the ownership rules.
1354 WERD_IT w_it(row()->row->word_list());
1355 if (!input_word->combination) {
1356 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1357 WERD* word = w_it.data();
1358 if (word == input_word->word)
1359 break;
1360 }
1361 // w_it is now set to the input_word's word.
1362 ASSERT_HOST(!w_it.cycled_list());
1363 }
1364 // Insert into the appropriate place in the ROW_RES.
1365 WERD_RES_IT wr_it(&row()->word_res_list);
1366 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1367 WERD_RES* word = wr_it.data();
1368 if (word == input_word)
1369 break;
1370 }
1371 ASSERT_HOST(!wr_it.cycled_list());
1372 // Since we only have an estimate of the bounds between blobs, use the blob
1373 // x-middle as the determiner of where to put the blobs
1374 C_BLOB_IT src_b_it(input_word->word->cblob_list());
1375 src_b_it.sort(&C_BLOB::SortByXMiddle);
1376 C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1377 rej_b_it.sort(&C_BLOB::SortByXMiddle);
1378 TBOX clip_box;
1379 for (int w = 0; w < words->size(); ++w) {
1380 WERD_RES* word_w = (*words)[w];
1381 clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1382 // Compute blob boundaries.
1383 GenericVector<int> blob_ends;
1384 C_BLOB_LIST* next_word_blobs =
1385 w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1386 ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1387 // Remove the fake blobs on the current word, but keep safe for back-up if
1388 // no blob can be found.
1389 C_BLOB_LIST fake_blobs;
1390 C_BLOB_IT fake_b_it(&fake_blobs);
1391 fake_b_it.add_list_after(word_w->word->cblob_list());
1392 fake_b_it.move_to_first();
1393 word_w->word->cblob_list()->clear();
1394 C_BLOB_IT dest_it(word_w->word->cblob_list());
1395 // Build the box word as we move the blobs.
1396 auto* box_word = new tesseract::BoxWord;
1397 for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1398 int end_x = blob_ends[i];
1399 TBOX blob_box;
1400 // Add the blobs up to end_x.
1401 while (!src_b_it.empty() &&
1402 src_b_it.data()->bounding_box().x_middle() < end_x) {
1403 blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1404 src_b_it.forward();
1405 }
1406 while (!rej_b_it.empty() &&
1407 rej_b_it.data()->bounding_box().x_middle() < end_x) {
1408 blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1409 rej_b_it.forward();
1410 }
1411 if (blob_box.null_box()) {
1412 // Use the original box as a back-up.
1413 blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1414 }
1415 box_word->InsertBox(i, blob_box);
1416 }
1417 delete word_w->box_word;
1418 word_w->box_word = box_word;
1419 if (!input_word->combination) {
1420 // Insert word_w->word into the ROW. It doesn't own its word, so the
1421 // ROW needs to own it.
1422 w_it.add_before_stay_put(word_w->word);
1423 word_w->combination = false;
1424 }
1425 (*words)[w] = nullptr; // We are taking ownership.
1426 wr_it.add_before_stay_put(word_w);
1427 }
1428 // We have taken ownership of the words.
1429 words->clear();
1430 // Delete the current word, which has been replaced. We could just call
1431 // DeleteCurrentWord, but that would iterate both lists again, and we know
1432 // we are already in the right place.
1433 if (!input_word->combination)
1434 delete w_it.extract();
1435 delete wr_it.extract();
1437}
1438
1439// Deletes the current WERD_RES and its underlying WERD.
1441 // Check that this word is as we expect. part_of_combos are NEVER iterated
1442 // by the normal iterator, so we should never be trying to delete them.
1443 ASSERT_HOST(!word_res->part_of_combo);
1444 if (!word_res->combination) {
1445 // Combinations own their own word, so we won't find the word on the
1446 // row's word_list, but it is legitimate to try to delete them.
1447 // Delete word from the ROW when not a combination.
1448 WERD_IT w_it(row()->row->word_list());
1449 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1450 if (w_it.data() == word_res->word) {
1451 break;
1452 }
1453 }
1454 ASSERT_HOST(!w_it.cycled_list());
1455 delete w_it.extract();
1456 }
1457 // Remove the WERD_RES for the new_word.
1458 // Remove the WORD_RES from the ROW_RES.
1459 WERD_RES_IT wr_it(&row()->word_res_list);
1460 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1461 if (wr_it.data() == word_res) {
1462 word_res = nullptr;
1463 break;
1464 }
1465 }
1466 ASSERT_HOST(!wr_it.cycled_list());
1467 delete wr_it.extract();
1469}
1470
1471// Makes the current word a fuzzy space if not already fuzzy. Updates
1472// corresponding part of combo if required.
1474 WERD* real_word = word_res->word;
1475 if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1476 real_word->set_flag(W_FUZZY_SP, true);
1477 if (word_res->combination) {
1478 // The next word should be the corresponding part of combo, but we have
1479 // already stepped past it, so find it by search.
1480 WERD_RES_IT wr_it(&row()->word_res_list);
1481 for (wr_it.mark_cycle_pt();
1482 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1483 }
1484 wr_it.forward();
1485 ASSERT_HOST(wr_it.data()->part_of_combo);
1486 real_word = wr_it.data()->word;
1487 ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1488 !real_word->flag(W_FUZZY_NON));
1489 real_word->set_flag(W_FUZZY_SP, true);
1490 }
1491 }
1492}
1493
1494/*************************************************************************
1495 * PAGE_RES_IT::restart_page
1496 *
1497 * Set things up at the start of the page
1498 *************************************************************************/
1499
1501 block_res_it.set_to_list(&page_res->block_res_list);
1502 block_res_it.mark_cycle_pt();
1503 prev_block_res = nullptr;
1504 prev_row_res = nullptr;
1505 prev_word_res = nullptr;
1506 block_res = nullptr;
1507 row_res = nullptr;
1508 word_res = nullptr;
1509 next_block_res = nullptr;
1510 next_row_res = nullptr;
1511 next_word_res = nullptr;
1512 internal_forward(true, empty_ok);
1513 return internal_forward(false, empty_ok);
1514}
1515
1516// Recovers from operations on the current word, such as in InsertCloneWord
1517// and DeleteCurrentWord.
1518// Resets the word_res_it so that it is one past the next_word_res, as
1519// it should be after internal_forward. If next_row_res != row_res,
1520// then the next_word_res is in the next row, so there is no need to do
1521// anything to word_res_it, but it is still a good idea to reset the pointers
1522// word_res and prev_word_res, which are still in the current row.
1524 if (row_res == next_row_res) {
1525 // Reset the member iterator so it can move forward and detect the
1526 // cycled_list state correctly.
1527 word_res_it.move_to_first();
1528 for (word_res_it.mark_cycle_pt();
1529 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1530 word_res_it.forward()) {
1531 if (!word_res_it.data()->part_of_combo) {
1532 if (prev_row_res == row_res) prev_word_res = word_res;
1533 word_res = word_res_it.data();
1534 }
1535 }
1536 ASSERT_HOST(!word_res_it.cycled_list());
1537 wr_it_of_next_word = word_res_it;
1538 word_res_it.forward();
1539 } else {
1540 // word_res_it is OK, but reset word_res and prev_word_res if needed.
1541 WERD_RES_IT wr_it(&row_res->word_res_list);
1542 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1543 if (!wr_it.data()->part_of_combo) {
1544 if (prev_row_res == row_res) prev_word_res = word_res;
1545 word_res = wr_it.data();
1546 }
1547 }
1548 }
1549}
1550
1551/*************************************************************************
1552 * PAGE_RES_IT::internal_forward
1553 *
1554 * Find the next word on the page. If empty_ok is true, then non-text blocks
1555 * and text blocks with no text are visited as if they contain a single
1556 * imaginary word in a single imaginary row. (word() and row() both return nullptr
1557 * in such a block and the return value is nullptr.)
1558 * If empty_ok is false, the old behaviour is maintained. Each real word
1559 * is visited and empty and non-text blocks and rows are skipped.
1560 * new_block is used to initialize the iterators for a new block.
1561 * The iterator maintains pointers to block, row and word for the previous,
1562 * current and next words. These are correct, regardless of block/row
1563 * boundaries. nullptr values denote start and end of the page.
1564 *************************************************************************/
1565
1566WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1567 bool new_row = false;
1568
1569 prev_block_res = block_res;
1570 prev_row_res = row_res;
1571 prev_word_res = word_res;
1572 block_res = next_block_res;
1573 row_res = next_row_res;
1574 word_res = next_word_res;
1575 wr_it_of_current_word = wr_it_of_next_word;
1576 next_block_res = nullptr;
1577 next_row_res = nullptr;
1578 next_word_res = nullptr;
1579
1580 while (!block_res_it.cycled_list()) {
1581 if (new_block) {
1582 new_block = false;
1583 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1584 row_res_it.mark_cycle_pt();
1585 if (row_res_it.empty() && empty_ok) {
1586 next_block_res = block_res_it.data();
1587 break;
1588 }
1589 new_row = true;
1590 }
1591 while (!row_res_it.cycled_list()) {
1592 if (new_row) {
1593 new_row = false;
1594 word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1595 word_res_it.mark_cycle_pt();
1596 }
1597 // Skip any part_of_combo words.
1598 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1599 word_res_it.forward();
1600 if (!word_res_it.cycled_list()) {
1601 next_block_res = block_res_it.data();
1602 next_row_res = row_res_it.data();
1603 next_word_res = word_res_it.data();
1604 wr_it_of_next_word = word_res_it;
1605 word_res_it.forward();
1606 goto foundword;
1607 }
1608 // end of row reached
1609 row_res_it.forward();
1610 new_row = true;
1611 }
1612 // end of block reached
1613 block_res_it.forward();
1614 new_block = true;
1615 }
1616 foundword:
1617 // Update prev_word_best_choice pointer.
1618 if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
1620 (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
1621 }
1622 return word_res;
1623}
1624
1625/*************************************************************************
1626 * PAGE_RES_IT::restart_row()
1627 *
1628 * Move to the beginning (leftmost word) of the current row.
1629 *************************************************************************/
1631 ROW_RES *row = this->row();
1632 if (!row) return nullptr;
1633 for (restart_page(); this->row() != row; forward()) {
1634 // pass
1635 }
1636 return word();
1637}
1638
1639/*************************************************************************
1640 * PAGE_RES_IT::forward_paragraph
1641 *
1642 * Move to the beginning of the next paragraph, allowing empty blocks.
1643 *************************************************************************/
1644
1646 while (block_res == next_block_res &&
1647 (next_row_res != nullptr && next_row_res->row != nullptr &&
1648 row_res->row->para() == next_row_res->row->para())) {
1649 internal_forward(false, true);
1650 }
1651 return internal_forward(false, true);
1652}
1653
1654/*************************************************************************
1655 * PAGE_RES_IT::forward_block
1656 *
1657 * Move to the beginning of the next block, allowing empty blocks.
1658 *************************************************************************/
1659
1661 while (block_res == next_block_res) {
1662 internal_forward(false, true);
1663 }
1664 return internal_forward(false, true);
1665}
1666
1668 int16_t chars_in_word;
1669 int16_t rejects_in_word = 0;
1670
1671 chars_in_word = word_res->reject_map.length ();
1672 page_res->char_count += chars_in_word;
1673 block_res->char_count += chars_in_word;
1674 row_res->char_count += chars_in_word;
1675
1676 rejects_in_word = word_res->reject_map.reject_count ();
1677
1678 page_res->rej_count += rejects_in_word;
1679 block_res->rej_count += rejects_in_word;
1680 row_res->rej_count += rejects_in_word;
1681 if (chars_in_word == rejects_in_word)
1682 row_res->whole_word_rej_count += rejects_in_word;
1683}
const double kMaxLineSizeRatio
Definition: pageres.cpp:58
const double kMaxWordSizeRatio
Definition: pageres.cpp:56
const double kMaxWordGapRatio
Definition: pageres.cpp:60
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:53
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
PermuterType
Definition: ratngs.h:232
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
void start_seam_list(TWERD *word, GenericVector< SEAM * > *seam_array)
Definition: seam.cpp:263
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_SCRIPT_HAS_XHEIGHT
x-height concept makes sense.
Definition: werd.h:35
@ W_EOL
end of line
Definition: werd.h:33
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_SCRIPT_IS_LATIN
Special case latin for y. splitting.
Definition: werd.h:36
@ W_REP_CHAR
repeated character
Definition: werd.h:38
@ W_INVERSE
white on black
Definition: werd.h:41
@ W_BOL
start of line
Definition: werd.h:32
#define CLISTIZE(CLASSNAME)
Definition: clst.h:891
#define ELISTIZE(CLASSNAME)
Definition: elst.h:931
#define ASSERT_HOST(x)
Definition: errcode.h:88
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_SPACE
Definition: unicharset.h:34
int count(LIST var_list)
Definition: oldlist.cpp:95
void init_to_size(int size, const T &t)
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void remove(int index)
T & back() const
void insert(const T &t, int index)
int length() const
Definition: genericvector.h:86
void truncate(int size)
void delete_data_pointers()
void reserve(int size)
void move(GenericVector< T > *from)
void delete_matrix_pointers()
Definition: matrix.h:458
T get(ICOORD pos) const
Definition: matrix.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
virtual R Run(A1, A2)=0
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:210
void ClearResults()
Definition: blamer.h:189
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:203
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:153
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
Definition: blobs.h:418
void MergeBlobs(int start, int end)
Definition: blobs.cpp:872
int NumBlobs() const
Definition: blobs.h:448
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:790
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
void ComputeBoundingBoxes()
Definition: blobs.cpp:855
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:776
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:148
int length() const
Definition: boxword.h:83
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92
int dimension() const
Definition: matrix.h:536
Definition: matrix.h:578
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:58
bool Valid(const MATRIX &m) const
Definition: matrix.h:618
const BLOCK * block() const
Definition: normalis.h:273
Definition: ocrblock.h:31
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:116
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
Definition: ocrrow.h:37
float body_size() const
Definition: ocrrow.h:73
WERD_LIST * word_list()
Definition: ocrrow.h:55
float descenders() const
Definition: ocrrow.h:85
float ascenders() const
Definition: ocrrow.h:82
PARA * para() const
Definition: ocrrow.h:118
float x_height() const
Definition: ocrrow.h:64
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:84
PAGE_RES()
Definition: pageres.h:101
int32_t rej_count
Definition: pageres.h:79
void Init()
Definition: pageres.h:93
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:78
float x_height
Definition: pageres.h:121
int32_t rej_count
Definition: pageres.h:118
int16_t font_class
Definition: pageres.h:119
ROW_RES_LIST row_res_list
Definition: pageres.h:125
bool font_assigned
Definition: pageres.h:122
BLOCK_RES()=default
int32_t char_count
Definition: pageres.h:117
BLOCK * block
Definition: pageres.h:116
int16_t row_count
Definition: pageres.h:120
int32_t whole_word_rej_count
Definition: pageres.h:143
int32_t rej_count
Definition: pageres.h:142
WERD_RES_LIST word_res_list
Definition: pageres.h:144
ROW_RES()=default
ROW * row
Definition: pageres.h:140
int32_t char_count
Definition: pageres.h:141
const UNICHARSET * uch_set
Definition: pageres.h:203
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:740
bool tess_would_adapt
Definition: pageres.h:304
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:251
void CloneChoppedToRebuild()
Definition: pageres.cpp:835
DENORM denorm
Definition: pageres.h:201
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:877
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
bool guessed_x_ht
Definition: pageres.h:313
BlamerBundle * blamer_bundle
Definition: pageres.h:252
void SetupBlamerBundle()
Definition: pageres.cpp:393
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
bool done
Definition: pageres.h:305
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
tesseract::Tesseract * tesseract
Definition: pageres.h:280
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352
int8_t fontinfo_id2_count
Definition: pageres.h:312
const FontInfo * fontinfo
Definition: pageres.h:309
tesseract::BoxWord * box_word
Definition: pageres.h:272
GenericVector< int > blob_widths
Definition: pageres.h:216
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
bool combination
Definition: pageres.h:339
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:795
const FontInfo * fontinfo2
Definition: pageres.h:310
float x_height
Definition: pageres.h:316
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:898
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:400
bool part_of_combo
Definition: pageres.h:340
bool guessed_caps_ht
Definition: pageres.h:314
void PrintBestChoices() const
Definition: pageres.cpp:717
bool IsAmbiguous()
Definition: pageres.cpp:452
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:765
void ClearResults()
Definition: pageres.cpp:1104
void SetupBoxWord()
Definition: pageres.cpp:849
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:730
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1008
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:865
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:513
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
void BestChoiceToCorrectText()
Definition: pageres.cpp:923
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
bool tess_failed
Definition: pageres.h:295
bool odd_size
Definition: pageres.h:307
void SetScriptPositions()
Definition: pageres.cpp:858
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
Definition: pageres.cpp:938
bool reject_spaces
Definition: pageres.h:341
void ClearWordChoices()
Definition: pageres.cpp:1129
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:499
bool tess_accepted
Definition: pageres.h:303
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:759
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:384
GenericVector< int > best_state
Definition: pageres.h:285
WERD_CHOICE * raw_choice
Definition: pageres.h:246
void fix_hyphens()
Definition: pageres.cpp:1047
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:974
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:188
bool small_caps
Definition: pageres.h:306
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:343
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1041
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1030
int8_t fontinfo_id_count
Definition: pageres.h:311
ROW * blob_row
Definition: pageres.h:197
TWERD * chopped_word
Definition: pageres.h:212
void fix_quotes()
Definition: pageres.cpp:1018
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
REJMAP reject_map
Definition: pageres.h:294
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
GenericVector< STRING > correct_text
Definition: pageres.h:289
void RebuildBestState()
Definition: pageres.cpp:808
GenericVector< int > blob_gaps
Definition: pageres.h:219
WERD_CHOICE * ep_choice
Definition: pageres.h:293
float caps_height
Definition: pageres.h:317
void merge_tess_fails()
Definition: pageres.cpp:1067
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195
bool StatesAllValid()
Definition: pageres.cpp:458
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:277
MATRIX * ratings
Definition: pageres.h:237
void Clear()
Definition: pageres.cpp:1094
void ClearRatings()
Definition: pageres.cpp:1137
WERD * word
Definition: pageres.h:186
float baseline_shift
Definition: pageres.h:318
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:750
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1145
WERD_RES * forward_block()
Definition: pageres.cpp:1660
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1473
void rej_stat_word()
Definition: pageres.cpp:1667
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1333
void ResetWordIterator()
Definition: pageres.cpp:1523
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1645
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1500
WERD_RES * restart_page()
Definition: pageres.h:701
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * restart_row()
Definition: pageres.cpp:1630
WERD_RES * forward()
Definition: pageres.h:734
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1213
void DeleteCurrentWord()
Definition: pageres.cpp:1440
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
bool IsText() const
Definition: polyblk.h:49
float certainty() const
Definition: ratngs.h:83
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:141
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:306
void remove_unichar_id(int index)
Definition: ratngs.h:474
int state(int index) const
Definition: ratngs.h:309
float adjust_factor() const
Definition: ratngs.h:296
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:554
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:703
const STRING & unichar_string() const
Definition: ratngs.h:531
int TotalOfStates() const
Definition: ratngs.cpp:715
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
bool dangerous_ambig_found() const
Definition: ratngs.h:353
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:627
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:294
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void print() const
Definition: ratngs.h:570
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
int16_t top() const
Definition: rect.h:58
void set_bottom(int y)
Definition: rect.h:68
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
void set_top(int y)
Definition: rect.h:61
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool null_box() const
Definition: rect.h:50
void set_left(int x)
Definition: rect.h:75
int16_t right() const
Definition: rect.h:79
int16_t reject_count()
Definition: rejctmap.h:229
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void remove_pos(int16_t pos)
Definition: rejctmap.cpp:309
int32_t length() const
Definition: rejctmap.h:223
Definition: seam.h:38
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
bool HasAnySplits() const
Definition: seam.h:61
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:76
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241
TBOX bounding_box() const
Definition: stepblob.cpp:253
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
Definition: werd.h:56
void set_script_id(int id)
Definition: werd.h:104
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
uint8_t space()
Definition: werd.h:99
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
TBOX bounding_box() const
Definition: werd.cpp:148
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void operator=(const ELIST_LINK &)
Definition: elst.h:94
Definition: strngs.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
const char * string() const
Definition: strngs.cpp:194
int default_sid() const
Definition: unicharset.h:894
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
int latin_sid() const
Definition: unicharset.h:886
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool script_has_xheight() const
Definition: unicharset.h:904