tesseract 4.1.1
Loading...
Searching...
No Matches
pageres.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: pageres.h (Formerly page_res.h)
3 * Description: Results classes used by control.c
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef PAGERES_H
20#define PAGERES_H
21
22#include <cstdint> // for int32_t, int16_t
23#include <set> // for std::pair
24#include <vector> // for std::vector
25#include <sys/types.h> // for int8_t
26#include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
27#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
28#include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
29#include "genericvector.h" // for GenericVector, PointerVector (ptr only)
30#include "matrix.h" // for MATRIX
31#include "normalis.h" // for DENORM
32#include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only)
33#include "rect.h" // for TBOX
34#include "rejctmap.h" // for REJMAP
35#include "strngs.h" // for STRING
36#include "unichar.h" // for UNICHAR_ID, INVALID_UNICHAR_ID
37#include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI...
38#include "werd.h" // for WERD, W_BOL, W_EOL
39
40class BLOCK;
41class BLOCK_LIST;
42class BLOCK_RES;
43class ROW;
44class ROW_RES;
45class SEAM;
46class WERD_RES;
47
48struct Pix;
49struct TWERD;
50
51template <class R, class A1, class A2> class TessResultCallback2;
52
53namespace tesseract {
54 class BoxWord;
55 class Tesseract;
56 struct FontInfo;
57}
59
60/* Forward declarations */
61
62class BLOCK_RES;
63
65class
67
69class WERD_RES;
70
72
73/*************************************************************************
74 * PAGE_RES - Page results
75 *************************************************************************/
76class PAGE_RES { // page result
77 public:
78 int32_t char_count;
79 int32_t rej_count;
80 BLOCK_RES_LIST block_res_list;
82 // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
83 // the next word. This pointer is not owned by PAGE_RES class.
85 // Sums of blame reasons computed by the blamer.
87 // Debug information about all the misadaptions on this page.
88 // Each BlamerBundle contains an index into this vector, so that words that
89 // caused misadaption could be marked. However, since words could be
90 // deleted/split/merged, the log is stored on the PAGE_RES level.
92
93 inline void Init() {
94 char_count = 0;
95 rej_count = 0;
96 rejected = false;
97 prev_word_best_choice = nullptr;
98 blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
99 }
100
101 PAGE_RES() { Init(); } // empty constructor
102
103 PAGE_RES(bool merge_similar_words,
104 BLOCK_LIST *block_list, // real blocks
105 WERD_CHOICE **prev_word_best_choice_ptr);
106
107 ~PAGE_RES () = default;
108};
109
110/*************************************************************************
111 * BLOCK_RES - Block results
112 *************************************************************************/
113
114class BLOCK_RES:public ELIST_LINK {
115 public:
116 BLOCK * block; // real block
117 int32_t char_count; // chars in block
118 int32_t rej_count; // rejected chars
119 int16_t font_class; //
120 int16_t row_count;
121 float x_height;
122 bool font_assigned; // block already
123 // processed
124
125 ROW_RES_LIST row_res_list;
126
127 BLOCK_RES() = default;
128
129 BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
130
131 ~BLOCK_RES () = default;
132};
133
134/*************************************************************************
135 * ROW_RES - Row results
136 *************************************************************************/
137
138class ROW_RES:public ELIST_LINK {
139 public:
140 ROW * row; // real row
141 int32_t char_count; // chars in block
142 int32_t rej_count; // rejected chars
143 int32_t whole_word_rej_count; // rejs in total rej wds
144 WERD_RES_LIST word_res_list;
145
146 ROW_RES() = default;
147
148 ROW_RES(bool merge_similar_words, ROW *the_row); // real row
149
150 ~ROW_RES() = default;
151};
152
153/*************************************************************************
154 * WERD_RES - Word results
155 *************************************************************************/
// Crunch modes. Enumerator order follows their definition order in the
// original header, so CR_NONE is 0 and CR_DELETE is 3.
// NOTE(review): per-enumerator semantics are not visible here — confirm
// against the users of unlv_crunch_mode before documenting them.
enum CRUNCH_MODE
{
  CR_NONE,
  CR_KEEP_SPACE,
  CR_LOOSE_SPACE,
  CR_DELETE
};
163
164// WERD_RES is a collection of publicly accessible members that gathers
165// information about a word result.
166class WERD_RES : public ELIST_LINK {
167 public:
168 // Which word is which?
169 // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
170 // the original image coordinate space, and the BLN space in which the
171 // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
172 // and the x-middle of the word is at 0.
173 // In the rotated pixel space, coordinates correspond to the input image,
174 // but may be rotated about the origin by a multiple of 90 degrees,
175 // and may therefore be negative.
176 // In any case a rotation by denorm.block()->re_rotation() will take them
177 // back to the original image.
178 // The other differences between words all represent different stages of
179 // processing during recognition.
180
181 // ---------------------------INPUT-------------------------------------
182
183 // The word is the input C_BLOBs in the rotated pixel space.
184 // word is NOT owned by the WERD_RES unless combination is true.
185 // All the other word pointers ARE owned by the WERD_RES.
186 WERD* word = nullptr; // Input C_BLOB word.
187
188 // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
189
190 // The bln_boxes contains the bounding boxes (only) of the input word, in the
191 // BLN space. The lengths of word and bln_boxes
192 // match as they are both before any chopping.
193 // TODO(rays) determine if docqual does anything useful and delete bln_boxes
194 // if it doesn't.
195 tesseract::BoxWord* bln_boxes = nullptr; // BLN input bounding boxes.
196 // The ROW that this word sits in. NOT owned by the WERD_RES.
197 ROW* blob_row = nullptr;
198 // The denorm provides the transformation to get back to the rotated image
199 // coords from the chopped_word/rebuild_word BLN coords, but each blob also
200 // has its own denorm.
201 DENORM denorm; // For use on chopped_word.
202 // Unicharset used by the classifier output in best_choice and raw_choice.
203 const UNICHARSET* uch_set = nullptr; // For converting back to utf8.
204
205 // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
206 // ----Setup to a (different!) state expected by the various classifiers----
207 // TODO(rays) Tidy and make more consistent.
208
209 // The chopped_word is also in BLN space, and represents the fully chopped
210 // character fragments that make up the word.
211 // The length of chopped_word matches length of seam_array + 1 (if set).
212 TWERD* chopped_word = nullptr; // BLN chopped fragments output.
213 // Vector of SEAM* holding chopping points matching chopped_word.
215 // Widths of blobs in chopped_word.
217 // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
218 // blob i and blob i+1.
220 // Stores the lstm choices of every timestep
221 std::vector<std::vector<std::pair<const char*, float>>> timesteps;
222 // Stores the lstm choices of every timestep segmented by character
223 std::vector<std::vector<std::vector<
224 std::pair<const char*, float>>>> segmented_timesteps;
225 // Symbol choices acquired during CTC
226 std::vector<std::vector<std::pair<const char*, float>>> CTC_symbol_choices;
227 // Stores if the timestep vector starts with a space
228 bool leading_space = false;
229 // Stores value when the word ends
230 int end = 0;
231 // Ratings matrix contains classifier choices for each classified combination
232 // of blobs. The dimension is the same as the number of blobs in chopped_word
233 // and the leading diagonal corresponds to classifier results of the blobs
234 // in chopped_word. The state_ members of best_choice, raw_choice and
235 // best_choices all correspond to this ratings matrix and allow extraction
236 // of the blob choices for any given WERD_CHOICE.
237 MATRIX* ratings = nullptr; // Owned pointer.
238 // Pointer to the first WERD_CHOICE in best_choices. This is the result that
239 // will be output from Tesseract. Note that this is now a borrowed pointer
240 // and should NOT be deleted.
241 WERD_CHOICE* best_choice = nullptr; // Borrowed pointer.
242 // The best raw_choice found during segmentation search. Differs from the
243 // best_choice by being the best result according to just the character
244 // classifier, not taking any language model information into account.
245 // Unlike best_choice, the pointer IS owned by this WERD_RES.
246 WERD_CHOICE* raw_choice = nullptr; // Owned pointer.
247 // Alternative results found during chopping/segmentation search stages.
248 // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
249 WERD_CHOICE_LIST best_choices;
250
251 // Truth bounding boxes, text and incorrect choice reason.
253
254 // --------------OUTPUT FROM RECOGNITION-------------------------------
255 // --------------Not all fields are necessarily set.-------------------
256 // ---best_choice, raw_choice *must* end up set, with a box_word-------
257 // ---In complete output, the number of blobs in rebuild_word matches---
258 // ---the number of boxes in box_word, the number of unichar_ids in---
259 // ---best_choice, the number of ints in best_state, and the number---
260 // ---of strings in correct_text--------------------------------------
261 // ---SetupFake Sets everything to appropriate values if the word is---
262 // ---known to be bad before recognition.------------------------------
263
264 // The rebuild_word is also in BLN space, but represents the final best
265 // segmentation of the word. Its length is therefore the same as box_word.
266 TWERD* rebuild_word = nullptr; // BLN best segmented word.
267 // The box_word is in the original image coordinate space. It is the
268 // bounding boxes of the rebuild_word, after denormalization.
269 // The length of box_word matches rebuild_word, best_state (if set) and
270 // correct_text (if set), as well as best_choice and represents the
271 // number of classified units in the output.
272 tesseract::BoxWord* box_word = nullptr; // Denormalized output boxes.
273 // The Tesseract that was used to recognize this word. Just a borrowed
274 // pointer. Note: Tesseract's class definition is in a higher-level library.
275 // We avoid introducing a cyclic dependency by not using the Tesseract
276 // within WERD_RES. We are just storing it to provide access to it
277 // for the top-level multi-language controller, and maybe for output of
278 // the recognized language.
279 // tesseract points to data owned elsewhere.
281 // The best_state stores the relationship between chopped_word and
282 // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
283 // adjacent blobs in chopped_word. The seams in seam_array are hidden
284 // within a rebuild_word blob and revealed between them.
285 GenericVector<int> best_state; // Number of blobs in each best blob.
286 // The correct_text is used during training and adaption to carry the
287 // text to the training system without the need for a unicharset. There
288 // is one entry in the vector for each blob in rebuild_word and box_word.
290
291 // Less-well documented members.
292 // TODO(rays) Add more documentation here.
293 WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
294 REJMAP reject_map; // best_choice rejects
295 bool tess_failed = false;
296 /*
297 If tess_failed is true, one of the following tests failed when Tess
298 returned:
299 - The outword blob list was not the same length as the best_choice string;
300 - The best_choice string contained ALL blanks;
301 - The best_choice string was zero length
302 */
303 bool tess_accepted = false; // Tess thinks its ok?
304 bool tess_would_adapt = false; // Tess would adapt?
305 bool done = false; // ready for output?
306 bool small_caps = false; // word appears to be small caps
307 bool odd_size = false; // word is bigger than line or leader dots.
308 // The fontinfos are pointers to data owned by the classifier.
309 const FontInfo* fontinfo = nullptr;
310 const FontInfo* fontinfo2 = nullptr;
311 int8_t fontinfo_id_count = 0; // number of votes
312 int8_t fontinfo_id2_count = 0; // number of votes
313 bool guessed_x_ht = true;
314 bool guessed_caps_ht = true;
316 float x_height = 0.0f; // post match estimate
317 float caps_height = 0.0f; // post match estimate
318 float baseline_shift = 0.0f; // post match estimate.
319 // Certainty score for the spaces either side of this word (LSTM mode).
320 // MIN this value with the actual word certainty.
321 float space_certainty = 0.0f;
322
323 /*
324 To deal with fuzzy spaces we need to be able to combine "words" to form
325 combinations when we suspect that the gap is a non-space. The (new) text
326 ord code generates separate words for EVERY fuzzy gap - flags in the word
327 indicate whether the gap is below the threshold (fuzzy kern) and is thus
328 NOT a real word break by default, or above the threshold (fuzzy space) and
329 this is a real word break by default.
330
331 The WERD_RES list contains all these words PLUS "combination" words built
332 out of (copies of) the words split by fuzzy kerns. The separate parts have
333 their "part_of_combo" flag set true and should be IGNORED on a default
334 reading of the list.
335
336 Combination words are FOLLOWED by the sequence of part_of_combo words
337 which they combine.
338 */
339 bool combination = false; //of two fuzzy gap wds
340 bool part_of_combo = false; //part of a combo
341 bool reject_spaces = false; //Reject spacing?
342
343 WERD_RES() = default;
344
345 WERD_RES(WERD *the_word) {
346 word = the_word;
347 }
348 // Deep copies everything except the ratings MATRIX.
349 // To get that use deep_copy below.
350 WERD_RES(const WERD_RES& source) : ELIST_LINK(source) {
351 // combination is used in function Clear which is called from operator=.
352 combination = false;
353 *this = source; // see operator=
354 }
355
356 ~WERD_RES();
357
358 // Returns the UTF-8 string for the given blob index in the best_choice word,
359 // given that we know whether we are in a right-to-left reading context.
360 // This matters for mirrorable characters such as parentheses. We recognize
361 // characters purely based on their shape on the page, and by default produce
362 // the corresponding unicode for a left-to-right context.
363 const char* BestUTF8(int blob_index, bool in_rtl_context) const {
364 if (blob_index < 0 || best_choice == nullptr ||
365 blob_index >= best_choice->length())
366 return nullptr;
367 UNICHAR_ID id = best_choice->unichar_id(blob_index);
368 if (id < 0 || id >= uch_set->size())
369 return nullptr;
370 UNICHAR_ID mirrored = uch_set->get_mirror(id);
371 if (in_rtl_context && mirrored > 0)
372 id = mirrored;
373 return uch_set->id_to_unichar_ext(id);
374 }
375 // Returns the UTF-8 string for the given blob index in the raw_choice word.
376 const char* RawUTF8(int blob_index) const {
377 if (blob_index < 0 || blob_index >= raw_choice->length())
378 return nullptr;
379 UNICHAR_ID id = raw_choice->unichar_id(blob_index);
380 if (id < 0 || id >= uch_set->size())
381 return nullptr;
382 return uch_set->id_to_unichar(id);
383 }
384
386 if (best_choice == nullptr ||
387 blob_index >= best_choice->length() ||
388 blob_index < 0)
390 return uch_set->get_direction(best_choice->unichar_id(blob_index));
391 }
392
393 bool AnyRtlCharsInWord() const {
394 if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
395 return false;
396 for (int id = 0; id < best_choice->length(); id++) {
397 int unichar_id = best_choice->unichar_id(id);
398 if (unichar_id < 0 || unichar_id >= uch_set->size())
399 continue; // Ignore illegal chars.
401 uch_set->get_direction(unichar_id);
402 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
404 return true;
405 }
406 return false;
407 }
408
409 bool AnyLtrCharsInWord() const {
410 if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
411 return false;
412 for (int id = 0; id < best_choice->length(); id++) {
413 int unichar_id = best_choice->unichar_id(id);
414 if (unichar_id < 0 || unichar_id >= uch_set->size())
415 continue; // Ignore illegal chars.
417 if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
419 return true;
420 }
421 return false;
422 }
423
424 // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
425 // that gave us the unichars in reading order (as opposed to strict left
426 // to right).
429 }
430
431 void Clear();
432 void ClearResults();
433 void ClearWordChoices();
434 void ClearRatings();
435
436 // Deep copies everything except the ratings MATRIX.
437 // To get that use deep_copy below.
438 WERD_RES& operator=(const WERD_RES& source); //from this
439
440 void CopySimpleFields(const WERD_RES& source);
441
442 // Initializes a blank (default constructed) WERD_RES from one that has
443 // already been recognized.
444 // Use SetupFor*Recognition afterwards to complete the setup and make
445 // it ready for a retry recognition.
446 void InitForRetryRecognition(const WERD_RES& source);
447
448 // Sets up the members used in recognition: bln_boxes, chopped_word,
449 // seam_array, denorm. Returns false if
450 // the word is empty and sets up fake results. If use_body_size is
451 // true and row->body_size is set, then body_size will be used for
452 // blob normalization instead of xheight + ascrise. This flag is for
453 // those languages that are using CJK pitch model and thus it has to
454 // be true if and only if tesseract->textord_use_cjk_fp_model is
455 // true.
456 // If allow_detailed_fx is true, the feature extractor will receive fine
457 // precision outline information, allowing smoother features and better
458 // features on low resolution images.
459 // The norm_mode sets the default mode for normalization in absence
460 // of any of the above flags. It should really be a tesseract::OcrEngineMode
461 // but is declared as int for ease of use with tessedit_ocr_engine_mode.
462 // Returns false if the word is empty and sets up fake results.
463 bool SetupForRecognition(const UNICHARSET& unicharset_in,
465 int norm_mode,
466 const TBOX* norm_box, bool numeric_mode,
467 bool use_body_size, bool allow_detailed_fx,
468 ROW *row, const BLOCK* block);
469
470 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
471 // accumulators from a made chopped word. We presume the fields are already
472 // empty.
473 void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
474
475 // Sets up the members used in recognition for an empty recognition result:
476 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
477 void SetupFake(const UNICHARSET& uch);
478
479 // Set the word as having the script of the input unicharset.
480 void SetupWordScript(const UNICHARSET& unicharset_in);
481
482 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
483 void SetupBlamerBundle();
484
485 // Computes the blob_widths and blob_gaps from the chopped_word.
487
488 // Updates internal data to account for a new SEAM (chop) at the given
489 // blob_number. Fixes the ratings matrix and states in the choices, as well
490 // as the blob widths and gaps.
491 void InsertSeam(int blob_number, SEAM* seam);
492
493 // Returns true if all the word choices except the first have adjust_factors
494 // worse than the given threshold.
495 bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
496
497 // Returns true if the current word is ambiguous (by number of answers or
498 // by dangerous ambigs.)
499 bool IsAmbiguous();
500
501 // Returns true if the ratings matrix size matches the sum of each of the
502 // segmentation states.
503 bool StatesAllValid();
504
505 // Prints a list of words found if debug is true or the word result matches
506 // the word_to_debug.
507 void DebugWordChoices(bool debug, const char* word_to_debug);
508
509 // Prints the top choice along with the accepted/done flags.
510 void DebugTopChoice(const char* msg) const;
511
512 // Removes from best_choices all choices which are not within a reasonable
513 // range of the best choice.
514 void FilterWordChoices(int debug_level);
515
516 // Computes a set of distance thresholds used to control adaption.
517 // Compares the best choice for the current word to the best raw choice
518 // to determine which characters were classified incorrectly by the
519 // classifier. Then places a separate threshold into thresholds for each
520 // character in the word. If the classifier was correct, max_rating is placed
521 // into thresholds. If the classifier was incorrect, the mean match rating
522 // (error percentage) of the classifier's incorrect choice minus some margin
523 // is placed into thresholds. This can then be used by the caller to try to
524 // create a new template for the desired class that will classify the
525 // character with a rating better than the threshold value. The match rating
526 // placed into thresholds is never allowed to be below min_rating in order to
527 // prevent trying to make overly tight templates.
528 // min_rating limits how tight to make a template.
529 // max_rating limits how loose to make a template.
530 // rating_margin denotes the amount of margin to put in template.
531 void ComputeAdaptionThresholds(float certainty_scale,
532 float min_rating,
533 float max_rating,
534 float rating_margin,
535 float* thresholds);
536
537 // Saves a copy of the word_choice if it has the best unadjusted rating.
538 // Returns true if the word_choice was the new best.
539 bool LogNewRawChoice(WERD_CHOICE* word_choice);
540 // Consumes word_choice by adding it to best_choices, (taking ownership) if
541 // the certainty for word_choice is some distance of the best choice in
542 // best_choices, or by deleting the word_choice and returning false.
543 // The best_choices list is kept in sorted order by rating. Duplicates are
544 // removed, and the list is kept no longer than max_num_choices in length.
545 // Returns true if the word_choice is still a valid pointer.
546 bool LogNewCookedChoice(int max_num_choices, bool debug,
547 WERD_CHOICE* word_choice);
548
549 // Prints a brief list of all the best choices.
550 void PrintBestChoices() const;
551
552 // Returns the sum of the widths of the blob between start_blob and last_blob
553 // inclusive.
554 int GetBlobsWidth(int start_blob, int last_blob);
555 // Returns the width of a gap between the specified blob and the next one.
556 int GetBlobsGap(int blob_index);
557
558 // Returns the BLOB_CHOICE corresponding to the given index in the
559 // best choice word taken from the appropriate cell in the ratings MATRIX.
560 // Borrowed pointer, so do not delete. May return nullptr if there is no
561 // BLOB_CHOICE matching the unichar_id at the given index.
562 BLOB_CHOICE* GetBlobChoice(int index) const;
563
564 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
565 // best choice word taken from the appropriate cell in the ratings MATRIX.
566 // Borrowed pointer, so do not delete.
567 BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
568
569 // Moves the results fields from word to this. This takes ownership of all
570 // the data, so src can be destructed.
571 // word1.ConsumeWordResult(word);
572 // delete word;
573 // is simpler and faster than:
574 // word1 = *word;
575 // delete word;
576 // as it doesn't need to copy and reallocate anything.
578
579 // Replace the best choice and rebuild box word.
580 // choice must be from the current best_choices list.
581 void ReplaceBestChoice(WERD_CHOICE* choice);
582
583 // Builds the rebuild_word and sets the best_state from the chopped_word and
584 // the best_choice->state.
585 void RebuildBestState();
586
587 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
588 // Also sets up the output box_word.
590
591 // Sets/replaces the box_word with one made from the rebuild_word.
592 void SetupBoxWord();
593
594 // Sets up the script positions in the best_choice using the best_choice
595 // to get the unichars, and the unicharset to get the target positions.
596 void SetScriptPositions();
597 // Sets all the blobs in all the words (best choice and alternates) to be
598 // the given position. (When a sub/superscript is recognized as a separate
599 // word, it falls victim to the rule that a whole word cannot be sub or
600 // superscript, so this function overrides that problem.)
602
603 // Classifies the word with some already-calculated BLOB_CHOICEs.
604 // The choices are an array of blob_count pointers to BLOB_CHOICE,
605 // providing a single classifier result for each blob.
606 // The BLOB_CHOICEs are consumed and the word takes ownership.
607 // The number of blobs in the box_word must match blob_count.
608 void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
609
610 // Creates a WERD_CHOICE for the word using the top choices from the leading
611 // diagonal of the ratings matrix.
612 void FakeWordFromRatings(PermuterType permuter);
613
614 // Copies the best_choice strings to the correct_text for adaption/training.
616
617 // Merges 2 adjacent blobs in the result if the permanent callback
618 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
619 // callback box_cb is nullptr or returns true, setting the merged blob
620 // result to the class returned from class_cb.
621 // Returns true if anything was merged.
625
626 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
627 // all the data to account for the change.
628 void MergeAdjacentBlobs(int index);
629
630 // Callback helper for fix_quotes returns a double quote if both
631 // arguments are quote, otherwise INVALID_UNICHAR_ID.
633 void fix_quotes();
634
635 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
636 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
638 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
639 // (assuming both on the same textline, are in order and a chopped em dash.)
640 bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
641 void fix_hyphens();
642
643 // Callback helper for merge_tess_fails returns a space if both
644 // arguments are space, otherwise INVALID_UNICHAR_ID.
646 void merge_tess_fails();
647
648 // Returns a really deep copy of *src, including the ratings MATRIX.
649 static WERD_RES* deep_copy(const WERD_RES* src) {
650 auto* result = new WERD_RES(*src);
651 // That didn't copy the ratings, but we want a copy if there is one to
652 // begin with.
653 if (src->ratings != nullptr)
654 result->ratings = src->ratings->DeepCopy();
655 return result;
656 }
657
658 // Copy blobs from word_res onto this word (eliminating spaces between).
659 // Since this may be called bidirectionally OR both the BOL and EOL flags.
660 void copy_on(WERD_RES *word_res) { //from this word
661 word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
662 word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
663 word->copy_on(word_res->word);
664 }
665
666 // Returns true if the collection of count pieces, starting at start, are all
667 // natural connected components, ie there are no real chops involved.
668 bool PiecesAllNatural(int start, int count) const;
669};
670
671/*************************************************************************
672 * PAGE_RES_IT - Page results iterator
673 *************************************************************************/
674
676 public:
677 PAGE_RES * page_res; // page being iterated
678
679 PAGE_RES_IT() = default;
680
681 PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
682 page_res = the_page_res;
683 restart_page(); // ready to scan
684 }
685
686 // Do two PAGE_RES_ITs point at the same word?
687 // This is much cheaper than cmp().
688 bool operator ==(const PAGE_RES_IT &other) const {
689 return word_res == other.word_res && row_res == other.row_res &&
690 block_res == other.block_res;
691 }
692
693 bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }
694
695 // Given another PAGE_RES_IT to the same page,
696 // this before other: -1
697 // this equal to other: 0
698 // this later than other: 1
699 int cmp(const PAGE_RES_IT &other) const;
700
702 return start_page(false); // Skip empty blocks.
703 }
705 return start_page(true); // Allow empty blocks.
706 }
707 WERD_RES *start_page(bool empty_ok);
708
710
711 // ============ Methods that mutate the underlying structures ===========
712 // Note that these methods will potentially invalidate other PAGE_RES_ITs
713 // and are intended to be used only while a single PAGE_RES_IT is active.
714 // This problem needs to be taken into account if these mutation operators
715 // are ever provided to PageIterator or its subclasses.
716
717 // Inserts the new_word and a corresponding WERD_RES before the current
718 // position. The simple fields of the WERD_RES are copied from clone_res and
719 // the resulting WERD_RES is returned for further setup with best_choice etc.
720 WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
721
722 // Replaces the current WERD/WERD_RES with the given words. The given words
723 // contain fake blobs that indicate the position of the characters. These are
724 // replaced with real blobs from the current word as much as possible.
726
727 // Deletes the current WERD_RES and its underlying WERD.
728 void DeleteCurrentWord();
729
730 // Makes the current word a fuzzy space if not already fuzzy. Updates
731 // corresponding part of combo if required.
733
734 WERD_RES *forward() { // Get next word.
735 return internal_forward(false, false);
736 }
737 // Move forward, but allow empty blocks to show as single nullptr words.
739 return internal_forward(false, true);
740 }
741
742 WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
743 WERD_RES *forward_block(); // get first word in next non-empty block
744
745 WERD_RES *prev_word() const { // previous word
746 return prev_word_res;
747 }
748 ROW_RES *prev_row() const { // row of prev word
749 return prev_row_res;
750 }
751 BLOCK_RES *prev_block() const { // block of prev word
752 return prev_block_res;
753 }
754 WERD_RES *word() const { // current word
755 return word_res;
756 }
757 ROW_RES *row() const { // row of current word
758 return row_res;
759 }
760 BLOCK_RES *block() const { // block of cur. word
761 return block_res;
762 }
763 WERD_RES *next_word() const { // next word
764 return next_word_res;
765 }
766 ROW_RES *next_row() const { // row of next word
767 return next_row_res;
768 }
769 BLOCK_RES *next_block() const { // block of next word
770 return next_block_res;
771 }
772 void rej_stat_word(); // for page/block/row
773 void ResetWordIterator();
774
775 private:
776 WERD_RES *internal_forward(bool new_block, bool empty_ok);
777
778 WERD_RES * prev_word_res; // previous word
779 ROW_RES *prev_row_res; // row of prev word
780 BLOCK_RES *prev_block_res; // block of prev word
781
782 WERD_RES *word_res; // current word
783 ROW_RES *row_res; // row of current word
784 BLOCK_RES *block_res; // block of cur. word
785
786 WERD_RES *next_word_res; // next word
787 ROW_RES *next_row_res; // row of next word
788 BLOCK_RES *next_block_res; // block of next word
789
790 BLOCK_RES_IT block_res_it; // iterators
791 ROW_RES_IT row_res_it;
792 WERD_RES_IT word_res_it;
793 // Iterators used to get the state of word_res_it for the current word.
794 // Since word_res_it is 2 words further on, this is otherwise hard to do.
795 WERD_RES_IT wr_it_of_current_word;
796 WERD_RES_IT wr_it_of_next_word;
797};
798#endif
@ IRR_NUM_REASONS
Definition: blamer.h:98
CRUNCH_MODE
Definition: pageres.h:157
@ CR_DELETE
Definition: pageres.h:161
@ CR_NONE
Definition: pageres.h:158
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:159
PermuterType
Definition: ratngs.h:232
@ W_EOL
end of line
Definition: werd.h:33
@ W_BOL
start of line
Definition: werd.h:32
#define CLISTIZEH(CLASSNAME)
Definition: clst.h:879
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:918
int UNICHAR_ID
Definition: unichar.h:34
int count(LIST var_list)
Definition: oldlist.cpp:95
void init_to_size(int size, const T &t)
Definition: blobs.h:418
Definition: matrix.h:578
MATRIX * DeepCopy() const
Definition: matrix.cpp:94
Definition: ocrblock.h:31
Definition: ocrrow.h:37
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:84
bool rejected
Definition: pageres.h:81
GenericVector< int > blame_reasons
Definition: pageres.h:86
PAGE_RES()
Definition: pageres.h:101
int32_t rej_count
Definition: pageres.h:79
~PAGE_RES()=default
void Init()
Definition: pageres.h:93
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:78
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
float x_height
Definition: pageres.h:121
int32_t rej_count
Definition: pageres.h:118
int16_t font_class
Definition: pageres.h:119
ROW_RES_LIST row_res_list
Definition: pageres.h:125
bool font_assigned
Definition: pageres.h:122
BLOCK_RES()=default
int32_t char_count
Definition: pageres.h:117
~BLOCK_RES()=default
BLOCK * block
Definition: pageres.h:116
int16_t row_count
Definition: pageres.h:120
int32_t whole_word_rej_count
Definition: pageres.h:143
int32_t rej_count
Definition: pageres.h:142
WERD_RES_LIST word_res_list
Definition: pageres.h:144
ROW_RES()=default
ROW * row
Definition: pageres.h:140
~ROW_RES()=default
int32_t char_count
Definition: pageres.h:141
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:224
const UNICHARSET * uch_set
Definition: pageres.h:203
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:740
WERD_RES()=default
bool tess_would_adapt
Definition: pageres.h:304
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:251
void CloneChoppedToRebuild()
Definition: pageres.cpp:835
DENORM denorm
Definition: pageres.h:201
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:877
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
bool guessed_x_ht
Definition: pageres.h:313
BlamerBundle * blamer_bundle
Definition: pageres.h:252
void SetupBlamerBundle()
Definition: pageres.cpp:393
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:385
bool done
Definition: pageres.h:305
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:363
int8_t fontinfo_id2_count
Definition: pageres.h:312
float space_certainty
Definition: pageres.h:321
const FontInfo * fontinfo
Definition: pageres.h:309
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:221
tesseract::BoxWord * box_word
Definition: pageres.h:272
GenericVector< int > blob_widths
Definition: pageres.h:216
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
bool combination
Definition: pageres.h:339
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:795
const FontInfo * fontinfo2
Definition: pageres.h:310
float x_height
Definition: pageres.h:316
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:898
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:226
bool AnyRtlCharsInWord() const
Definition: pageres.h:393
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:400
bool part_of_combo
Definition: pageres.h:340
bool guessed_caps_ht
Definition: pageres.h:314
void PrintBestChoices() const
Definition: pageres.cpp:717
bool IsAmbiguous()
Definition: pageres.cpp:452
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:765
void ClearResults()
Definition: pageres.cpp:1104
void SetupBoxWord()
Definition: pageres.cpp:849
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:730
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1008
int end
Definition: pageres.h:230
bool UnicharsInReadingOrder() const
Definition: pageres.h:427
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:865
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:513
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
WERD_RES(const WERD_RES &source)
Definition: pageres.h:350
void BestChoiceToCorrectText()
Definition: pageres.cpp:923
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
bool tess_failed
Definition: pageres.h:295
bool odd_size
Definition: pageres.h:307
void SetScriptPositions()
Definition: pageres.cpp:858
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
Definition: pageres.cpp:938
bool reject_spaces
Definition: pageres.h:341
WERD_RES(WERD *the_word)
Definition: pageres.h:345
void ClearWordChoices()
Definition: pageres.cpp:1129
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:499
bool tess_accepted
Definition: pageres.h:303
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:759
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:384
GenericVector< int > best_state
Definition: pageres.h:285
WERD_CHOICE * raw_choice
Definition: pageres.h:246
void fix_hyphens()
Definition: pageres.cpp:1047
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:974
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:188
bool small_caps
Definition: pageres.h:306
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:343
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1041
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1030
int8_t fontinfo_id_count
Definition: pageres.h:311
ROW * blob_row
Definition: pageres.h:197
bool AnyLtrCharsInWord() const
Definition: pageres.h:409
TWERD * chopped_word
Definition: pageres.h:212
void fix_quotes()
Definition: pageres.cpp:1018
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
REJMAP reject_map
Definition: pageres.h:294
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
GenericVector< STRING > correct_text
Definition: pageres.h:289
bool leading_space
Definition: pageres.h:228
void RebuildBestState()
Definition: pageres.cpp:808
GenericVector< int > blob_gaps
Definition: pageres.h:219
WERD_CHOICE * ep_choice
Definition: pageres.h:293
float caps_height
Definition: pageres.h:317
void merge_tess_fails()
Definition: pageres.cpp:1067
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195
bool StatesAllValid()
Definition: pageres.cpp:458
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:277
MATRIX * ratings
Definition: pageres.h:237
void Clear()
Definition: pageres.cpp:1094
void ClearRatings()
Definition: pageres.cpp:1137
const char * RawUTF8(int blob_index) const
Definition: pageres.h:376
WERD * word
Definition: pageres.h:186
float baseline_shift
Definition: pageres.h:318
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:750
WERD_RES * word() const
Definition: pageres.h:754
PAGE_RES_IT()=default
ROW_RES * row() const
Definition: pageres.h:757
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1145
WERD_RES * forward_block()
Definition: pageres.cpp:1660
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1473
void rej_stat_word()
Definition: pageres.cpp:1667
bool operator!=(const PAGE_RES_IT &other) const
Definition: pageres.h:693
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1333
void ResetWordIterator()
Definition: pageres.cpp:1523
BLOCK_RES * prev_block() const
Definition: pageres.h:751
WERD_RES * prev_word() const
Definition: pageres.h:745
ROW_RES * prev_row() const
Definition: pageres.h:748
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1645
WERD_RES * next_word() const
Definition: pageres.h:763
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1500
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.h:688
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * restart_row()
Definition: pageres.cpp:1630
WERD_RES * forward()
Definition: pageres.h:734
ROW_RES * next_row() const
Definition: pageres.h:766
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1213
WERD_RES * restart_page_with_empties()
Definition: pageres.h:704
PAGE_RES_IT(PAGE_RES *the_page_res)
Definition: pageres.h:681
WERD_RES * forward_with_empties()
Definition: pageres.h:738
void DeleteCurrentWord()
Definition: pageres.cpp:1440
BLOCK_RES * next_block() const
Definition: pageres.h:769
bool unichars_in_script_order() const
Definition: ratngs.h:525
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
int length() const
Definition: ratngs.h:293
Definition: rect.h:34
Definition: seam.h:38
Definition: werd.h:56
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
void copy_on(WERD *other)
Definition: werd.cpp:221
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
@ U_ARABIC_NUMBER
Definition: unicharset.h:162
@ U_OTHER_NEUTRAL
Definition: unicharset.h:167
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:157
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
int size() const
Definition: unicharset.h:341