tesseract 4.1.1
Loading...
Searching...
No Matches
ratngs.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: ratngs.h (Formerly ratings.h)
3 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
4 * Author: Ray Smith
5 * Created: Thu Apr 23 11:40:38 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#ifndef RATNGS_H
21#define RATNGS_H
22
23#include <cassert>
24#include <cfloat> // for FLT_MAX
25
26#include "clst.h"
27#include "elst.h"
28#ifndef DISABLED_LEGACY_ENGINE
29#include "fontinfo.h"
30#endif // ndef DISABLED_LEGACY_ENGINE
31#include "genericvector.h"
32#include "matrix.h"
33#include "unichar.h"
34#include "unicharset.h"
35#include "werd.h"
36
37class MATRIX;
38struct TBLOB;
39struct TWERD;
40
41// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
42// whether a blob has been classified by inspecting the BLOB_CHOICEs.
44 BCC_STATIC_CLASSIFIER, // From the char_norm classifier.
45 BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
46 BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
47 BCC_AMBIG, // Generated by ambiguity detection.
48 BCC_FAKE, // From some other process.
49};
50
52{
53 public:
55 unichar_id_ = UNICHAR_SPACE;
56 fontinfo_id_ = -1;
57 fontinfo_id2_ = -1;
58 rating_ = 10.0;
59 certainty_ = -1.0;
60 script_id_ = -1;
61 min_xheight_ = 0.0f;
62 max_xheight_ = 0.0f;
63 yshift_ = 0.0f;
64 classifier_ = BCC_FAKE;
65 }
66 BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
67 float src_rating, // rating
68 float src_cert, // certainty
69 int script_id, // script
70 float min_xheight, // min xheight in image pixel units
71 float max_xheight, // max xheight allowed by this char
72 float yshift, // the larger of y shift (top or bottom)
73 BlobChoiceClassifier c); // adapted match or other
74 BLOB_CHOICE(const BLOB_CHOICE &other);
75 ~BLOB_CHOICE() = default;
76
78 return unichar_id_;
79 }
80 float rating() const {
81 return rating_;
82 }
83 float certainty() const {
84 return certainty_;
85 }
86 int16_t fontinfo_id() const {
87 return fontinfo_id_;
88 }
89 int16_t fontinfo_id2() const {
90 return fontinfo_id2_;
91 }
92 #ifndef DISABLED_LEGACY_ENGINE
94 return fonts_;
95 }
97 fonts_ = fonts;
98 int score1 = 0, score2 = 0;
99 fontinfo_id_ = -1;
100 fontinfo_id2_ = -1;
101 for (int f = 0; f < fonts_.size(); ++f) {
102 if (fonts_[f].score > score1) {
103 score2 = score1;
104 fontinfo_id2_ = fontinfo_id_;
105 score1 = fonts_[f].score;
106 fontinfo_id_ = fonts_[f].fontinfo_id;
107 } else if (fonts_[f].score > score2) {
108 score2 = fonts_[f].score;
109 fontinfo_id2_ = fonts_[f].fontinfo_id;
110 }
111 }
112 }
113 #endif // ndef DISABLED_LEGACY_ENGINE
114 int script_id() const {
115 return script_id_;
116 }
118 return matrix_cell_;
119 }
120 float min_xheight() const {
121 return min_xheight_;
122 }
123 float max_xheight() const {
124 return max_xheight_;
125 }
126 float yshift() const {
127 return yshift_;
128 }
130 return classifier_;
131 }
132 bool IsAdapted() const {
133 return classifier_ == BCC_ADAPTED_CLASSIFIER;
134 }
135 bool IsClassified() const {
136 return classifier_ == BCC_STATIC_CLASSIFIER ||
137 classifier_ == BCC_ADAPTED_CLASSIFIER ||
138 classifier_ == BCC_SPECKLE_CLASSIFIER;
139 }
140
141 void set_unichar_id(UNICHAR_ID newunichar_id) {
142 unichar_id_ = newunichar_id;
143 }
144 void set_rating(float newrat) {
145 rating_ = newrat;
146 }
147 void set_certainty(float newrat) {
148 certainty_ = newrat;
149 }
150 void set_script(int newscript_id) {
151 script_id_ = newscript_id;
152 }
153 void set_matrix_cell(int col, int row) {
154 matrix_cell_.col = col;
155 matrix_cell_.row = row;
156 }
158 classifier_ = classifier;
159 }
160 static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
161 auto* choice = new BLOB_CHOICE;
162 *choice = *src;
163 return choice;
164 }
165 // Returns true if *this and other agree on the baseline and x-height
166 // to within some tolerance based on a given estimate of the x-height.
167 bool PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
168 bool debug) const;
169
170 void print(const UNICHARSET *unicharset) const {
171 tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
172 rating_, certainty_,
173 min_xheight_, max_xheight_, unichar_id_,
174 (unicharset == nullptr) ? "" :
175 unicharset->debug_str(unichar_id_).string());
176 }
177 void print_full() const {
178 print(nullptr);
179 tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n",
180 script_id_, fontinfo_id_, fontinfo_id2_, yshift_, classifier_);
181 }
182 // Sort function for sorting BLOB_CHOICEs in increasing order of rating.
183 static int SortByRating(const void *p1, const void *p2) {
184 const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);
185 const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);
186 return (bc1->rating_ < bc2->rating_) ? -1 : 1;
187 }
188
189 private:
190 // Copy assignment operator.
191 BLOB_CHOICE& operator=(const BLOB_CHOICE& other);
192
193 UNICHAR_ID unichar_id_; // unichar id
194#ifndef DISABLED_LEGACY_ENGINE
195 // Fonts and scores. Allowed to be empty.
197#endif // ndef DISABLED_LEGACY_ENGINE
198 int16_t fontinfo_id_; // char font information
199 int16_t fontinfo_id2_; // 2nd choice font information
200 // Rating is the classifier distance weighted by the length of the outline
201 // in the blob. In terms of probability, classifier distance is -klog p such
202 // that the resulting distance is in the range [0, 1] and then
203 // rating = w (-k log p) where w is the weight for the length of the outline.
204 // Sums of ratings may be compared meaningfully for words of different
205 // segmentation.
206 float rating_; // size related
207 // Certainty is a number in [-20, 0] indicating the classifier certainty
208 // of the choice. In terms of probability, certainty = 20 (k log p) where
209 // k is defined as above to normalize -klog p to the range [0, 1].
210 float certainty_; // absolute
211 int script_id_;
212 // Holds the position of this choice in the ratings matrix.
213 // Used to locate the position in the matrix during path backtracking.
214 MATRIX_COORD matrix_cell_;
215 // X-height range (in image pixels) that this classification supports.
216 float min_xheight_;
217 float max_xheight_;
218 // yshift_ - The vertical distance (in image pixels) the character is
219 // shifted (up or down) from an acceptable y position.
220 float yshift_;
221 BlobChoiceClassifier classifier_; // What generated *this.
222};
223
224// Make BLOB_CHOICE listable.
226
227// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
228// or nullptr if there is no match.
229BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
230
231// Permuter codes used in WERD_CHOICEs.
246
249
250namespace tesseract {
251// ScriptPos tells whether a character is subscript, superscript or normal.
258
259const char *ScriptPosToString(tesseract::ScriptPos script_pos);
260
261} // namespace tesseract.
262
263class WERD_CHOICE : public ELIST_LINK {
264 public:
265 static const float kBadRating;
266 static const char *permuter_name(uint8_t permuter);
267
269 : unicharset_(unicharset) { this->init(8); }
270 WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
271 : unicharset_(unicharset) { this->init(reserved); }
272 WERD_CHOICE(const char *src_string,
273 const char *src_lengths,
274 float src_rating,
275 float src_certainty,
276 uint8_t src_permuter,
277 const UNICHARSET &unicharset)
278 : unicharset_(&unicharset) {
279 this->init(src_string, src_lengths, src_rating,
280 src_certainty, src_permuter);
281 }
282 WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
284 : ELIST_LINK(word), unicharset_(word.unicharset_) {
285 this->init(word.length());
286 this->operator=(word);
287 }
288 ~WERD_CHOICE();
289
290 const UNICHARSET *unicharset() const {
291 return unicharset_;
292 }
293 inline int length() const {
294 return length_;
295 }
296 float adjust_factor() const {
297 return adjust_factor_;
298 }
299 void set_adjust_factor(float factor) {
300 adjust_factor_ = factor;
301 }
302 inline const UNICHAR_ID *unichar_ids() const {
303 return unichar_ids_;
304 }
305 inline UNICHAR_ID unichar_id(int index) const {
306 assert(index < length_);
307 return unichar_ids_[index];
308 }
309 inline int state(int index) const {
310 return state_[index];
311 }
313 if (index < 0 || index >= length_)
315 return script_pos_[index];
316 }
317 inline float rating() const {
318 return rating_;
319 }
320 inline float certainty() const {
321 return certainty_;
322 }
323 inline float certainty(int index) const {
324 return certainties_[index];
325 }
326 inline float min_x_height() const {
327 return min_x_height_;
328 }
329 inline float max_x_height() const {
330 return max_x_height_;
331 }
332 inline void set_x_heights(float min_height, float max_height) {
333 min_x_height_ = min_height;
334 max_x_height_ = max_height;
335 }
336 inline uint8_t permuter() const {
337 return permuter_;
338 }
339 const char *permuter_name() const;
340 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
341 // taken from the appropriate cell in the ratings MATRIX.
342 // Borrowed pointer, so do not delete.
343 BLOB_CHOICE_LIST* blob_choices(int index, MATRIX* ratings) const;
344
345 // Returns the MATRIX_COORD corresponding to the location in the ratings
346 // MATRIX for the given index into the word.
347 MATRIX_COORD MatrixCoord(int index) const;
348
349 inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
350 assert(index < length_);
351 unichar_ids_[index] = unichar_id;
352 }
354 return dangerous_ambig_found_;
355 }
356 void set_dangerous_ambig_found_(bool value) {
357 dangerous_ambig_found_ = value;
358 }
359 inline void set_rating(float new_val) {
360 rating_ = new_val;
361 }
362 inline void set_certainty(float new_val) {
363 certainty_ = new_val;
364 }
365 inline void set_permuter(uint8_t perm) {
366 permuter_ = perm;
367 }
368 // Note: this function should only be used if all the fields
369 // are populated manually with set_* functions (rather than
370 // (copy)constructors and append_* functions).
371 inline void set_length(int len) {
372 ASSERT_HOST(reserved_ >= len);
373 length_ = len;
374 }
375
377 inline void double_the_size() {
378 if (reserved_ > 0) {
380 reserved_, unichar_ids_);
382 reserved_, script_pos_);
384 reserved_, state_);
386 reserved_, certainties_);
387 reserved_ *= 2;
388 } else {
389 unichar_ids_ = new UNICHAR_ID[1];
390 script_pos_ = new tesseract::ScriptPos[1];
391 state_ = new int[1];
392 certainties_ = new float[1];
393 reserved_ = 1;
394 }
395 }
396
399 inline void init(int reserved) {
400 reserved_ = reserved;
401 if (reserved > 0) {
402 unichar_ids_ = new UNICHAR_ID[reserved];
403 script_pos_ = new tesseract::ScriptPos[reserved];
404 state_ = new int[reserved];
405 certainties_ = new float[reserved];
406 } else {
407 unichar_ids_ = nullptr;
408 script_pos_ = nullptr;
409 state_ = nullptr;
410 certainties_ = nullptr;
411 }
412 length_ = 0;
413 adjust_factor_ = 1.0f;
414 rating_ = 0.0;
415 certainty_ = FLT_MAX;
416 min_x_height_ = 0.0f;
417 max_x_height_ = FLT_MAX;
418 permuter_ = NO_PERM;
419 unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
420 dangerous_ambig_found_ = false;
421 }
422
428 void init(const char *src_string, const char *src_lengths,
429 float src_rating, float src_certainty,
430 uint8_t src_permuter);
431
433 inline void make_bad() {
434 length_ = 0;
435 rating_ = kBadRating;
436 certainty_ = -FLT_MAX;
437 }
438
443 UNICHAR_ID unichar_id, int blob_count,
444 float rating, float certainty) {
445 assert(reserved_ > length_);
446 length_++;
447 this->set_unichar_id(unichar_id, blob_count,
448 rating, certainty, length_-1);
449 }
450
451 void append_unichar_id(UNICHAR_ID unichar_id, int blob_count,
452 float rating, float certainty);
453
454 inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count,
455 float rating, float certainty, int index) {
456 assert(index < length_);
457 unichar_ids_[index] = unichar_id;
458 state_[index] = blob_count;
459 certainties_[index] = certainty;
460 script_pos_[index] = tesseract::SP_NORMAL;
461 rating_ += rating;
462 if (certainty < certainty_) {
463 certainty_ = certainty;
464 }
465 }
466 // Sets the entries for the given index from the BLOB_CHOICE, assuming
467 // unit fragment lengths, but setting the state for this index to blob_count.
468 void set_blob_choice(int index, int blob_count,
469 const BLOB_CHOICE* blob_choice);
470
472 void remove_unichar_ids(int index, int num);
473 inline void remove_last_unichar_id() { --length_; }
474 inline void remove_unichar_id(int index) {
475 this->remove_unichar_ids(index, 1);
476 }
477 bool has_rtl_unichar_id() const;
479
480 // Returns the half-open interval of unichar_id indices [start, end) which
481 // enclose the core portion of this word -- the part after stripping
482 // punctuation from the left and right.
483 void punct_stripped(int *start_core, int *end_core) const;
484
485 // Returns the indices [start, end) containing the core of the word, stripped
486 // of any superscript digits on either side. (i.e., the non-footnote part
487 // of the word). There is no guarantee that the output range is non-empty.
488 void GetNonSuperscriptSpan(int *start, int *end) const;
489
490 // Return a copy of this WERD_CHOICE with the choices [start, end).
491 // The result is useful only for checking against a dictionary.
492 WERD_CHOICE shallow_copy(int start, int end) const;
493
494 void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
495 const STRING debug_string() const {
496 STRING word_str;
497 for (int i = 0; i < length_; ++i) {
498 word_str += unicharset_->debug_str(unichar_ids_[i]);
499 word_str += " ";
500 }
501 return word_str;
502 }
503 // Returns true if any unichar_id in the word is a non-space-delimited char.
505 for (int i = 0; i < length_; ++i) {
506 if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
507 }
508 return false;
509 }
510 // Returns true if the word is all spaces.
511 bool IsAllSpaces() const {
512 for (int i = 0; i < length_; ++i) {
513 if (unichar_ids_[i] != UNICHAR_SPACE) return false;
514 }
515 return true;
516 }
517
518 // Call this to override the default (strict left to right graphemes)
519 // with the fact that some engine produces a "reading order" set of
520 // Graphemes for each word.
521 bool set_unichars_in_script_order(bool in_script_order) {
522 return unichars_in_script_order_ = in_script_order;
523 }
524
526 return unichars_in_script_order_;
527 }
528
529 // Returns a UTF-8 string equivalent to the current choice
530 // of UNICHAR IDs.
531 const STRING &unichar_string() const {
532 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
533 return unichar_string_;
534 }
535
536 // Returns the lengths, one byte each, representing the number of bytes
537 // required in the unichar_string for each UNICHAR_ID.
538 const STRING &unichar_lengths() const {
539 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
540 return unichar_lengths_;
541 }
542
543 // Sets up the script_pos_ member using the blobs_list to get the bln
544 // bounding boxes, *this to get the unichars, and this->unicharset
545 // to get the target positions. If small_caps is true, sub/super are not
546 // considered, but dropcaps are.
547 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
548 void SetScriptPositions(bool small_caps, TWERD* word, int debug = 0);
549 // Sets the script_pos_ member from some source positions with a given length.
550 void SetScriptPositions(const tesseract::ScriptPos* positions, int length);
551 // Sets all the script_pos_ positions to the given position.
553
554 static tesseract::ScriptPos ScriptPositionOf(bool print_debug,
555 const UNICHARSET& unicharset,
556 const TBOX& blob_box,
558
559 // Returns the "dominant" script ID for the word. By "dominant", the script
560 // must account for at least half the characters. Otherwise, it returns 0.
561 // Note that for Japanese, Hiragana and Katakana are simply treated as Han.
562 int GetTopScriptID() const;
563
564 // Fixes the state_ for a chop at the given blob_position.
565 void UpdateStateForSplit(int blob_position);
566
567 // Returns the sum of all the state elements, being the total number of blobs.
568 int TotalOfStates() const;
569
570 void print() const { this->print(""); }
571 void print(const char *msg) const;
572 // Prints the segmentation state with an introductory message.
573 void print_state(const char *msg) const;
574
575 // Displays the segmentation state of *this (if not the same as the last
576 // one displayed) and waits for a click in the window.
577 void DisplaySegmentation(TWERD* word);
578
// Concatenation operator: joins `second` onto this word choice.
579 WERD_CHOICE& operator+= ( // concatenate
580 const WERD_CHOICE & second);// second on first
581
582 WERD_CHOICE& operator= (const WERD_CHOICE& source);
583
584 private:
585 const UNICHARSET *unicharset_;
586 // TODO(rays) Perhaps replace the multiple arrays with an array of structs?
587 // unichar_ids_ is an array of classifier "results" that make up a word.
588 // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
589 // of each unichar_id.
590 // state_[i] indicates the number of blobs in WERD_RES::chopped_word that
591 // were put together to make the classification results in the ith position
592 // in unichar_ids_, and certainties_[i] is the certainty of the choice that
593 // was used in this word.
594 // == Change from before ==
595 // Previously there was fragment_lengths_ that allowed a word to be
596 // artificially composed of multiple fragment results. Since the new
597 // segmentation search doesn't do fragments, treatment of fragments has
598 // been moved to a lower level, augmenting the ratings matrix with the
599 // combined fragments, and allowing the language-model/segmentation-search
600 // to deal with only the combined unichar_ids.
601 UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
602 tesseract::ScriptPos* script_pos_; // Normal/Sub/Superscript of each unichar.
603 int* state_; // Number of blobs in each unichar.
604 float* certainties_; // Certainty of each unichar.
605 int reserved_; // size of the above arrays
606 int length_; // word length
607 // Factor that was used to adjust the rating.
608 float adjust_factor_;
609 // Rating is the sum of the ratings of the individual blobs in the word.
610 float rating_; // size related
611 // certainty is the min (worst) certainty of the individual blobs in the word.
612 float certainty_; // absolute
613 // xheight computed from the result, or 0 if inconsistent.
614 float min_x_height_;
615 float max_x_height_;
616 uint8_t permuter_; // permuter code
617
618 // Normally, the ratings_ matrix represents the recognition results in order
619 // from left-to-right. However, some engines (say Cube) may return
620 // recognition results in the order of the script's major reading direction
621 // (for Arabic, that is right-to-left).
622 bool unichars_in_script_order_;
623 // True if NoDangerousAmbig found an ambiguity.
624 bool dangerous_ambig_found_;
625
626 // The following variables are populated and passed by reference any
627 // time unichar_string() or unichar_lengths() are called.
628 mutable STRING unichar_string_;
629 mutable STRING unichar_lengths_;
630};
631
632// Make WERD_CHOICE listable.
634using BLOB_CHOICE_LIST_VECTOR = GenericVector<BLOB_CHOICE_LIST *>;
635
636// Utilities for comparing WERD_CHOICEs
637
639 const WERD_CHOICE &word2);
640
641// Utilities for debug printing.
643 const char *msg, // intro message
644 BLOB_CHOICE_LIST *ratings, // list of results
645 const UNICHARSET &current_unicharset // unicharset that can be used
646 // for id-to-unichar conversion
647 );
648
649#endif
PermuterType
Definition: ratngs.h:232
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
@ USER_PATTERN_PERM
Definition: ratngs.h:240
@ DOC_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NGRAM_PERM
Definition: ratngs.h:238
@ NUMBER_PERM
Definition: ratngs.h:239
@ PUNC_PERM
Definition: ratngs.h:234
@ LOWER_CASE_PERM
Definition: ratngs.h:236
@ UPPER_CASE_PERM
Definition: ratngs.h:237
@ NUM_PERMUTER_TYPES
Definition: ratngs.h:247
@ COMPOUND_PERM
Definition: ratngs.h:245
@ NO_PERM
Definition: ratngs.h:233
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:809
BlobChoiceClassifier
Definition: ratngs.h:43
@ BCC_ADAPTED_CLASSIFIER
Definition: ratngs.h:45
@ BCC_FAKE
Definition: ratngs.h:48
@ BCC_SPECKLE_CLASSIFIER
Definition: ratngs.h:46
@ BCC_AMBIG
Definition: ratngs.h:47
@ BCC_STATIC_CLASSIFIER
Definition: ratngs.h:44
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:918
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_SPACE
Definition: unicharset.h:34
@ SP_SUBSCRIPT
Definition: ratngs.h:254
@ SP_DROPCAP
Definition: ratngs.h:256
@ SP_NORMAL
Definition: ratngs.h:253
@ SP_SUPERSCRIPT
Definition: ratngs.h:255
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:204
int size() const
Definition: genericvector.h:72
static T * double_the_size_memcpy(int current_size, T *data)
Definition: blobs.h:284
Definition: blobs.h:418
Definition: matrix.h:578
float max_xheight() const
Definition: ratngs.h:123
void set_script(int newscript_id)
Definition: ratngs.h:150
void set_rating(float newrat)
Definition: ratngs.h:144
float certainty() const
Definition: ratngs.h:83
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:96
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:157
float yshift() const
Definition: ratngs.h:126
~BLOB_CHOICE()=default
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:93
static BLOB_CHOICE * deep_copy(const BLOB_CHOICE *src)
Definition: ratngs.h:160
int16_t fontinfo_id2() const
Definition: ratngs.h:89
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:117
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:156
int script_id() const
Definition: ratngs.h:114
int16_t fontinfo_id() const
Definition: ratngs.h:86
bool IsAdapted() const
Definition: ratngs.h:132
void print(const UNICHARSET *unicharset) const
Definition: ratngs.h:170
float rating() const
Definition: ratngs.h:80
BLOB_CHOICE()
Definition: ratngs.h:54
BlobChoiceClassifier classifier() const
Definition: ratngs.h:129
void set_matrix_cell(int col, int row)
Definition: ratngs.h:153
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
float min_xheight() const
Definition: ratngs.h:120
bool IsClassified() const
Definition: ratngs.h:135
void print_full() const
Definition: ratngs.h:177
void set_certainty(float newrat)
Definition: ratngs.h:147
static int SortByRating(const void *p1, const void *p2)
Definition: ratngs.h:183
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:141
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:306
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:332
void set_certainty(float new_val)
Definition: ratngs.h:362
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:418
WERD_CHOICE(const WERD_CHOICE &word)
Definition: ratngs.h:283
float certainty(int index) const
Definition: ratngs.h:323
const STRING debug_string() const
Definition: ratngs.h:495
bool set_unichars_in_script_order(bool in_script_order)
Definition: ratngs.h:521
void init(int reserved)
Definition: ratngs.h:399
void remove_unichar_id(int index)
Definition: ratngs.h:474
int state(int index) const
Definition: ratngs.h:309
float adjust_factor() const
Definition: ratngs.h:296
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:554
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:703
const STRING & unichar_string() const
Definition: ratngs.h:531
int TotalOfStates() const
Definition: ratngs.cpp:715
void set_adjust_factor(float factor)
Definition: ratngs.h:299
bool IsAllSpaces() const
Definition: ratngs.h:511
void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
Definition: ratngs.h:454
void remove_last_unichar_id()
Definition: ratngs.h:473
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:377
bool unichars_in_script_order() const
Definition: ratngs.h:525
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:633
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
int GetTopScriptID() const
Definition: ratngs.cpp:671
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:525
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:387
bool dangerous_ambig_found() const
Definition: ratngs.h:353
void print_state(const char *msg) const
Definition: ratngs.cpp:756
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:330
void set_rating(float new_val)
Definition: ratngs.h:359
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
const char * permuter_name() const
Definition: ratngs.cpp:287
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:489
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:765
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:627
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:294
static const float kBadRating
Definition: ratngs.h:265
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:435
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:504
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:346
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:316
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:369
float min_x_height() const
Definition: ratngs.h:326
float certainty() const
Definition: ratngs.h:320
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:268
int length() const
Definition: ratngs.h:293
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:312
float max_x_height() const
Definition: ratngs.h:329
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:302
~WERD_CHOICE()
Definition: ratngs.cpp:280
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:356
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:401
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void print() const
Definition: ratngs.h:570
const STRING & unichar_lengths() const
Definition: ratngs.h:538
void set_length(int len)
Definition: ratngs.h:371
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
float rating() const
Definition: ratngs.h:317
WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
Definition: ratngs.h:270
WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
Definition: ratngs.h:272
Definition: rect.h:34
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:652
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343