tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const STRING &lang, TessdataManager *data_file)
 
void LoadLSTM (const STRING &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (i.e. not Chinese, Japanese, or Thai). More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static TESS_API DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 91 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 30 of file dict.cpp.

34 ccutil_(ccutil),
35 wildcard_unichar_id_(INVALID_UNICHAR_ID),
36 apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37 question_unichar_id_(INVALID_UNICHAR_ID),
38 slash_unichar_id_(INVALID_UNICHAR_ID),
39 hyphen_unichar_id_(INVALID_UNICHAR_ID),
40 STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41 getCCUtil()->params()),
43 "A suffix of user-provided words located in tessdata.",
44 getCCUtil()->params()),
46 "A filename of user-provided patterns.",
47 getCCUtil()->params()),
49 "A suffix of user-provided patterns located in "
50 "tessdata.",
51 getCCUtil()->params()),
52 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53 getCCUtil()->params()),
54 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55 getCCUtil()->params()),
56 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57 getCCUtil()->params()),
59 "Load dawg with punctuation"
60 " patterns.",
61 getCCUtil()->params()),
63 "Load dawg with number"
64 " patterns.",
65 getCCUtil()->params()),
67 "Load dawg with special word "
68 "bigrams.",
69 getCCUtil()->params()),
71 "Score penalty (0.1 = 10%) added if there are subscripts "
72 "or superscripts in a word, but it is otherwise OK.",
73 getCCUtil()->params()),
75 "Score penalty (0.1 = 10%) added if an xheight is "
76 "inconsistent.",
77 getCCUtil()->params()),
79 "Score multiplier for word matches which have good case and"
80 " are frequent in the given language (lower is better).",
81 getCCUtil()->params()),
83 "Score multiplier for word matches that have good case "
84 "(lower is better).",
85 getCCUtil()->params()),
87 "Default score multiplier for word matches, which may have "
88 "case issues (lower is better).",
89 getCCUtil()->params()),
91 "Score multiplier for glyph fragment segmentations which "
92 "do not match a dictionary word (lower is better).",
93 getCCUtil()->params()),
95 "Score multiplier for poorly cased strings that are not in"
96 " the dictionary and generally look like garbage (lower is"
97 " better).",
98 getCCUtil()->params()),
100 "Output file for ambiguities found in the dictionary",
101 getCCUtil()->params()),
103 "Set to 1 for general debug info"
104 ", to 2 for more details, to 3 to see all the debug messages",
105 getCCUtil()->params()),
106 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107 getCCUtil()->params()),
109 "Use only the first UTF8 step of the given string"
110 " when computing log probabilities.",
111 getCCUtil()->params()),
112 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
113 getCCUtil()->params()),
115 "Certainty threshold for non-dict words",
116 getCCUtil()->params()),
118 "Reject certainty offset", getCCUtil()->params()),
120 "Size of dict word to be treated as non-dict word",
121 getCCUtil()->params()),
123 "Certainty to add"
124 " for each dict char above small word size.",
125 getCCUtil()->params()),
127 "Max certaintly variation allowed in a word (in sigma)",
128 getCCUtil()->params()),
129 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
130 getCCUtil()->params()),
132 "Make AcceptableChoice() always return false. Useful"
133 " when there is a need to explore all segmentations",
134 getCCUtil()->params()),
136 "Max words to keep in list", getCCUtil()->params()),
138 "Word for which stopper debug"
139 " information should be printed to stdout",
140 getCCUtil()->params()),
142 "Don't use any alphabetic-specific tricks."
143 " Set to true in the traineddata config file for"
144 " scripts that are cursive or inherently fixed-pitch",
145 getCCUtil()->params()),
146 BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
147 getCCUtil()->params()),
149 "Worst certainty for using pending dictionary",
150 getCCUtil()->params()),
152 "Worst certainty for words that can be inserted into the"
153 " document dictionary",
154 getCCUtil()->params()),
156 "Maximum number of different"
157 " character choices to consider during permutation."
158 " This limit is especially useful when user patterns"
159 " are specified, since overly generic patterns can result in"
160 " dawg search exploring an overly large number of options.",
161 getCCUtil()->params()) {
162 reject_offset_ = 0.0;
163 go_deeper_fxn_ = nullptr;
164 hyphen_word_ = nullptr;
165 last_word_on_line_ = false;
166 document_words_ = nullptr;
167 dawg_cache_ = nullptr;
168 dawg_cache_is_ours_ = false;
169 pending_words_ = nullptr;
170 bigram_dawg_ = nullptr;
171 freq_dawg_ = nullptr;
172 punc_dawg_ = nullptr;
173 unambig_dawg_ = nullptr;
174 wordseg_rating_adjust_factor_ = -1.0f;
175 output_ambig_words_file_ = nullptr;
176}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:333
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
bool save_doc_words
Definition: dict.h:649
double certainty_scale
Definition: dict.h:627
double xheight_penalty_subscripts
Definition: dict.h:595
double stopper_allowable_character_badness
Definition: dict.h:637
int dawg_debug_level
Definition: dict.h:622
double doc_dict_certainty_threshold
Definition: dict.h:653
double segment_penalty_dict_case_ok
Definition: dict.h:605
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372
const CCUtil * getCCUtil() const
Definition: dict.h:95
double segment_penalty_dict_case_bad
Definition: dict.h:609
bool load_punc_dawg
Definition: dict.h:589
bool stopper_no_acceptable_choices
Definition: dict.h:641
double segment_penalty_dict_nonword
Definition: dict.h:613
char * user_patterns_suffix
Definition: dict.h:584
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:418
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
double segment_penalty_garbage
Definition: dict.h:618
double segment_penalty_dict_frequent_word
Definition: dict.h:601
bool load_number_dawg
Definition: dict.h:590
bool load_freq_dawg
Definition: dict.h:586
int max_permuter_attempts
Definition: dict.h:658
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:631
bool load_unambig_dawg
Definition: dict.h:587
double stopper_nondict_certainty_base
Definition: dict.h:629
bool segment_nonalphabetic_script
Definition: dict.h:648
double xheight_penalty_inconsistent
Definition: dict.h:598
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:401
bool use_only_first_uft8_step
Definition: dict.h:626
char * user_words_file
Definition: dict.h:578
char * word_to_debug
Definition: dict.h:644
bool load_system_dawg
Definition: dict.h:585
double stopper_certainty_per_char
Definition: dict.h:635
int stopper_debug_level
Definition: dict.h:638
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:384
double doc_dict_pending_threshold
Definition: dict.h:651
char * output_ambig_words_file
Definition: dict.h:620
char * user_words_suffix
Definition: dict.h:580
bool load_bigram_dawg
Definition: dict.h:592
int hyphen_debug_level
Definition: dict.h:623
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:395
char * user_patterns_file
Definition: dict.h:582
int stopper_smallword_size
Definition: dict.h:633

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 178 of file dict.cpp.

178 {
179 End();
180 delete hyphen_word_;
181 if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
182}
void End()
Definition: dict.cpp:372

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 65 of file context.cpp.

66 {
67 if (word.length() < kMinAbsoluteGarbageWordLength) return false;
68 int num_alphanum = 0;
69 for (int x = 0; x < word.length(); ++x) {
70 num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
71 unicharset.get_isdigit(word.unichar_id(x)));
72 }
73 return (static_cast<float>(num_alphanum) /
74 static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
75}
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
int length() const
Definition: ratngs.h:293
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 42 of file stopper.cpp.

43 {
44 float CertaintyThreshold = stopper_nondict_certainty_base;
45 int WordSize;
46
47 if (stopper_no_acceptable_choices) return false;
48
49 if (best_choice.length() == 0) return false;
50
51 bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
52 bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
53 bool is_case_ok = case_ok(best_choice);
54
55 if (stopper_debug_level >= 1) {
56 const char *xht = "UNKNOWN";
57 switch (xheight_consistency) {
58 case XH_GOOD: xht = "NORMAL"; break;
59 case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
60 case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
61 default: xht = "UNKNOWN";
62 }
63 tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
64 best_choice.unichar_string().string(),
65 (is_valid_word ? 'y' : 'n'),
66 (is_case_ok ? 'y' : 'n'),
67 xht,
68 best_choice.min_x_height(),
69 best_choice.max_x_height());
70 }
71 // Do not accept invalid words in PASS1.
72 if (reject_offset_ <= 0.0f && !is_valid_word) return false;
73 if (is_valid_word && is_case_ok) {
74 WordSize = LengthOfShortestAlphaRun(best_choice);
75 WordSize -= stopper_smallword_size;
76 if (WordSize < 0)
77 WordSize = 0;
78 CertaintyThreshold += WordSize * stopper_certainty_per_char;
79 }
80
81 if (stopper_debug_level >= 1)
82 tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
83 best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
84
85 if (no_dang_ambigs &&
86 best_choice.certainty() > CertaintyThreshold &&
87 xheight_consistency < XH_INCONSISTENT &&
88 UniformCertainties(best_choice)) {
89 return true;
90 } else {
91 if (stopper_debug_level >= 1) {
92 tprintf("AcceptableChoice() returned false"
93 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
94 no_dang_ambigs, best_choice.certainty(),
95 CertaintyThreshold,
96 UniformCertainties(best_choice));
97 }
98 return false;
99 }
100}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ XH_GOOD
Definition: dict.h:78
@ XH_SUBNORMAL
Definition: dict.h:78
@ XH_INCONSISTENT
Definition: dict.h:78
const STRING & unichar_string() const
Definition: ratngs.h:531
bool dangerous_ambig_found() const
Definition: ratngs.h:353
uint8_t permuter() const
Definition: ratngs.h:336
float min_x_height() const
Definition: ratngs.h:326
float certainty() const
Definition: ratngs.h:320
float max_x_height() const
Definition: ratngs.h:329
float rating() const
Definition: ratngs.h:317
const char * string() const
Definition: strngs.cpp:194
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:465
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:446
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 102 of file stopper.cpp.

102 {
103 if (word->best_choice == nullptr) return false;
104 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
105 int WordSize;
106
107 if (stopper_debug_level >= 1) {
108 tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
109 word->best_choice->debug_string().string(),
110 (valid_word(*word->best_choice) ? 'y' : 'n'),
111 (case_ok(*word->best_choice) ? 'y' : 'n'),
112 word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
113 word->best_choices.singleton() ? 'n' : 'y');
114 }
115
116 if (word->best_choice->length() == 0 || !word->best_choices.singleton())
117 return false;
118 if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
119 WordSize = LengthOfShortestAlphaRun(*word->best_choice);
120 WordSize -= stopper_smallword_size;
121 if (WordSize < 0)
122 WordSize = 0;
123 CertaintyThreshold += WordSize * stopper_certainty_per_char;
124 }
125
126 if (stopper_debug_level >= 1)
127 tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
128 word->best_choice->certainty(), CertaintyThreshold);
129
130 if (word->best_choice->certainty() > CertaintyThreshold &&
131 !stopper_no_acceptable_choices) {
132 if (stopper_debug_level >= 1)
133 tprintf("ACCEPTED\n");
134 return true;
135 } else {
136 if (stopper_debug_level >= 1)
137 tprintf("REJECTED\n");
138 return false;
139 }
140}
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
WERD_CHOICE * best_choice
Definition: pageres.h:241
const STRING debug_string() const
Definition: ratngs.h:495
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 644 of file dict.cpp.

644 {
645 // Do not add hyphenated word parts to the document dawg.
646 // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
647 // called when the first part of the hyphenated word is
648 // discovered and while the second part of the word is recognized.
649 // hyphen_word_ is cleared in cc_recg() before the next word on
650 // the line is recognized.
651 if (hyphen_word_) return;
652
653 int stringlen = best_choice.length();
654
655 if (valid_word(best_choice) || stringlen < 2) return;
656
657 // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
658 if (best_choice.length() >= kDocDictMaxRepChars) {
659 int num_rep_chars = 1;
660 UNICHAR_ID uch_id = best_choice.unichar_id(0);
661 for (int i = 1; i < best_choice.length(); ++i) {
662 if (best_choice.unichar_id(i) != uch_id) {
663 num_rep_chars = 1;
664 uch_id = best_choice.unichar_id(i);
665 } else {
666 ++num_rep_chars;
667 if (num_rep_chars == kDocDictMaxRepChars) return;
668 }
669 }
670 }
671
672 if (best_choice.certainty() < doc_dict_certainty_threshold ||
673 stringlen == 2) {
674 if (best_choice.certainty() < doc_dict_pending_threshold) return;
675
676 if (!pending_words_->word_in_dawg(best_choice)) {
677 if (stringlen > 2 ||
678 (stringlen == 2 &&
679 getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
680 getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
681 pending_words_->add_word_to_dawg(best_choice);
682 }
683 return;
684 }
685 }
686
687 if (save_doc_words) {
688 STRING filename(getCCUtil()->imagefile);
689 filename += ".doc";
690 FILE* doc_word_file = fopen(filename.string(), "a");
691 if (doc_word_file == nullptr) {
692 tprintf("Error: Could not open file %s\n", filename.string());
693 ASSERT_HOST(doc_word_file);
694 }
695 fprintf(doc_word_file, "%s\n", best_choice.debug_string().string());
696 fclose(doc_word_file);
697 }
698 document_words_->add_word_to_dawg(best_choice);
699}
#define ASSERT_HOST(x)
Definition: errcode.h:88
int UNICHAR_ID
Definition: unichar.h:34
STRING
Definition: strngs.h:45
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:169

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 701 of file dict.cpp.

704 {
705 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
706 word->GetTopScriptID() == getUnicharset().han_sid());
707 bool case_is_ok = (is_han || case_ok(*word));
708 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
709
710 float adjust_factor = additional_adjust;
711 float new_rating = word->rating();
712 new_rating += kRatingPad;
713 const char* xheight_triggered = "";
714 if (word->length() > 1) {
715 // Calculate x-height and y-offset consistency penalties.
716 switch (xheight_consistency) {
717 case XH_INCONSISTENT:
718 adjust_factor += xheight_penalty_inconsistent;
719 xheight_triggered = ", xhtBAD";
720 break;
721 case XH_SUBNORMAL:
722 adjust_factor += xheight_penalty_subscripts;
723 xheight_triggered = ", xhtSUB";
724 break;
725 case XH_GOOD:
726 // leave the factor alone - all good!
727 break;
728 }
729 // TODO(eger): if nonword is true, but there is a "core" that is a dict
730 // word, negate nonword status.
731 } else {
732 if (debug) {
733 tprintf("Consistency could not be calculated.\n");
734 }
735 }
736 if (debug) {
737 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
738 word->unichar_string().string(), word->rating(), xheight_triggered);
739 }
740
741 if (nonword) { // non-dictionary word
742 if (case_is_ok && punc_is_ok) {
743 adjust_factor += segment_penalty_dict_nonword;
744 new_rating *= adjust_factor;
745 if (debug) tprintf(", W");
746 } else {
747 adjust_factor += segment_penalty_garbage;
748 new_rating *= adjust_factor;
749 if (debug) {
750 if (!case_is_ok) tprintf(", C");
751 if (!punc_is_ok) tprintf(", P");
752 }
753 }
754 } else { // dictionary word
755 if (case_is_ok) {
756 if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
757 word->set_permuter(FREQ_DAWG_PERM);
758 adjust_factor += segment_penalty_dict_frequent_word;
759 new_rating *= adjust_factor;
760 if (debug) tprintf(", F");
761 } else {
762 adjust_factor += segment_penalty_dict_case_ok;
763 new_rating *= adjust_factor;
764 if (debug) tprintf(", ");
765 }
766 } else {
767 adjust_factor += segment_penalty_dict_case_bad;
768 new_rating *= adjust_factor;
769 if (debug) tprintf(", C");
770 }
771 }
772 new_rating -= kRatingPad;
773 if (modify_rating) word->set_rating(new_rating);
774 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
775 word->set_adjust_factor(adjust_factor);
776}
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
void set_adjust_factor(float factor)
Definition: ratngs.h:299
int GetTopScriptID() const
Definition: ratngs.cpp:671
void set_rating(float new_val)
Definition: ratngs.h:359
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
int han_sid() const
Definition: unicharset.h:889
int null_sid() const
Definition: unicharset.h:884
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:856

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
const BLOB_CHOICE & blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 239 of file permdawg.cpp.

250 {
251 int word_ending = (char_choice_index == char_choices.length() - 1);
252
253 // Deal with fragments.
254 CHAR_FRAGMENT_INFO char_frag_info;
255 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
256 blob_choice.certainty(), prev_char_frag_info, debug,
257 word_ending, &char_frag_info)) {
258 return; // blob_choice must be an invalid fragment
259 }
260 // Search the next letter if this character is a fragment.
261 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
262 permute_choices(debug, char_choices, char_choice_index + 1,
263 &char_frag_info, word, certainties, limit,
264 best_choice, attempts_left, more_args);
265 return;
266 }
267
268 // Add the next unichar.
269 float old_rating = word->rating();
270 float old_certainty = word->certainty();
271 uint8_t old_permuter = word->permuter();
272 certainties[word->length()] = char_frag_info.certainty;
273 word->append_unichar_id_space_allocated(
274 char_frag_info.unichar_id, char_frag_info.num_fragments,
275 char_frag_info.rating, char_frag_info.certainty);
276
277 // Explore the next unichar.
278 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
279 &char_frag_info, word_ending, word, certainties,
280 limit, best_choice, attempts_left, more_args);
281
282 // Remove the unichar we added to explore other choices in it's place.
283 word->remove_last_unichar_id();
284 word->set_rating(old_rating);
285 word->set_certainty(old_certainty);
286 word->set_permuter(old_permuter);
287}
int length() const
Definition: genericvector.h:86
float certainty() const
Definition: ratngs.h:83
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void set_certainty(float new_val)
Definition: ratngs.h:362
void remove_last_unichar_id()
Definition: ratngs.h:473
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
UNICHAR_ID unichar_id
Definition: dict.h:44
int num_fragments
Definition: dict.h:46
float rating
Definition: dict.h:47
float certainty
Definition: dict.h:48
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:314

◆ CallParamsModelClassify()

float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 421 of file dict.h.

421 {
422 ASSERT_HOST(params_model_classify_ != nullptr); // ASSERT_HOST -> assert
423 return (this->*params_model_classify_)(
424 getCCUtil()->lang.string(), path);
425 }
STRING lang
Definition: ccutil.h:71

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE word) const

Check a string to see if it matches a set of lexical rules.

Definition at line 46 of file context.cpp.

46 {
47 int state = 0;
48 int x;
49 const UNICHARSET* unicharset = word.unicharset();
50 for (x = 0; x < word.length(); ++x) {
51 UNICHAR_ID ch_id = word.unichar_id(x);
52 if (unicharset->get_isupper(ch_id))
53 state = case_state_table[state][1];
54 else if (unicharset->get_islower(ch_id))
55 state = case_state_table[state][2];
56 else if (unicharset->get_isdigit(ch_id))
57 state = case_state_table[state][3];
58 else
59 state = case_state_table[state][0];
60 if (state == -1) return false;
61 }
62 return state != 5; // single lower is bad
63}
const int case_state_table[6][4]
Definition: context.cpp:29
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET unicharset,
UNICHAR_ID  ch,
const Dawg dawg 
) const
inline

Definition at line 448 of file dict.h.

449 {
450 if (!dawg) return ch;
451 switch (dawg->type()) {
452 case DAWG_TYPE_NUMBER:
453 return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
454 default:
455 return ch;
456 }
457 }
@ DAWG_TYPE_NUMBER
Definition: dawg.h:71
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 113 of file dict.h.

113 {
114 const UNICHARSET& unicharset = getUnicharset();
115 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
116 const GenericVector<UNICHAR_ID>& normed_ids =
117 unicharset.normed_ids(unichar_id);
118 return normed_ids.size() == 1 &&
119 (normed_ids[0] == hyphen_unichar_id_ ||
120 normed_ids[0] == slash_unichar_id_);
121 }
int size() const
Definition: genericvector.h:72
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE word) const
inline

If this word is hyphenated, copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 145 of file dict.h.

145 {
146 if (this->hyphenated()) {
147 *word = *hyphen_word_;
148 if (hyphen_debug_level) word->print("copy_hyphen_info: ");
149 }
150 }
void print() const
Definition: ratngs.h:570
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 168 of file permdawg.cpp.

169 {
170 auto *best_choice = new WERD_CHOICE(&getUnicharset());
171 best_choice->make_bad();
172 best_choice->set_rating(rating_limit);
173 if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
174 return best_choice;
175 auto *active_dawgs =
176 new DawgPositionVector[char_choices.length() + 1];
177 init_active_dawgs(&(active_dawgs[0]), true);
178 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
180
181 float certainties[MAX_WERD_LENGTH];
183 int attempts_left = max_permuter_attempts;
184 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
185 char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
186 &attempts_left, &dawg_args);
187 delete[] active_dawgs;
188 return best_choice;
189}
@ NO_PERM
Definition: ratngs.h:233
#define MAX_WERD_LENGTH
Definition: dict.h:39
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:44
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if, in light of the current state, the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_; otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 395 of file dict.cpp.

396 {
397 auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
398
399 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
400
401 if (dawg_debug_level >= 3) {
402 tprintf(
403 "def_letter_is_okay: current unichar=%s word_end=%d"
404 " num active dawgs=%d\n",
405 getUnicharset().debug_str(unichar_id).string(), word_end,
406 dawg_args->active_dawgs->length());
407 }
408
409 // Do not accept words that contain kPatternUnicharID.
410 // (otherwise pattern dawgs would not function correctly).
411 // Do not accept words containing INVALID_UNICHAR_IDs.
412 if (unichar_id == Dawg::kPatternUnicharID ||
413 unichar_id == INVALID_UNICHAR_ID) {
414 dawg_args->permuter = NO_PERM;
415 return NO_PERM;
416 }
417
418 // Initialization.
419 PermuterType curr_perm = NO_PERM;
420 dawg_args->updated_dawgs->clear();
421 dawg_args->valid_end = false;
422
423 // Go over the active_dawgs vector and insert DawgPosition records
424 // with the updated ref (an edge with the corresponding unichar id) into
425 // dawg_args->updated_pos.
426 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
427 const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428 const Dawg* punc_dawg =
429 pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
430 const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
431
432 if (!dawg && !punc_dawg) {
433 // shouldn't happen.
434 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
435 continue;
436 }
437 if (!dawg) {
438 // We're in the punctuation dawg. A core dawg has not been chosen.
439 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
440 EDGE_REF punc_transition_edge =
441 punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
442 if (punc_transition_edge != NO_EDGE) {
443 // Find all successors, and see which can transition.
444 const SuccessorList& slist = *(successors_[pos.punc_index]);
445 for (int s = 0; s < slist.length(); ++s) {
446 int sdawg_index = slist[s];
447 const Dawg* sdawg = dawgs_[sdawg_index];
448 UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
449 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
450 if (dawg_edge != NO_EDGE) {
451 if (dawg_debug_level >= 3) {
452 tprintf("Letter found in dawg %d\n", sdawg_index);
453 }
454 dawg_args->updated_dawgs->add_unique(
455 DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
456 punc_transition_edge, false),
458 "Append transition from punc dawg to current dawgs: ");
459 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
460 if (sdawg->end_of_word(dawg_edge) &&
461 punc_dawg->end_of_word(punc_transition_edge))
462 dawg_args->valid_end = true;
463 }
464 }
465 }
466 EDGE_REF punc_edge =
467 punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
468 if (punc_edge != NO_EDGE) {
469 if (dawg_debug_level >= 3) {
470 tprintf("Letter found in punctuation dawg\n");
471 }
472 dawg_args->updated_dawgs->add_unique(
473 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
474 dawg_debug_level > 0, "Extend punctuation dawg: ");
475 if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
476 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
477 }
478 continue;
479 }
480
481 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
482 // We can end the main word here.
483 // If we can continue on the punc ref, add that possibility.
484 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
485 EDGE_REF punc_edge =
486 punc_node == NO_EDGE
487 ? NO_EDGE
488 : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
489 if (punc_edge != NO_EDGE) {
490 dawg_args->updated_dawgs->add_unique(
491 DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
492 punc_edge, true),
493 dawg_debug_level > 0, "Return to punctuation dawg: ");
494 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
495 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
496 }
497 }
498
499 if (pos.back_to_punc) continue;
500
501 // If we are dealing with the pattern dawg, look up all the
502 // possible edges, not only for the exact unichar_id, but also
503 // for all its character classes (alpha, digit, etc).
504 if (dawg->type() == DAWG_TYPE_PATTERN) {
505 ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
506 &curr_perm);
507 // There can't be any successors to dawg that is of type
508 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
509 continue;
510 }
511
512 // Find the edge out of the node for the unichar_id.
513 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
514 EDGE_REF edge =
515 (node == NO_EDGE)
516 ? NO_EDGE
517 : dawg->edge_char_of(
518 node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
519
520 if (dawg_debug_level >= 3) {
521 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
522 pos.dawg_index, node, edge);
523 }
524
525 if (edge != NO_EDGE) { // the unichar was found in the current dawg
526 if (dawg_debug_level >= 3) {
527 tprintf("Letter found in dawg %d\n", pos.dawg_index);
528 }
529 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
530 if (dawg_debug_level >= 3) {
531 tprintf("Punctuation constraint not satisfied at end of word.\n");
532 }
533 continue;
534 }
535 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
536 if (dawg->end_of_word(edge) &&
537 (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
538 dawg_args->valid_end = true;
539 dawg_args->updated_dawgs->add_unique(
540 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
541 false),
543 "Append current dawg to updated active dawgs: ");
544 }
545 } // end for
546 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
547 // or if we found the current letter in a non-punctuation dawg. This
548 // allows preserving information on which dawg the "core" word came from.
549 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
550 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
551 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
552 dawg_args->permuter = curr_perm;
553 }
554 if (dawg_debug_level >= 2) {
555 tprintf("Returning %d for permuter code for this character.\n",
556 dawg_args->permuter);
557 }
558 return dawg_args->permuter;
559}
PermuterType
Definition: ratngs.h:232
@ PUNC_PERM
Definition: ratngs.h:234
@ COMPOUND_PERM
Definition: ratngs.h:245
int64_t NODE_REF
Definition: dawg.h:52
#define REFFORMAT
Definition: dawg.h:89
int64_t EDGE_REF
Definition: dawg.h:51
@ DAWG_TYPE_PATTERN
Definition: dawg.h:72
GenericVector< int > SuccessorList
Definition: dawg.h:65
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:561
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:448
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 401 of file dict.h.

403 {
404 (void)lang;
405 (void)context;
406 (void)context_bytes;
407 (void)character;
408 (void)character_bytes;
409 return 0.0;
410 }
@ character
Definition: mfoutline.h:63

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 617 of file dict.cpp.

618 {
619 bool punc_dawg_available =
620 (punc_dawg_ != nullptr) &&
621 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
622
623 for (int i = 0; i < dawgs_.length(); i++) {
624 if (dawgs_[i] != nullptr &&
625 !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
626 int dawg_ty = dawgs_[i]->type();
627 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
628 if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
629 *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
630 if (dawg_debug_level >= 3) {
631 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
632 NO_EDGE);
633 }
634 } else if (!punc_dawg_available || !subsumed_by_punc) {
635 *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
636 if (dawg_debug_level >= 3) {
637 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
638 }
639 }
640 }
641 }
642}
@ DAWG_TYPE_PUNCTUATION
Definition: dawg.h:69
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.

◆ End()

void tesseract::Dict::End ( )

Definition at line 372 of file dict.cpp.

372 {
373 if (dawgs_.length() == 0) return; // Not safe to call twice.
374 for (int i = 0; i < dawgs_.size(); i++) {
375 if (!dawg_cache_->FreeDawg(dawgs_[i])) {
376 delete dawgs_[i];
377 }
378 }
379 dawg_cache_->FreeDawg(bigram_dawg_);
380 if (dawg_cache_is_ours_) {
381 delete dawg_cache_;
382 dawg_cache_ = nullptr;
383 }
384 successors_.delete_data_pointers();
385 dawgs_.clear();
386 successors_.clear();
387 document_words_ = nullptr;
388 delete pending_words_;
389 pending_words_ = nullptr;
390}
void delete_data_pointers()
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 360 of file stopper.cpp.

360{}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 351 of file dict.cpp.

351 {
352 if (dawgs_.empty()) return false;
353 // Construct a list of corresponding successors for each dawg. Each entry, i,
354 // in the successors_ vector is a vector of integers that represent the
355 // indices into the dawgs_ vector of the successors for dawg i.
356 successors_.reserve(dawgs_.length());
357 for (int i = 0; i < dawgs_.length(); ++i) {
358 const Dawg* dawg = dawgs_[i];
359 auto* lst = new SuccessorList();
360 for (int j = 0; j < dawgs_.length(); ++j) {
361 const Dawg* other = dawgs_[j];
362 if (dawg != nullptr && other != nullptr &&
363 (dawg->lang() == other->lang()) &&
364 kDawgSuccessors[dawg->type()][other->type()])
365 *lst += j;
366 }
367 successors_ += lst;
368 }
369 return true;
370}
bool empty() const
Definition: genericvector.h:91
void reserve(int size)

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Definition at line 314 of file permdawg.cpp.

318 {
319 const CHAR_FRAGMENT *this_fragment =
320 getUnicharset().get_fragment(curr_unichar_id);
321 const CHAR_FRAGMENT *prev_fragment =
322 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
323
324 // Print debug info for fragments.
325 if (debug && (prev_fragment || this_fragment)) {
326 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
327 getUnicharset().debug_str(curr_unichar_id).string(),
328 word_ending);
329 if (prev_fragment) {
330 tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
331 }
332 if (this_fragment) {
333 tprintf("this_fragment %s\n", this_fragment->to_string().string());
334 }
335 }
336
337 char_frag_info->unichar_id = curr_unichar_id;
338 char_frag_info->fragment = this_fragment;
339 char_frag_info->rating = curr_rating;
340 char_frag_info->certainty = curr_certainty;
341 char_frag_info->num_fragments = 1;
342 if (prev_fragment && !this_fragment) {
343 if (debug) tprintf("Skip choice with incomplete fragment\n");
344 return false;
345 }
346 if (this_fragment) {
347 // We are dealing with a fragment.
348 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
349 if (prev_fragment) {
350 if (!this_fragment->is_continuation_of(prev_fragment)) {
351 if (debug) tprintf("Non-matching fragment piece\n");
352 return false;
353 }
354 if (this_fragment->is_ending()) {
355 char_frag_info->unichar_id =
356 getUnicharset().unichar_to_id(this_fragment->get_unichar());
357 char_frag_info->fragment = nullptr;
358 if (debug) {
359 tprintf("Built character %s from fragments\n",
360 getUnicharset().debug_str(
361 char_frag_info->unichar_id).string());
362 }
363 } else {
364 if (debug) tprintf("Record fragment continuation\n");
365 char_frag_info->fragment = this_fragment;
366 }
367 // Update certainty and rating.
368 char_frag_info->rating =
369 prev_char_frag_info->rating + curr_rating;
370 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
371 char_frag_info->certainty =
372 std::min(curr_certainty, prev_char_frag_info->certainty);
373 } else {
374 if (this_fragment->is_beginning()) {
375 if (debug) tprintf("Record fragment beginning\n");
376 } else {
377 if (debug) {
378 tprintf("Non-starting fragment piece with no prev_fragment\n");
379 }
380 return false;
381 }
382 }
383 }
384 if (word_ending && char_frag_info->fragment) {
385 if (debug) tprintf("Word can not end with a fragment\n");
386 return false;
387 }
388 return true;
389}
bool is_beginning() const
Definition: unicharset.h:105
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:70
bool is_ending() const
Definition: unicharset.h:108
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const CHAR_FRAGMENT * fragment
Definition: dict.h:45

◆ getCCUtil() [1/2]

CCUtil * tesseract::Dict::getCCUtil ( )
inline

Definition at line 98 of file dict.h.

98 {
99 return ccutil_;
100 }

◆ getCCUtil() [2/2]

const CCUtil * tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 95 of file dict.h.

95 {
96 return ccutil_;
97 }

◆ GetDawg()

const Dawg * tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 432 of file dict.h.

432{ return dawgs_[index]; }

◆ GetPuncDawg()

const Dawg * tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 434 of file dict.h.

434{ return punc_dawg_; }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 438 of file dict.h.

438 {
439 if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
440 NODE_REF node = dawg->next_node(edge_ref);
441 if (node == 0) node = NO_EDGE; // end of word
442 return node;
443 }

◆ GetUnambigDawg()

const Dawg * tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 436 of file dict.h.

436{ return unambig_dawg_; }

◆ getUnicharAmbigs()

const UnicharAmbigs & tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 108 of file dict.h.

108 {
109 return getCCUtil()->unichar_ambigs;
110 }
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:75

◆ getUnicharset() [1/2]

UNICHARSET & tesseract::Dict::getUnicharset ( )
inline

Definition at line 104 of file dict.h.

104 {
105 return getCCUtil()->unicharset;
106 }
UNICHARSET unicharset
Definition: ccutil.h:73

◆ getUnicharset() [2/2]

const UNICHARSET & tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 101 of file dict.h.

101 {
102 return getCCUtil()->unicharset;
103 }

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 184 of file dict.cpp.

184 {
185 // This global cache (a singleton) will outlive every Tesseract instance
186 // (even those that someone else might declare as global statics).
187 static DawgCache cache;
188 return &cache;
189}

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
bool  word_ending,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 44 of file permdawg.cpp.

48 {
49 auto *more_args = static_cast<DawgArgs *>(void_more_args);
50 word_ending = (char_choice_index == char_choices.size()-1);
51 int word_index = word->length() - 1;
52 if (best_choice->rating() < *limit) return;
53 // Look up char in DAWG
54
55 // If the current unichar is an ngram first try calling
56 // letter_is_okay() for each unigram it contains separately.
57 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
58 bool checked_unigrams = false;
59 if (getUnicharset().get_isngram(orig_uch_id)) {
60 if (dawg_debug_level) {
61 tprintf("checking unigrams in an ngram %s\n",
62 getUnicharset().debug_str(orig_uch_id).string());
63 }
64 int num_unigrams = 0;
67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68 // Since the string came out of the unicharset, failure is impossible.
69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
70 nullptr));
71 bool unigrams_ok = true;
72 // Construct DawgArgs that reflect the current state.
73 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
74 DawgPositionVector unigram_updated_dawgs;
75 DawgArgs unigram_dawg_args(&unigram_active_dawgs,
76 &unigram_updated_dawgs,
77 more_args->permuter);
78 // Check unigrams in the ngram with letter_is_okay().
79 for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
80 UNICHAR_ID uch_id = encoding[i];
81 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
82 ++num_unigrams;
83 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
84 unigrams_ok = (this->*letter_is_okay_)(
85 &unigram_dawg_args, *word->unicharset(),
86 word->unichar_id(word_index+num_unigrams-1),
87 word_ending && i == encoding.size() - 1);
88 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
89 if (dawg_debug_level) {
90 tprintf("unigram %s is %s\n",
91 getUnicharset().debug_str(uch_id).string(),
92 unigrams_ok ? "OK" : "not OK");
93 }
94 }
95 // Restore the word and copy the updated dawg state if needed.
96 while (num_unigrams-- > 0) word->remove_last_unichar_id();
97 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
98 if (unigrams_ok) {
99 checked_unigrams = true;
100 more_args->permuter = unigram_dawg_args.permuter;
101 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
102 }
103 }
104
105 // Check which dawgs from the dawgs_ vector contain the word
106 // up to and including the current unichar.
107 if (checked_unigrams || (this->*letter_is_okay_)(
108 more_args, *word->unicharset(), word->unichar_id(word_index),
109 word_ending)) {
110 // Add a new word choice
111 if (word_ending) {
112 if (dawg_debug_level) {
113 tprintf("found word = %s\n", word->debug_string().string());
114 }
115 if (strcmp(output_ambig_words_file.string(), "") != 0) {
116 if (output_ambig_words_file_ == nullptr) {
117 output_ambig_words_file_ =
118 fopen(output_ambig_words_file.string(), "wb+");
119 if (output_ambig_words_file_ == nullptr) {
120 tprintf("Failed to open output_ambig_words_file %s\n",
121 output_ambig_words_file.string());
122 exit(1);
123 }
124 STRING word_str;
125 word->string_and_lengths(&word_str, nullptr);
126 word_str += " ";
127 fprintf(output_ambig_words_file_, "%s", word_str.string());
128 }
129 STRING word_str;
130 word->string_and_lengths(&word_str, nullptr);
131 word_str += " ";
132 fprintf(output_ambig_words_file_, "%s", word_str.string());
133 }
134 WERD_CHOICE *adjusted_word = word;
135 adjusted_word->set_permuter(more_args->permuter);
136 update_best_choice(*adjusted_word, best_choice);
137 } else { // search the next letter
138 // Make updated_* point to the next entries in the DawgPositionVector
139 // arrays (that were originally created in dawg_permute_and_select)
140 ++(more_args->updated_dawgs);
141 // Make active_dawgs and constraints point to the updated ones.
142 ++(more_args->active_dawgs);
143 permute_choices(debug, char_choices, char_choice_index + 1,
144 prev_char_frag_info, word, certainties, limit,
145 best_choice, attempts_left, more_args);
146 // Restore previous state to explore another letter in this position.
147 --(more_args->updated_dawgs);
148 --(more_args->active_dawgs);
149 }
150 } else {
151 if (dawg_debug_level) {
152 tprintf("last unichar not OK at index %d in %s\n",
153 word_index, word->debug_string().string());
154 }
155 }
156}
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:526
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 152 of file dict.h.

153 {
154 if (!last_word_on_line_ || first_pos)
155 return false;
156 ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
157 const GenericVector<UNICHAR_ID>& normed_ids =
158 unicharset->normed_ids(unichar_id);
159 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
160 }

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 162 of file dict.h.

162 {
163 int word_index = word.length() - 1;
164 return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
165 word_index == 0);
166 }
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 139 of file dict.h.

139 {
140 return this->hyphenated() ? hyphen_word_->length() : 0;
141 }

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 135 of file dict.h.

135 { return
136 !last_word_on_line_ && hyphen_word_;
137 }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 600 of file dict.cpp.

601 {
602 int i;
603 if (hyphenated()) {
604 *active_dawgs = hyphen_active_dawgs_;
605 if (dawg_debug_level >= 3) {
606 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
607 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
608 hyphen_active_dawgs_[i].dawg_index,
609 hyphen_active_dawgs_[i].dawg_ref);
610 }
611 }
612 } else {
613 default_dawgs(active_dawgs, ambigs_mode);
614 }
615}
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 124 of file dict.h.

124 {
125 const UNICHARSET& unicharset = getUnicharset();
126 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
127 const GenericVector<UNICHAR_ID>& normed_ids =
128 unicharset.normed_ids(unichar_id);
129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130 }

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited, i.e. its unicharset contains no Han, Katakana, or Thai script (not Chinese, Japanese, or Thai).

Definition at line 883 of file dict.cpp.

883 {
884 const UNICHARSET& u_set = getUnicharset();
885 if (u_set.han_sid() > 0) return false;
886 if (u_set.katakana_sid() > 0) return false;
887 if (u_set.thai_sid() > 0) return false;
888 return true;
889}
int katakana_sid() const
Definition: unicharset.h:891
int thai_sid() const
Definition: unicharset.h:892

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 446 of file stopper.cpp.

446 {
447 int shortest = INT32_MAX;
448 int curr_len = 0;
449 for (int w = 0; w < WordChoice.length(); ++w) {
450 if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
451 curr_len++;
452 } else if (curr_len > 0) {
453 if (curr_len < shortest) shortest = curr_len;
454 curr_len = 0;
455 }
456 }
457 if (curr_len > 0 && curr_len < shortest) {
458 shortest = curr_len;
459 } else if (shortest == INT32_MAX) {
460 shortest = 0;
461 }
462 return shortest;
463}

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 376 of file dict.h.

377 {
378 return (this->*letter_is_okay_)(void_dawg_args,
379 unicharset, unichar_id, word_end);
380 }

◆ Load()

void tesseract::Dict::Load ( const STRING lang,
TessdataManager data_file 
)

Definition at line 210 of file dict.cpp.

210 {
211 // Load dawgs_.
212 if (load_punc_dawg) {
213 punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
214 dawg_debug_level, data_file);
215 if (punc_dawg_) dawgs_ += punc_dawg_;
216 }
217 if (load_system_dawg) {
218 Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
219 lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
220 if (system_dawg) dawgs_ += system_dawg;
221 }
222 if (load_number_dawg) {
223 Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
224 lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
225 if (number_dawg) dawgs_ += number_dawg;
226 }
227 if (load_bigram_dawg) {
228 bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
229 dawg_debug_level, data_file);
230 // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
231 // dawgs_!!
232 }
233 if (load_freq_dawg) {
234 freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
235 dawg_debug_level, data_file);
236 if (freq_dawg_) dawgs_ += freq_dawg_;
237 }
238 if (load_unambig_dawg) {
239 unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
240 dawg_debug_level, data_file);
241 if (unambig_dawg_) dawgs_ += unambig_dawg_;
242 }
243
244 STRING name;
245 if (!user_words_suffix.empty() || !user_words_file.empty()) {
246 Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
248 if (!user_words_file.empty()) {
249 name = user_words_file;
250 } else {
252 name += user_words_suffix;
253 }
254 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
256 tprintf("Error: failed to load %s\n", name.string());
257 delete trie_ptr;
258 } else {
259 dawgs_ += trie_ptr;
260 }
261 }
262
263 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264 Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
266 trie_ptr->initialize_patterns(&(getUnicharset()));
267 if (!user_patterns_file.empty()) {
268 name = user_patterns_file;
269 } else {
271 name += user_patterns_suffix;
272 }
273 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
274 tprintf("Error: failed to load %s\n", name.string());
275 delete trie_ptr;
276 } else {
277 dawgs_ += trie_ptr;
278 }
279 }
280
281 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
283 dawgs_ += document_words_;
284
285 // This dawg is temporary and should not be searched by letter_is_ok.
286 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
288}
@ USER_PATTERN_PERM
Definition: ratngs.h:240
@ DOC_DAWG_PERM
Definition: ratngs.h:242
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ DAWG_TYPE_WORD
Definition: dawg.h:70
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_SYSTEM_DAWG
STRING language_data_path_prefix
Definition: ccutil.h:72
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:60

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const STRING lang,
TessdataManager data_file 
)

Definition at line 291 of file dict.cpp.

291 {
292 // Load dawgs_.
293 if (load_punc_dawg) {
294 punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
295 dawg_debug_level, data_file);
296 if (punc_dawg_) dawgs_ += punc_dawg_;
297 }
298 if (load_system_dawg) {
299 Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
301 if (system_dawg) dawgs_ += system_dawg;
302 }
303 if (load_number_dawg) {
304 Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
306 if (number_dawg) dawgs_ += number_dawg;
307 }
308
309 // stolen from Dict::Load (but needs params_ from Tesseract
310 // langdata/config/api):
311 STRING name;
312 if (!user_words_suffix.empty() || !user_words_file.empty()) {
313 Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
315 if (!user_words_file.empty()) {
316 name = user_words_file;
317 } else {
319 name += user_words_suffix;
320 }
321 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
323 tprintf("Error: failed to load %s\n", name.string());
324 delete trie_ptr;
325 } else {
326 dawgs_ += trie_ptr;
327 }
328 }
329
330 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
331 Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
333 trie_ptr->initialize_patterns(&(getUnicharset()));
334 if (!user_patterns_file.empty()) {
335 name = user_patterns_file;
336 } else {
338 name += user_patterns_suffix;
339 }
340 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
341 tprintf("Error: failed to load %s\n", name.string());
342 delete trie_ptr;
343 } else {
344 dawgs_ += trie_ptr;
345 }
346 }
347}
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG

◆ ngram_probability_in_context()

double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 144 of file stopper.cpp.

147 {
148 if (stopper_debug_level > 2) {
149 tprintf("\nRunning NoDangerousAmbig() for %s\n",
150 best_choice->debug_string().string());
151 }
152
153 // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
154 // for each unichar id in BestChoice.
155 BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
156 int i;
157 bool ambigs_found = false;
158 // For each position in best_choice:
159 // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
160 // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
161 // -- look for ambiguities corresponding to wrong_ngram in the list while
162 // adding the following unichar_ids from best_choice to wrong_ngram
163 //
164 // Repeat the above procedure twice: first time look through
165 // ambigs to be replaced and replace all the ambiguities found;
166 // second time look through dangerous ambiguities and construct
167 // ambig_blob_choices with a fake blob choice for each ambiguity
168 // and pass them to dawg_permute_and_select() to search for
169 // ambiguous words in the dictionaries.
170 //
171 // Note that during the execution of the for loop (on the first pass)
172 // if replacements are made the length of best_choice might change.
173 for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
174 bool replace = (fix_replaceable && pass == 0);
175 const UnicharAmbigsVector &table = replace ?
177 if (!replace) {
178 // Initialize ambig_blob_choices with lists containing a single
179 // unichar id for the corresponding position in best_choice.
180 // best_choice consisting from only the original letters will
181 // have a rating of 0.0.
182 for (i = 0; i < best_choice->length(); ++i) {
183 auto *lst = new BLOB_CHOICE_LIST();
184 BLOB_CHOICE_IT lst_it(lst);
185 // TODO(rays/antonova) Put real xheights and y shifts here.
186 lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
187 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
188 ambig_blob_choices.push_back(lst);
189 }
190 }
191 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
192 int wrong_ngram_index;
193 int next_index;
194 int blob_index = 0;
195 for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
196 ++i) {
197 UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
198 if (stopper_debug_level > 2) {
199 tprintf("Looking for %s ngrams starting with %s:\n",
200 replace ? "replaceable" : "ambiguous",
201 getUnicharset().debug_str(curr_unichar_id).string());
202 }
203 int num_wrong_blobs = best_choice->state(i);
204 wrong_ngram_index = 0;
205 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
206 if (curr_unichar_id == INVALID_UNICHAR_ID ||
207 curr_unichar_id >= table.size() ||
208 table[curr_unichar_id] == nullptr) {
209 continue; // there is no ambig spec for this unichar id
210 }
211 AmbigSpec_IT spec_it(table[curr_unichar_id]);
212 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
213 const AmbigSpec *ambig_spec = spec_it.data();
214 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
215 int compare = UnicharIdArrayUtils::compare(wrong_ngram,
216 ambig_spec->wrong_ngram);
217 if (stopper_debug_level > 2) {
218 tprintf("candidate ngram: ");
220 tprintf("current ngram from spec: ");
221 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
222 tprintf("comparison result: %d\n", compare);
223 }
224 if (compare == 0) {
225 // Record the place where we found an ambiguity.
226 if (fixpt != nullptr) {
227 UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
228 fixpt->push_back(DANGERR_INFO(
229 blob_index, blob_index + num_wrong_blobs, replace,
230 getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
231 leftmost_id));
232 if (stopper_debug_level > 1) {
233 tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
234 blob_index + num_wrong_blobs, false,
235 getUnicharset().get_isngram(
236 ambig_spec->correct_ngram_id),
237 getUnicharset().id_to_unichar(leftmost_id));
238 }
239 }
240
241 if (replace) {
242 if (stopper_debug_level > 2) {
243 tprintf("replace ambiguity with %s : ",
244 getUnicharset().id_to_unichar(
245 ambig_spec->correct_ngram_id));
247 ambig_spec->correct_fragments, getUnicharset());
248 }
249 ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
250 ambig_spec->correct_ngram_id,
251 best_choice, ratings);
252 } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
253 // We found dang ambig - update ambig_blob_choices.
254 if (stopper_debug_level > 2) {
255 tprintf("found ambiguity: ");
257 ambig_spec->correct_fragments, getUnicharset());
258 }
259 ambigs_found = true;
260 for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
261 ++tmp_index) {
262 // Add a blob choice for the corresponding fragment of the
263 // ambiguity. These fake blob choices are initialized with
264 // negative ratings (which are not possible for real blob
265 // choices), so that dawg_permute_and_select() considers any
266 // word not consisting of only the original letters a better
267 // choice and stops searching for alternatives once such a
268 // choice is found.
269 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
270 bc_it.add_to_end(new BLOB_CHOICE(
271 ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
272 -1, 0, 1, 0, BCC_AMBIG));
273 }
274 }
275 spec_it.forward();
276 } else if (compare == -1) {
277 if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
278 ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
279 // Add the next unichar id to wrong_ngram and keep looking for
280 // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
281 wrong_ngram[++wrong_ngram_index] =
282 best_choice->unichar_id(next_index);
283 num_wrong_blobs += best_choice->state(next_index);
284 } else {
285 break; // no more matching ambigs in this AMBIG_SPEC_LIST
286 }
287 } else {
288 spec_it.forward();
289 }
290 } // end searching AmbigSpec_LIST
291 } // end searching best_choice
292 } // end searching replace and dangerous ambigs
293
294 // If any ambiguities were found permute the constructed ambig_blob_choices
295 // to see if an alternative dictionary word can be found.
296 if (ambigs_found) {
297 if (stopper_debug_level > 2) {
298 tprintf("\nResulting ambig_blob_choices:\n");
299 for (i = 0; i < ambig_blob_choices.length(); ++i) {
300 print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
301 tprintf("\n");
302 }
303 }
304 WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
305 ambigs_found = (alt_word->rating() < 0.0);
306 if (ambigs_found) {
307 if (stopper_debug_level >= 1) {
308 tprintf ("Stopper: Possible ambiguous word = %s\n",
309 alt_word->debug_string().string());
310 }
311 if (fixpt != nullptr) {
312 // Note: Currently character choices combined from fragments can only
313 // be generated by NoDangerousAmbig(). This code should be updated if
314 // the capability to produce classifications combined from character
315 // fragments is added to other functions.
316 int orig_i = 0;
317 for (i = 0; i < alt_word->length(); ++i) {
318 const UNICHARSET &uchset = getUnicharset();
319 bool replacement_is_ngram =
320 uchset.get_isngram(alt_word->unichar_id(i));
321 UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
322 if (replacement_is_ngram) {
323 // we have to extract the leftmost unichar from the ngram.
324 const char *str = uchset.id_to_unichar(leftmost_id);
325 int step = uchset.step(str);
326 if (step) leftmost_id = uchset.unichar_to_id(str, step);
327 }
328 int end_i = orig_i + alt_word->state(i);
329 if (alt_word->state(i) > 1 ||
330 (orig_i + 1 == end_i && replacement_is_ngram)) {
331 // Compute proper blob indices.
332 int blob_start = 0;
333 for (int j = 0; j < orig_i; ++j)
334 blob_start += best_choice->state(j);
335 int blob_end = blob_start;
336 for (int j = orig_i; j < end_i; ++j)
337 blob_end += best_choice->state(j);
338 fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
339 replacement_is_ngram, leftmost_id));
340 if (stopper_debug_level > 1) {
341 tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
342 true, replacement_is_ngram,
343 uchset.id_to_unichar(leftmost_id));
344 }
345 }
346 orig_i += alt_word->state(i);
347 }
348 }
349 }
350 delete alt_word;
351 }
352 if (output_ambig_words_file_ != nullptr) {
353 fprintf(output_ambig_words_file_, "\n");
354 }
355
356 ambig_blob_choices.delete_data_pointers();
357 return !ambigs_found;
358}
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
@ BCC_AMBIG
Definition: ratngs.h:47
#define MAX_AMBIG_SIZE
Definition: ambigs.h:31
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:134
@ CASE_AMBIG
Definition: ambigs.h:42
int push_back(T object)
T & get(int index) const
int state(int index) const
Definition: ratngs.h:309
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:91
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:55
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:146
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
int step(const char *str) const
Definition: unicharset.cpp:233
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:372
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:168
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 430 of file dict.h.

430{ return dawgs_.size(); }

◆ ParamsModelClassify()

float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 197 of file permdawg.cpp.

207 {
208 if (debug) {
209 tprintf("%s permute_choices: char_choice_index=%d"
210 " limit=%g rating=%g, certainty=%g word=%s\n",
211 debug, char_choice_index, *limit, word->rating(),
212 word->certainty(), word->debug_string().string());
213 }
214 if (char_choice_index < char_choices.length()) {
215 BLOB_CHOICE_IT blob_choice_it;
216 blob_choice_it.set_to_list(char_choices.get(char_choice_index));
217 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
218 blob_choice_it.forward()) {
219 (*attempts_left)--;
220 append_choices(debug, char_choices, *(blob_choice_it.data()),
221 char_choice_index, prev_char_frag_info, word,
222 certainties, limit, best_choice, attempts_left, more_args);
223 if (*attempts_left <= 0) {
224 if (debug) tprintf("permute_choices(): attempts_left is 0\n");
225 break;
226 }
227 }
228 }
229}
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:239

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 390 of file dict.h.

393 {
394 return (this->*probability_in_context_)(
395 getCCUtil()->lang.string(),
396 context, context_bytes,
397 character, character_bytes);
398 }

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgPosition info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing edge or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 561 of file dict.cpp.

564 {
565 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
566 // Try to find the edge corresponding to the exact unichar_id and to all the
567 // edges corresponding to the character class of unichar_id.
568 GenericVector<UNICHAR_ID> unichar_id_patterns;
569 unichar_id_patterns.push_back(unichar_id);
570 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
571 &unichar_id_patterns);
572 for (int i = 0; i < unichar_id_patterns.size(); ++i) {
573 // On the first iteration check all the outgoing edges.
574 // On the second iteration check all self-loops.
575 for (int k = 0; k < 2; ++k) {
576 EDGE_REF edge =
577 (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
578 : dawg->pattern_loop_edge(pos.dawg_ref,
579 unichar_id_patterns[i], word_end);
580 if (edge == NO_EDGE) continue;
581 if (dawg_debug_level >= 3) {
582 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
583 pos.dawg_index, node, edge);
584 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
585 }
586 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
587 if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
588 dawg_args->updated_dawgs->add_unique(
589 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
590 pos.back_to_punc),
592 "Append current dawg to updated active dawgs: ");
593 }
594 }
595}

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 372 of file stopper.cpp.

374 {
375 int num_blobs_to_replace = 0;
376 int begin_blob_index = 0;
377 int i;
378 // Rating and certainty for the new BLOB_CHOICE are derived from the
379 // replaced choices.
380 float new_rating = 0.0f;
381 float new_certainty = 0.0f;
382 BLOB_CHOICE* old_choice = nullptr;
383 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
384 if (i >= wrong_ngram_begin_index) {
385 int num_blobs = werd_choice->state(i);
386 int col = begin_blob_index + num_blobs_to_replace;
387 int row = col + num_blobs - 1;
388 BLOB_CHOICE_LIST* choices = ratings->get(col, row);
389 ASSERT_HOST(choices != nullptr);
390 old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
391 ASSERT_HOST(old_choice != nullptr);
392 new_rating += old_choice->rating();
393 new_certainty += old_choice->certainty();
394 num_blobs_to_replace += num_blobs;
395 } else {
396 begin_blob_index += werd_choice->state(i);
397 }
398 }
399 new_certainty /= wrong_ngram_size;
400 // If there is no entry in the ratings matrix, add it.
401 MATRIX_COORD coord(begin_blob_index,
402 begin_blob_index + num_blobs_to_replace - 1);
403 if (!coord.Valid(*ratings)) {
404 ratings->IncreaseBandSize(coord.row - coord.col + 1);
405 }
406 if (ratings->get(coord.col, coord.row) == nullptr)
407 ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
408 BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
409 BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
410 if (choice != nullptr) {
411 // Already there. Upgrade if new rating better.
412 if (new_rating < choice->rating())
413 choice->set_rating(new_rating);
414 if (new_certainty < choice->certainty())
415 choice->set_certainty(new_certainty);
416 // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
417 } else {
418 // Need a new choice with the correct_ngram_id.
419 choice = new BLOB_CHOICE(*old_choice);
420 choice->set_unichar_id(correct_ngram_id);
421 choice->set_rating(new_rating);
422 choice->set_certainty(new_certainty);
423 choice->set_classifier(BCC_AMBIG);
424 choice->set_matrix_cell(coord.col, coord.row);
425 BLOB_CHOICE_IT it (new_choices);
426 it.add_to_end(choice);
427 }
428 // Remove current unichar from werd_choice. On the last iteration
429 // set the correct replacement unichar instead of removing a unichar.
430 for (int replaced_count = 0; replaced_count < wrong_ngram_size;
431 ++replaced_count) {
432 if (replaced_count + 1 == wrong_ngram_size) {
433 werd_choice->set_blob_choice(wrong_ngram_begin_index,
434 num_blobs_to_replace, choice);
435 } else {
436 werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
437 }
438 }
439 if (stopper_debug_level >= 1) {
440 werd_choice->print("ReplaceAmbig() ");
441 tprintf("Modified blob_choices: ");
442 print_ratings_list("\n", new_choices, getUnicharset());
443 }
444}
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
T get(ICOORD pos) const
Definition: matrix.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
void set_rating(float newrat)
Definition: ratngs.h:144
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:157
void set_matrix_cell(int col, int row)
Definition: ratngs.h:153
void set_certainty(float newrat)
Definition: ratngs.h:147
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:141
void remove_unichar_id(int index)
Definition: ratngs.h:474
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:316

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (thus it is the first one on the line), erase hyphen_word_ and clear hyphen_active_dawgs_. In all cases, update last_word_on_line_ to the given value.

Definition at line 28 of file hyphen.cpp.

28 {
29 if (!(last_word_on_line_ == true && last_word_on_line == false)) {
30 if (hyphen_word_ != nullptr) {
31 delete hyphen_word_;
32 hyphen_word_ = nullptr;
33 hyphen_active_dawgs_.clear();
34 }
35 }
37 tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
38 last_word_on_line_, last_word_on_line);
39 }
40 last_word_on_line_ = last_word_on_line;
41}

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 326 of file dict.h.

326 {
327 if (pending_words_ != nullptr)
328 pending_words_->clear();
329 if (document_words_ != nullptr)
330 document_words_->clear();
331 }
void clear()
Definition: trie.cpp:57

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

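If the given word has a lower rating than the currently stored hyphen_word_ (or none is stored yet), update hyphen_word_ with a copy of the word minus its trailing hyphen, and copy the given DawgPositionVector into hyphen_active_dawgs_.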

Definition at line 45 of file hyphen.cpp.

46 {
47 if (hyphen_word_ == nullptr) {
48 hyphen_word_ = new WERD_CHOICE(word.unicharset());
49 hyphen_word_->make_bad();
50 }
51 if (hyphen_word_->rating() > word.rating()) {
52 *hyphen_word_ = word;
53 // Remove the last unichar id as it is a hyphen, and remove
54 // any unichar_string/lengths that are present.
55 hyphen_word_->remove_last_unichar_id();
56 hyphen_active_dawgs_ = active_dawgs;
57 }
59 hyphen_word_->print("set_hyphen_word: ");
60 }
61}
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 364 of file stopper.cpp.

364 {
365 reject_offset_ = 0.0;
366}

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 368 of file stopper.cpp.

368 {
370}

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache dawg_cache)

Definition at line 192 of file dict.cpp.

192 {
193 if (dawgs_.length() != 0) this->End();
194
195 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
196 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
197 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
198 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
199
200 if (dawg_cache != nullptr) {
201 dawg_cache_ = dawg_cache;
202 dawg_cache_is_ours_ = false;
203 } else {
204 dawg_cache_ = new DawgCache();
205 dawg_cache_is_ours_ = true;
206 }
207}

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 427 of file dict.h.

427{ wildcard_unichar_id_ = id; }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 510 of file dict.h.

510 {
511 wordseg_rating_adjust_factor_ = f;
512 }

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 465 of file stopper.cpp.

465 {
466 float Certainty;
467 float WorstCertainty = FLT_MAX;
468 float CertaintyThreshold;
469 double TotalCertainty;
470 double TotalCertaintySquared;
471 double Variance;
472 float Mean, StdDev;
473 int word_length = word.length();
474
475 if (word_length < 3)
476 return true;
477
478 TotalCertainty = TotalCertaintySquared = 0.0;
479 for (int i = 0; i < word_length; ++i) {
480 Certainty = word.certainty(i);
481 TotalCertainty += Certainty;
482 TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483 if (Certainty < WorstCertainty)
484 WorstCertainty = Certainty;
485 }
486
487 // Subtract off worst certainty from statistics.
488 word_length--;
489 TotalCertainty -= WorstCertainty;
490 TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
491
492 Mean = TotalCertainty / word_length;
493 Variance = ((word_length * TotalCertaintySquared -
494 TotalCertainty * TotalCertainty) /
495 (word_length * (word_length - 1)));
496 if (Variance < 0.0)
497 Variance = 0.0;
498 StdDev = sqrt(Variance);
499
500 CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
501 if (CertaintyThreshold > stopper_nondict_certainty_base)
502 CertaintyThreshold = stopper_nondict_certainty_base;
503
504 if (word.certainty() < CertaintyThreshold) {
505 if (stopper_debug_level >= 1)
506 tprintf("Stopper: Non-uniform certainty = %4.1f"
507 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
508 word.certainty(), Mean, StdDev, CertaintyThreshold);
509 return false;
510 } else {
511 return true;
512 }
513}
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:602

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE word,
WERD_CHOICE best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 182 of file dict.h.

183 {
184 if (word.rating() < best_choice->rating()) {
185 *best_choice = word;
186 }
187 }

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 813 of file dict.cpp.

814 {
815 if (bigram_dawg_ == nullptr) return false;
816
817 // Extract the core word from the middle of each word with any digits
818 // replaced with question marks.
819 int w1start, w1end, w2start, w2end;
820 word1.punct_stripped(&w1start, &w1end);
821 word2.punct_stripped(&w2start, &w2end);
822
823 // We don't want to penalize a single guillemet, hyphen, etc.
824 // But our bigram list doesn't have any information about punctuation.
825 if (w1start >= w1end) return word1.length() < 3;
826 if (w2start >= w2end) return word2.length() < 3;
827
828 const UNICHARSET& uchset = getUnicharset();
829 GenericVector<UNICHAR_ID> bigram_string;
830 bigram_string.reserve(w1end + w2end + 1);
831 for (int i = w1start; i < w1end; i++) {
832 const GenericVector<UNICHAR_ID>& normed_ids =
834 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
835 bigram_string.push_back(question_unichar_id_);
836 else
837 bigram_string += normed_ids;
838 }
839 bigram_string.push_back(UNICHAR_SPACE);
840 for (int i = w2start; i < w2end; i++) {
841 const GenericVector<UNICHAR_ID>& normed_ids =
843 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
844 bigram_string.push_back(question_unichar_id_);
845 else
846 bigram_string += normed_ids;
847 }
848 WERD_CHOICE normalized_word(&uchset, bigram_string.size());
849 for (int i = 0; i < bigram_string.size(); ++i) {
850 normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
851 0.0f);
852 }
853 return bigram_dawg_->word_in_dawg(normalized_word);
854}
@ UNICHAR_SPACE
Definition: unicharset.h:34
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:387

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE & word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 856 of file dict.cpp.

856 {
857 if (word.length() == 0) return NO_PERM;
858 int i;
859 WERD_CHOICE new_word(word.unicharset());
860 int last_index = word.length() - 1;
861 int new_len = 0;
862 for (i = 0; i <= last_index; ++i) {
863 UNICHAR_ID unichar_id = (word.unichar_id(i));
864 if (getUnicharset().get_ispunctuation(unichar_id)) {
865 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
866 } else if (!getUnicharset().get_isalpha(unichar_id) &&
867 !getUnicharset().get_isdigit(unichar_id)) {
868 return false; // neither punc, nor alpha, nor digit
869 } else if ((new_len = new_word.length()) == 0 ||
870 new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
871 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
872 }
873 }
874 for (i = 0; i < dawgs_.size(); ++i) {
875 if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
876 dawgs_[i]->word_in_dawg(new_word))
877 return true;
878 }
879 return false;
880}
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 488 of file dict.h.

488 {
489 WERD_CHOICE word(string, getUnicharset());
490 return valid_word(word);
491 }

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word) const
inline

Definition at line 481 of file dict.h.

481 {
482 return valid_word(word, false); // return NO_PERM for words with digits
483 }

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word,
bool  numbers_ok 
) const

Definition at line 778 of file dict.cpp.

778 {
779 const WERD_CHOICE* word_ptr = &word;
780 WERD_CHOICE temp_word(word.unicharset());
781 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
782 copy_hyphen_info(&temp_word);
783 temp_word += word;
784 word_ptr = &temp_word;
785 }
786 if (word_ptr->length() == 0) return NO_PERM;
787 // Allocate vectors for holding current and updated
788 // active_dawgs and initialize them.
789 auto* active_dawgs = new DawgPositionVector[2];
790 init_active_dawgs(&(active_dawgs[0]), false);
791 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
792 int last_index = word_ptr->length() - 1;
793 // Call letter_is_okay for each letter in the word.
794 for (int i = hyphen_base_size(); i <= last_index; ++i) {
795 if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
796 word_ptr->unichar_id(i), i == last_index)))
797 break;
798 // Swap active_dawgs, constraints with the corresponding updated vector.
799 if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
800 dawg_args.updated_dawgs = &(active_dawgs[0]);
801 ++(dawg_args.active_dawgs);
802 } else {
803 ++(dawg_args.updated_dawgs);
804 dawg_args.active_dawgs = &(active_dawgs[0]);
805 }
806 }
807 delete[] active_dawgs;
808 return valid_word_permuter(dawg_args.permuter, numbers_ok)
809 ? dawg_args.permuter
810 : NO_PERM;
811}
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE & word) const
inline

Definition at line 484 of file dict.h.

484 {
485 return valid_word(word, true); // return NUMBER_PERM for valid numbers
486 }

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inlinestatic

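Check all the DAWGs to see if this word is in any of them. As the listing below shows, this helper itself only inspects the permuter value: it returns true when perm is SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM or COMPOUND_PERM, or when perm is NUMBER_PERM and numbers_ok is true.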

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 474 of file dict.h.

474 {
475 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
476 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
477 perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
478 (numbers_ok && perm == NUMBER_PERM));
479 }
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NUMBER_PERM
Definition: ratngs.h:239

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 428 of file dict.h.

428{ return wildcard_unichar_id_; }

Member Data Documentation

◆ certainty_scale

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 627 of file dict.h.

◆ dawg_debug_level

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info, to 2 for more details, to 3 to see all the debug messages"

Definition at line 622 of file dict.h.

◆ doc_dict_certainty_threshold

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty for words that can be inserted into the document dictionary"

Definition at line 653 of file dict.h.

◆ doc_dict_pending_threshold

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 651 of file dict.h.

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 216 of file dict.h.

◆ hyphen_debug_level

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 623 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 372 of file dict.h.

◆ load_bigram_dawg

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 592 of file dict.h.

◆ load_freq_dawg

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 586 of file dict.h.

◆ load_number_dawg

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 590 of file dict.h.

◆ load_punc_dawg

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 589 of file dict.h.

◆ load_system_dawg

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 585 of file dict.h.

◆ load_unambig_dawg

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 587 of file dict.h.

◆ max_permuter_attempts

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different character choices to consider during permutation. This limit is especially useful when user patterns are specified, since overly generic patterns can result in dawg search exploring an overly large number of options."

Definition at line 658 of file dict.h.

◆ output_ambig_words_file

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 620 of file dict.h.

◆ params_model_classify_

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

Definition at line 418 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 384 of file dict.h.

◆ save_doc_words

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 649 of file dict.h.

◆ segment_nonalphabetic_script

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks. Set to true in the traineddata config file for scripts that are cursive or inherently fixed-pitch"

Definition at line 648 of file dict.h.

◆ segment_penalty_dict_case_bad

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have case issues (lower is better)."

Definition at line 609 of file dict.h.

◆ segment_penalty_dict_case_ok

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case (lower is better)."

Definition at line 605 of file dict.h.

◆ segment_penalty_dict_frequent_word

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and are frequent in the given language (lower is better)."

Definition at line 601 of file dict.h.

◆ segment_penalty_dict_nonword

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which do not match a dictionary word (lower is better)."

Definition at line 613 of file dict.h.

◆ segment_penalty_garbage

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in the dictionary and generally look like garbage (lower is better)."

Definition at line 618 of file dict.h.

◆ stopper_allowable_character_badness

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 637 of file dict.h.

◆ stopper_certainty_per_char

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 635 of file dict.h.

◆ stopper_debug_level

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 638 of file dict.h.

◆ stopper_no_acceptable_choices

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful when there is a need to explore all segmentations"

Definition at line 641 of file dict.h.

◆ stopper_nondict_certainty_base

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 629 of file dict.h.

◆ stopper_phase2_certainty_rejection_offset

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 631 of file dict.h.

◆ stopper_smallword_size

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 633 of file dict.h.

◆ tessedit_truncate_wordchoice_log

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 642 of file dict.h.

◆ use_only_first_uft8_step

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string when computing log probabilities."

Definition at line 626 of file dict.h.

◆ user_patterns_file

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 582 of file dict.h.

◆ user_patterns_suffix

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 584 of file dict.h.

◆ user_words_file

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."

Definition at line 578 of file dict.h.

◆ user_words_suffix

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 580 of file dict.h.

◆ word_to_debug

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information should be printed to stdout"

Definition at line 644 of file dict.h.

◆ xheight_penalty_inconsistent

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is inconsistent."

Definition at line 598 of file dict.h.

◆ xheight_penalty_subscripts

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts or superscripts in a word, but it is otherwise OK."

Definition at line 595 of file dict.h.


The documentation for this class was generated from the following files: