tesseract 4.1.1
Loading...
Searching...
No Matches
language_model.h
Go to the documentation of this file.
1
2// File: language_model.h
3// Description: Functions that utilize the knowledge about the properties,
4// structure and statistics of the language to help segmentation
5// search.
6// Author: Daria Antonova
7//
8// (C) Copyright 2009, Google Inc.
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12// http://www.apache.org/licenses/LICENSE-2.0
13// Unless required by applicable law or agreed to in writing, software
14// distributed under the License is distributed on an "AS IS" BASIS,
15// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16// See the License for the specific language governing permissions and
17// limitations under the License.
18//
20
21#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
22#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23
24#include <cmath> // for exp
25#include "associate.h" // for AssociateStats (ptr only), AssociateUtils
26#include "dawg.h" // for DawgPositionVector
27#include "dict.h" // for DawgArgs, Dict
28#include "lm_consistency.h" // for LMConsistencyInfo
29#include "lm_state.h" // for ViterbiStateEntry, LanguageModelFlagsType
30#include "params.h" // for DoubleParam, double_VAR_H, IntParam, Boo...
31#include "params_model.h" // for ParamsModel
32#include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...
33#include "stopper.h" // for DANGERR
34#include "strngs.h" // for STRING
35
36class UNICHARSET;
37class WERD_RES;
38
39struct BlamerBundle;
40
41template <typename T> class UnicityTable;
42
43namespace tesseract {
44
45class LMPainPoints;
46struct FontInfo;
47
48// This class contains the data structures and functions necessary
49// to represent and use the knowledge about the language.
51 public:
52 // Masks for keeping track of top choices that should not be pruned out.
58
59 // Denominator for normalizing per-letter ngram cost when deriving
60 // penalty adjustments.
61 static const float kMaxAvgNgramCost;
62
63 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
65
66 // Fills the given floats array with features extracted from path represented
67 // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
68 // for feature information.
69 // Note: the function assumes that features points to an array of size
70 // PTRAIN_NUM_FEATURE_TYPES.
71 static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
72 float features[]);
73
74 // Updates data structures that are used for the duration of the segmentation
75 // search on the current word;
76 void InitForWord(const WERD_CHOICE *prev_word,
77 bool fixed_pitch, float max_char_wh_ratio,
78 float rating_cert_scale);
79
80 // Updates language model state of the given BLOB_CHOICE_LIST (from
81// the ratings matrix) and its parent. Updates pain_points if new
82 // problematic points are found in the segmentation graph.
83 //
84 // At most language_model_viterbi_list_size are kept in each
85 // LanguageModelState.viterbi_state_entries list.
86 // At most language_model_viterbi_list_max_num_prunable of those are prunable
87 // (non-dictionary) paths.
88 // The entries that represent dictionary word paths are kept at the front
89 // of the list.
90// The list is ordered by cost, which is computed collectively by several
91 // language model components (currently dawg and ngram components).
92 bool UpdateState(
93 bool just_classified,
94 int curr_col, int curr_row,
95 BLOB_CHOICE_LIST *curr_list,
96 LanguageModelState *parent_node,
97 LMPainPoints *pain_points,
98 WERD_RES *word_res,
99 BestChoiceBundle *best_choice_bundle,
100 BlamerBundle *blamer_bundle);
101
102 // Returns true if an acceptable best choice was discovered.
104 inline void SetAcceptableChoiceFound(bool val) {
106 }
107 // Returns the reference to ParamsModel.
109
110 protected:
111
112 inline float CertaintyScore(float cert) {
114 // cert is assumed to be between 0 and -dict_->certainty_scale.
115 // If you enable language_model_use_sigmoidal_certainty, you
116 // need to adjust language_model_ngram_nonmatch_score as well.
117 cert = -cert / dict_->certainty_scale;
118 return 1.0f / (1.0f + exp(10.0f * cert));
119 } else {
120 return (-1.0f / cert);
121 }
122 }
123
// Maps a count of inconsistencies (num_problems) to a rating penalty:
// zero problems costs nothing, one problem costs the given base penalty,
// and each further problem adds language_model_penalty_increment on top
// of the base penalty (i.e. penalty + increment * (num_problems - 1)).
124 inline float ComputeAdjustment(int num_problems, float penalty) {
125 if (num_problems == 0) return 0.0f;
126 if (num_problems == 1) return penalty;
127 return (penalty + (language_model_penalty_increment *
128 static_cast<float>(num_problems-1)));
129 }
130
131 // Computes the adjustment to the ratings sum based on the given
132 // consistency_info. The paths with invalid punctuation, inconsistent
133 // case and character type are penalized proportionally to the number
134 // of inconsistencies on the path.
136 const LanguageModelDawgInfo *dawg_info,
137 const LMConsistencyInfo &consistency_info) {
138 if (dawg_info != nullptr) {
139 return ComputeAdjustment(consistency_info.NumInconsistentCase(),
141 (consistency_info.inconsistent_script ?
143 }
144 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
146 ComputeAdjustment(consistency_info.NumInconsistentCase(),
150 ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
152 (consistency_info.inconsistent_script ?
154 (consistency_info.inconsistent_font ?
156 }
157
158 // Returns an adjusted ratings sum that includes inconsistency penalties,
159 // penalties for non-dictionary paths and paths with dips in ngram
160 // probability.
162
163 // Finds the first lower and upper case letter and first digit in curr_list.
164 // Uses the first character in the list in place of empty results.
165 // Returns true if both alpha and digits are found.
166 bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
167 BLOB_CHOICE **first_lower,
168 BLOB_CHOICE **first_upper,
169 BLOB_CHOICE **first_digit) const;
170 // Forces there to be at least one entry in the overall set of the
171 // viterbi_state_entries of each element of parent_node that has the
172 // top_choice_flag set for lower, upper and digit using the same rules as
173 // GetTopLowerUpperDigit, setting the flag on the first found suitable
174 // candidate, whether or not the flag is set on some other parent.
175 // Returns 1 if both alpha and digits are found among the parents, -1 if no
176 // parents are found at all (a legitimate case), and 0 otherwise.
177 int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
178
179 // Finds the next ViterbiStateEntry with which the given unichar_id can
180 // combine sensibly, taking into account any mixed alnum/mixed case
181 // situation, and whether this combination has been inspected before.
183 bool just_classified, bool mixed_alnum,
184 const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
185 const UNICHARSET& unicharset, WERD_RES* word_res,
186 ViterbiStateEntry_IT* vse_it,
187 LanguageModelFlagsType* top_choice_flags) const;
188 // Helper function that computes the cost of the path composed of the
189 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
190 // If the new path looks good enough, adds a new ViterbiStateEntry to the
191 // list of viterbi entries in the given BLOB_CHOICE and returns true.
193 LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
194 int curr_col, int curr_row, BLOB_CHOICE *b,
195 LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
196 LMPainPoints *pain_points, WERD_RES *word_res,
197 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
198
199 // Determines whether a potential entry is a true top choice and
200 // updates changed accordingly.
201 //
202 // Note: The function assumes that b, top_choice_flags and changed
203 // are not nullptr.
205 const ViterbiStateEntry *parent_vse,
206 LanguageModelState *lms);
207
208 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
209 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
210 // with updated active dawgs, constraints and permuter.
211 //
212 // Note: the caller is responsible for deleting the returned pointer.
214 int curr_col, int curr_row,
215 const BLOB_CHOICE &b,
216 const ViterbiStateEntry *parent_vse);
217
218 // Computes p(unichar | parent context) and records it in ngram_cost.
219 // If b.unichar_id() is an unlikely continuation of the parent context
220 // sets found_small_prob to true and returns nullptr.
221 // Otherwise creates a new LanguageModelNgramInfo entry containing the
222 // updated context (that includes b.unichar_id() at the end) and returns it.
223 //
224 // Note: the caller is responsible for deleting the returned pointer.
226 const char *unichar, float certainty, float denom,
227 int curr_col, int curr_row, float outline_length,
228 const ViterbiStateEntry *parent_vse);
229
230 // Computes -(log(prob(classifier)) + log(prob(ngram model)))
231 // for the given unichar in the given context. If there are multiple
232 // unichars at one position - takes the average of their probabilities.
233 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
234 // since probability_in_context() can only handle one at a time (while
235 // unicharset might contain ngrams and glyphs composed from multiple UTF8
236 // characters).
237 float ComputeNgramCost(const char *unichar, float certainty, float denom,
238 const char *context, int *unichar_step_len,
239 bool *found_small_prob, float *ngram_prob);
240
241 // Computes the normalization factors for the classifier confidences
242 // (used by ComputeNgramCost()).
243 float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
244
245 // Fills the given consistency_info based on parent_vse.consistency_info
246 // and on the consistency of the given unichar_id with parent_vse.
248 int curr_col, bool word_end, BLOB_CHOICE *b,
249 ViterbiStateEntry *parent_vse,
250 WERD_RES *word_res,
251 LMConsistencyInfo *consistency_info);
252
253 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
254 // on the path represented by the given BLOB_CHOICE and language model
255 // state entries (lmse, dse). The path is re-constructed by following
256 // the parent pointers in the lang model state entries). If the
257 // constructed WERD_CHOICE is better than the best/raw choice recorded
258 // in the best_choice_bundle, this function updates the corresponding
259 // fields and sets best_choice_bundle->updated to true.
261 LMPainPoints *pain_points,
262 WERD_RES *word_res,
263 BestChoiceBundle *best_choice_bundle,
264 BlamerBundle *blamer_bundle);
265
266 // Constructs a WERD_CHOICE by tracing parent pointers starting with
267 // the given LanguageModelStateEntry. Returns the constructed word.
268 // Updates best_char_choices, certainties and state if they are not
269 // nullptr (best_char_choices and certainties are assumed to have the
270 // length equal to lmse->length).
271 // The caller is responsible for freeing memory associated with the
272 // returned WERD_CHOICE.
274 WERD_RES *word_res,
275 DANGERR *fixpt,
276 BlamerBundle *blamer_bundle,
277 bool *truth_path);
278
279 // Wrapper around AssociateUtils::ComputeStats().
280 inline void ComputeAssociateStats(int col, int row,
281 float max_char_wh_ratio,
282 ViterbiStateEntry *parent_vse,
283 WERD_RES *word_res,
284 AssociateStats *associate_stats) {
286 col, row,
287 (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
288 (parent_vse != nullptr) ? parent_vse->length : 0,
289 fixed_pitch_, max_char_wh_ratio,
290 word_res, language_model_debug_level > 2, associate_stats);
291 }
292
293 // Returns true if the path with such top_choice_flags and dawg_info
294 // could be pruned out (i.e. is neither a system/user/frequent dictionary
295 // nor a top choice path).
296 // In non-space delimited languages all paths can be "somewhat" dictionary
297 // words. In such languages we can not do dictionary-driven path pruning,
298 // so paths with non-empty dawg_info are considered prunable.
299 inline bool PrunablePath(const ViterbiStateEntry &vse) {
300 if (vse.top_choice_flags) return false;
301 if (vse.dawg_info != nullptr &&
304 vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
305 return true;
306 }
307
308 // Returns true if the given ViterbiStateEntry represents an acceptable path.
// A path is acceptable if any one of the following holds:
//  - it is a dictionary path (vse.dawg_info is attached), or
//  - vse.Consistent() — consistent per LMConsistencyInfo (presumably
//    case/script/punctuation; confirm against lm_consistency.h), or
//  - it carries ngram model info and was not pruned by the ngram model.
309 inline bool AcceptablePath(const ViterbiStateEntry &vse) {
310 return (vse.dawg_info != nullptr || vse.Consistent() ||
311 (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
312 }
313
314 public:
315 // Parameters.
316 INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
318 "Turn on/off the use of character ngram model");
320 "Maximum order of the character ngram model");
322 "Maximum number of prunable (those for which PrunablePath() is"
323 " true) entries in each viterbi list recorded in BLOB_CHOICEs");
325 "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
327 "To avoid overly small denominators use this as the floor"
328 " of the probability returned by the ngram model");
330 "Average classifier score of a non-matching unichar");
332 "Use only the first UTF8 step of the given string"
333 " when computing log probabilities");
335 "Strength of the character ngram model relative to the"
336 " character classifier ");
338 "Factor to bring log-probs into the same range as ratings"
339 " when multiplied by outline length ");
341 "Words are delimited by space");
343 "Minimum length of compound words");
344 // Penalties used for adjusting path costs and final word rating.
346 "Penalty for words not in the frequent word dictionary");
348 "Penalty for non-dictionary words");
350 "Penalty for inconsistent punctuation");
352 "Penalty for inconsistent case");
354 "Penalty for inconsistent script");
356 "Penalty for inconsistent character type");
358 "Penalty for inconsistent font");
360 "Penalty for inconsistent spacing");
362 INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
364 "Use sigmoidal score for certainty");
365
366
367 protected:
368 // Member Variables.
369
370 // Temporary DawgArgs struct that is re-used across different words to
371 // avoid dynamic memory re-allocation (should be cleared before each use).
373 // Scaling for recovering blob outline length from rating and certainty.
374 float rating_cert_scale_ = 0.0f;
375
376 // The following variables are set at construction time.
377
378 // Pointer to fontinfo table (not owned by LanguageModel).
380
381 // Pointer to Dict class, that is used for querying the dictionaries
382 // (the pointer is not owned by LanguageModel).
383 Dict* dict_ = nullptr;
384
385 // TODO(daria): the following variables should become LanguageModel params
386 // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
387 //
388 // Set to true if we are dealing with fixed pitch text
389 // (set to assume_fixed_pitch_char_segment).
390 bool fixed_pitch_ = false;
391 // Max char width-to-height ratio allowed
392 // (set to segsearch_max_char_wh_ratio).
393 float max_char_wh_ratio_ = 0.0f;
394
395 // The following variables are initialized with InitForWord().
396
397 // String representation of the classification of the previous word
398 // (since this is only used by the character ngram model component,
399 // only the last language_model_ngram_order of the word are stored).
402 // Active dawg vector.
405 // Set to true if acceptable choice was discovered.
406 // Note: it would be nice to use this to terminate the search once an
407 // acceptable choices is found. However we do not do that and once an
408 // acceptable choice is found we finish looking for alternative choices
409 // in the current segmentation graph and then exit the search (no more
410 // classifications are done after an acceptable choice is found).
411 // This is needed in order to let the search find the words very close to
412 // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
413 // choices. This way the stopper will know that the best choice is not
414 // ambiguous (i.e. there are best choices in the best choice list that have
415 // ratings close to the very best one) and will be less likely to mis-adapt.
417 // Set to true if a choice representing correct segmentation was explored.
419
420 // Params model containing weights for computing ViterbiStateEntry costs.
422};
423
424} // namespace tesseract
425
426#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:297
#define INT_VAR_H(name, val, comment)
Definition: params.h:295
#define double_VAR_H(name, val, comment)
Definition: params.h:301
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
Definition: strngs.h:45
double certainty_scale
Definition: dict.h:627
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34
void SetAcceptableChoiceFound(bool val)
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
DawgPositionVector beginning_active_dawgs_
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kXhtConsistentFlag
float ComputeAdjustment(int num_problems, float penalty)
static const LanguageModelFlagsType kSmallestRatingFlag
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
static const LanguageModelFlagsType kDigitFlag
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
bool AcceptablePath(const ViterbiStateEntry &vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
ParamsModel & getParamsModel()
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
static const LanguageModelFlagsType kLowerCaseFlag
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double language_model_ngram_nonmatch_score
double language_model_penalty_non_dict_word
bool language_model_ngram_use_only_first_uft8_step
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
static const float kMaxAvgNgramCost
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
bool language_model_ngram_space_delimited_language
DawgPositionVector very_beginning_active_dawgs_
static const LanguageModelFlagsType kUpperCaseFlag
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
float CertaintyScore(float cert)
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
const UnicityTable< FontInfo > * fontinfo_table_
double language_model_penalty_non_freq_dict_word
int language_model_viterbi_list_max_num_prunable
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:166
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:188
int length
number of characters on the path
Definition: lm_state.h:185
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:170
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:192
Struct to store information maintained by various language model components.
Definition: lm_state.h:200
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:222