tesseract 4.1.1
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
bool | AcceptableChoiceFound () |
void | SetAcceptableChoiceFound (bool val) |
ParamsModel & | getParamsModel () |
Static Public Member Functions | |
static void | ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[]) |
Static Public Attributes | |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kDigitFlag = 0x8 |
static const LanguageModelFlagsType | kXhtConsistentFlag = 0x10 |
static const float | kMaxAvgNgramCost = 25.0f |
Protected Member Functions | |
float | CertaintyScore (float cert) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
bool | PrunablePath (const ViterbiStateEntry &vse) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs | dawg_args_ |
float | rating_cert_scale_ = 0.0f |
const UnicityTable< FontInfo > * | fontinfo_table_ = nullptr |
Dict * | dict_ = nullptr |
bool | fixed_pitch_ = false |
float | max_char_wh_ratio_ = 0.0f |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ = 0 |
DawgPositionVector | very_beginning_active_dawgs_ |
DawgPositionVector | beginning_active_dawgs_ |
bool | acceptable_choice_found_ = false |
bool | correct_segmentation_explored_ = false |
ParamsModel | params_model_ |
Definition at line 50 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 53 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 134 of file language_model.cpp.
|
inline |
Definition at line 103 of file language_model.h.
|
inline protected |
Definition at line 309 of file language_model.h.
|
protected |
Definition at line 561 of file language_model.cpp.
|
inline protected |
Definition at line 112 of file language_model.h.
|
protected |
Definition at line 1199 of file language_model.cpp.
|
inline protected |
Definition at line 124 of file language_model.h.
|
inline protected |
Definition at line 280 of file language_model.h.
|
inline protected |
Definition at line 135 of file language_model.h.
|
protected |
Definition at line 994 of file language_model.cpp.
|
protected |
Definition at line 934 of file language_model.cpp.
|
protected |
Definition at line 1390 of file language_model.cpp.
|
static |
Definition at line 1341 of file language_model.cpp.
|
protected |
Definition at line 1015 of file language_model.cpp.
|
protected |
Definition at line 785 of file language_model.cpp.
|
protected |
Definition at line 877 of file language_model.cpp.
|
protected |
Definition at line 769 of file language_model.cpp.
|
protected |
Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.
Definition at line 500 of file language_model.cpp.
|
inline |
Definition at line 108 of file language_model.h.
|
protected |
Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.
Definition at line 383 of file language_model.cpp.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale | ||
) |
Definition at line 136 of file language_model.cpp.
|
inline protected |
Definition at line 299 of file language_model.h.
|
inline |
Definition at line 104 of file language_model.h.
|
protected |
Forces there to be at least one entry, in the overall set of the viterbi_state_entries of each element of parent_node, that has the top_choice_flag set for each of lower, upper and digit, using the same rules as GetTopLowerUpperDigit. The flag is set on the first suitable candidate found, whether or not the flag is already set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
Definition at line 423 of file language_model.cpp.
|
protected |
Definition at line 1241 of file language_model.cpp.
bool tesseract::LanguageModel::UpdateState | ( | bool | just_classified, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
LanguageModelState * | parent_node, | ||
LMPainPoints * | pain_points, | ||
WERD_RES * | word_res, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.
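This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:
- paths that are liked by the language model: either a DAWG or the n-gram model, where active.
- paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.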
GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
Of course, it isn't quite that simple: Title Case is handled by allowing lower case to continue an upper case initial, but this has to be detected in the combiner so that it knows which upper case letters are initial alphas.
Definition at line 253 of file language_model.cpp.
|
protected |
Definition at line 416 of file language_model.h.
|
protected |
Definition at line 404 of file language_model.h.
|
protected |
Definition at line 418 of file language_model.h.
|
protected |
Definition at line 372 of file language_model.h.
|
protected |
Definition at line 383 of file language_model.h.
|
protected |
Definition at line 390 of file language_model.h.
|
protected |
Definition at line 379 of file language_model.h.
|
static |
Definition at line 56 of file language_model.h.
|
static |
Definition at line 54 of file language_model.h.
|
static |
Definition at line 61 of file language_model.h.
|
static |
Definition at line 53 of file language_model.h.
|
static |
Definition at line 55 of file language_model.h.
|
static |
Definition at line 57 of file language_model.h.
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
Definition at line 316 of file language_model.h.
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
Definition at line 343 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 330 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 318 of file language_model.h.
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
Definition at line 320 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0 |
"Factor to bring log-probs into the same range as ratings when multiplied by outline length"
Definition at line 339 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"
Definition at line 336 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"
Definition at line 328 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
"Words are delimited by space"
Definition at line 341 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
Definition at line 333 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
Definition at line 352 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"
Definition at line 356 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
Definition at line 358 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
Definition at line 361 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
Definition at line 348 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 346 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
Definition at line 350 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"
Definition at line 354 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
Definition at line 360 of file language_model.h.
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
"Use sigmoidal score for certainty"
Definition at line 364 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 323 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 325 of file language_model.h.
|
protected |
Definition at line 393 of file language_model.h.
|
protected |
Definition at line 421 of file language_model.h.
|
protected |
Definition at line 400 of file language_model.h.
|
protected |
Definition at line 401 of file language_model.h.
|
protected |
Definition at line 374 of file language_model.h.
|
protected |
Definition at line 403 of file language_model.h.
int tesseract::LanguageModel::wordrec_display_segmentations = 0 |
"Display Segmentations"
Definition at line 362 of file language_model.h.