tesseract 4.1.1
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_ = 0.0f
 
const UnicityTable< FontInfo > * fontinfo_table_ = nullptr
 
Dict * dict_ = nullptr
 
bool fixed_pitch_ = false
 
float max_char_wh_ratio_ = 0.0f
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_ = 0
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_ = false
 
bool correct_segmentation_explored_ = false
 
ParamsModel params_model_
 

Detailed Description

Definition at line 50 of file language_model.h.
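
The class implements the language model used by Tesseract's word recognizer during segmentation search: InitForWord() primes the dictionary and ngram context for a new word, UpdateState() is then called per ratings-matrix cell to extend the Viterbi search, and AcceptableChoiceFound() reports whether a satisfactory word choice was produced. A hypothetical driver sketch, not from the Tesseract sources: the loop bounds, the GetParentState() helper, and the surrounding variables are invented for illustration; the real caller is the segmentation search in wordrec.

LanguageModel lm(&fontinfo_table, &dict);
lm.InitForWord(prev_word, /*fixed_pitch=*/false,
               /*max_char_wh_ratio=*/2.0f, /*rating_cert_scale=*/-1.0f);
MATRIX *ratings = word_res->ratings;
for (int col = 0; col < ratings->dimension(); ++col) {
  // parent_node is the LanguageModelState built for the previous column,
  // or nullptr at the start of the word (hypothetical helper).
  LanguageModelState *parent_node = GetParentState(col);
  for (int row = col; row < ratings->dimension(); ++row) {
    BLOB_CHOICE_LIST *choices = ratings->get(col, row);
    if (choices == nullptr) continue;
    lm.UpdateState(/*just_classified=*/true, col, row, choices, parent_node,
                   pain_points, word_res, &best_choice_bundle, blamer_bundle);
  }
}
if (lm.AcceptableChoiceFound()) {
  // best_choice_bundle now holds the best scoring WERD_CHOICE.
}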

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 53 of file language_model.cpp.

    : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
                 dict->getCCUtil()->params()),
      BOOL_INIT_MEMBER(language_model_ngram_on, false,
                       "Turn on/off the use of character ngram model",
                       dict->getCCUtil()->params()),
      INT_MEMBER(language_model_ngram_order, 8,
                 "Maximum order of the character ngram model",
                 dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
                 "Maximum number of prunable (those for which"
                 " PrunablePath() is true) entries in each viterbi list"
                 " recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_size, 500,
                 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_small_prob, 0.000001,
                    "To avoid overly small denominators use this as the "
                    "floor of the probability returned by the ngram model.",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                    "Average classifier score of a non-matching unichar.",
                    dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_scale_factor, 0.03,
                    "Strength of the character ngram model relative to the"
                    " character classifier ",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_rating_factor, 16.0,
                    "Factor to bring log-probs into the same range as ratings"
                    " when multiplied by outline length ",
                    dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                  "Words are delimited by space", dict->getCCUtil()->params()),
      INT_MEMBER(language_model_min_compound_length, 3,
                 "Minimum length of compound words",
                 dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                    "Penalty for words not in the frequent word dictionary",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                    "Penalty for non-dictionary words",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_punc, 0.2,
                    "Penalty for inconsistent punctuation",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_case, 0.1,
                    "Penalty for inconsistent case",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_script, 0.5,
                    "Penalty for inconsistent script",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_chartype, 0.3,
                    "Penalty for inconsistent character type",
                    dict->getCCUtil()->params()),
      // TODO(daria, rays): enable font consistency checking
      // after improving font analysis.
      double_MEMBER(language_model_penalty_font, 0.00,
                    "Penalty for inconsistent font",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_spacing, 0.05,
                    "Penalty for inconsistent spacing",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
                    dict->getCCUtil()->params()),
      INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
                 dict->getCCUtil()->params()),
      BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                       "Use sigmoidal score for certainty",
                       dict->getCCUtil()->params()),
      dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
      fontinfo_table_(fontinfo_table),
      dict_(dict) {
  ASSERT_HOST(dict_ != nullptr);
}
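
Each *_MEMBER macro in the initializer list above both initializes the named member and registers it, under its own name, in the ParamsVectors returned by dict->getCCUtil()->params(); that registration is what makes these knobs settable from config files and the command line. A rough expansion sketch, assuming the params.h conventions (the exact per-type definitions may pass additional arguments):

// Hypothetical simplification of params.h, for illustration only:
#define INT_MEMBER(name, val, comment, vec) name(val, #name, comment, vec)
// INT_MEMBER(language_model_debug_level, 0, "Language model debug level", vec)
// then expands to the member-initializer
// language_model_debug_level(0, "language_model_debug_level",
//                            "Language model debug level", vec).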

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 134 of file language_model.cpp.

{ delete dawg_args_.updated_dawgs; }

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 103 of file language_model.h.

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inline protected

Definition at line 309 of file language_model.h.

{
  return (vse.dawg_info != nullptr || vse.Consistent() ||
          (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
}

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 561 of file language_model.cpp.

{
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 1) {
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags);
    if (language_model_debug_level > 5)
      tprintf(" parent_vse=%p\n", parent_vse);
    else
      tprintf("\n");
  }
  ASSERT_HOST(curr_state != nullptr);
  // Check whether the list is full.
  if (curr_state->viterbi_state_entries_length >=
      language_model_viterbi_list_max_size) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return false;
  }

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
      GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);

  float outline_length =
      AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = nullptr;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()),
        b->certainty(), denom, curr_col, curr_row, outline_length, parent_vse);
    ASSERT_HOST(ngram_info != nullptr);
  }
  bool liked_by_language_model = dawg_info != nullptr ||
      (ngram_info != nullptr && !ngram_info->pruned);
  // Quick escape if not liked by the language model, can't be consistent
  // xheight, and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components very early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LMConsistencyInfo consistency_info(
    parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
  // Start with just the x-height consistency, as it provides significant
  // pruning opportunity.
  consistency_info.ComputeXheightConsistency(
      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
  // Turn off xheight consistent flag if not consistent.
  if (consistency_info.InconsistentXHeight()) {
    top_choice_flags &= ~kXhtConsistentFlag;
  }

  // Quick escape if not liked by the language model, not consistent xheight,
  // and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Compute the rest of the consistency info.
  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
                      word_res, &consistency_info);
  if (dawg_info != nullptr && consistency_info.invalid_punc) {
    consistency_info.invalid_punc = false;  // do not penalize dict words
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, word_res, &associate_stats);
  if (parent_vse != nullptr) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
  auto *new_vse = new ViterbiStateEntry(
      parent_vse, b, 0.0, outline_length,
      consistency_info, associate_stats, top_choice_flags, dawg_info,
      ngram_info, (language_model_debug_level > 0) ?
          dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);
  if (language_model_debug_level >= 3)
    tprintf("Adjusted cost = %g\n", new_vse->cost);

  // Invoke Top Choice language model component to make the final adjustments
  // to new_vse->top_choice_flags.
  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
    GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
  }

  // If language model components did not like this unichar - return.
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
  if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
      consistency_info.inconsistent_script) {       // with inconsistent script
    keep = false;
  }
  if (!keep) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components did not like this entry\n");
    }
    delete new_vse;
    return false;
  }

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(*new_vse) &&
      (curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              new_vse->cost,
              curr_state->viterbi_state_entries_prunable_max_cost);
    }
    delete new_vse;
    return false;
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(new_vse, pain_points, word_res,
                     best_choice_bundle, blamer_bundle);
    // Discard the entry if UpdateBestChoice() found flaws in it.
    if (new_vse->cost >= WERD_CHOICE::kBadRating &&
        new_vse != best_choice_bundle->best_vse) {
      if (language_model_debug_level > 1) {
        tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      }
      delete new_vse;
      return false;
    }
  }

  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                               false, new_vse);
  curr_state->viterbi_state_entries_length++;
  if (PrunablePath(*new_vse)) {
    curr_state->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with cost higher than new_vse->cost.
  if ((curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) ||
      new_vse->top_choice_flags) {
    ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(curr_state->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have cost higher than new_entry->cost
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          curr_vse->cost > new_vse->cost) {
        curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
      }
      if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
      // Update curr_state->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                  curr_state->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    new_vse->Print("New");
    if (language_model_debug_level > 5)
      curr_state->Print("Updated viterbi list");
  }

  return true;
}

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inline protected

Definition at line 112 of file language_model.h.

{
  if (language_model_use_sigmoidal_certainty) {
    // cert is assumed to be between 0 and -dict_->certainty_scale.
    // If you enable language_model_use_sigmoidal_certainty, you
    // need to adjust language_model_ngram_nonmatch_score as well.
    cert = -cert / dict_->certainty_scale;
    return 1.0f / (1.0f + exp(10.0f * cert));
  } else {
    return (-1.0f / cert);
  }
}
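
For intuition (worked numbers only, assuming the stock dict_->certainty_scale of 20.0): a certainty of -4 maps to -1.0f / -4 = 0.25 on the default linear branch, while the sigmoidal branch rescales it to -(-4)/20 = 0.2 and returns 1/(1 + exp(2)) ≈ 0.12. Either way, a more negative (worse) certainty yields a smaller score.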

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1199 of file language_model.cpp.

{
  ASSERT_HOST(vse != nullptr);
  if (params_model_.Initialized()) {
    float features[PTRAIN_NUM_FEATURE_TYPES];
    ExtractFeaturesFromPath(*vse, features);
    float cost = params_model_.ComputeCost(features);
    if (language_model_debug_level >= 3) {
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      if (language_model_debug_level >= 5) {
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
        }
      }
    }
    return cost * vse->outline_length;
  } else {
    float adjustment = 1.0f;
    if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
      adjustment += language_model_penalty_non_freq_dict_word;
    }
    if (vse->dawg_info == nullptr) {
      adjustment += language_model_penalty_non_dict_word;
      if (vse->length > language_model_min_compound_length) {
        adjustment += ((vse->length - language_model_min_compound_length) *
                       language_model_penalty_increment);
      }
    }
    if (vse->associate_stats.shape_cost > 0) {
      adjustment += vse->associate_stats.shape_cost /
          static_cast<float>(vse->length);
    }
    if (language_model_ngram_on) {
      ASSERT_HOST(vse->ngram_info != nullptr);
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
    } else {
      adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                                 vse->consistency_info);
      return vse->ratings_sum * adjustment;
    }
  }
}

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inline protected

Definition at line 124 of file language_model.h.

{
  if (num_problems == 0) return 0.0f;
  if (num_problems == 1) return penalty;
  return (penalty + (language_model_penalty_increment *
                     static_cast<float>(num_problems - 1)));
}
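
For example, with language_model_penalty_case = 0.1 and language_model_penalty_increment = 0.01, one case problem costs 0.1 and three problems cost 0.1 + 0.01 * 2 = 0.12: the first problem pays the full penalty and each additional problem adds only the small increment.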

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inline protected

Definition at line 280 of file language_model.h.

{
  AssociateUtils::ComputeStats(
      col, row,
      (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
      (parent_vse != nullptr) ? parent_vse->length : 0,
      fixed_pitch_, max_char_wh_ratio,
      word_res, language_model_debug_level > 2, associate_stats);
}

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inline protected

Definition at line 135 of file language_model.h.

{
  if (dawg_info != nullptr) {
    return ComputeAdjustment(consistency_info.NumInconsistentCase(),
                             language_model_penalty_case) +
        (consistency_info.inconsistent_script ?
         language_model_penalty_script : 0.0f);
  }
  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
                            language_model_penalty_punc) +
          ComputeAdjustment(consistency_info.NumInconsistentCase(),
                            language_model_penalty_case) +
          ComputeAdjustment(consistency_info.NumInconsistentChartype(),
                            language_model_penalty_chartype) +
          ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
                            language_model_penalty_spacing) +
          (consistency_info.inconsistent_script ?
           language_model_penalty_script : 0.0f) +
          (consistency_info.inconsistent_font ?
           language_model_penalty_font : 0.0f));
}

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 994 of file language_model.cpp.

{
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != nullptr);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
      CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}
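
Illustrative numbers (invented for this example): if the choice list holds 3 entries whose certainty scores sum to 0.9 and the unicharset has 111 entries, then with the default language_model_ngram_nonmatch_score of -40.0 and the default linear CertaintyScore, each "missing" classification contributes -1.0f / -40.0 = 0.025, so denom = 0.9 + 108 * 0.025 = 3.6.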

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 934 of file language_model.cpp.

{
  const char *context_ptr = context;
  char *modified_context = nullptr;
  char *modified_context_end = nullptr;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from str,
    // unless use_only_first_uft8_step is true.
    if (unichar_ptr < unichar_end) {
      if (modified_context == nullptr) {
        size_t context_len = strlen(context);
        modified_context =
            new char[context_len + strlen(unichar_ptr) + step + 1];
        memcpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
    prob = language_model_ngram_small_prob;
  }
  *ngram_cost = -1.0 * log2(prob);
  float ngram_and_classifier_cost =
      -1.0 * log2(CertaintyScore(certainty) / denom) +
      *ngram_cost * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty) / denom, prob,
            ngram_and_classifier_cost);
  }
  delete[] modified_context;
  return ngram_and_classifier_cost;
}
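
Written out, the cost returned above is

  ngram_and_classifier_cost = -log2(CertaintyScore(certainty) / denom)
                              + ngram_cost * language_model_ngram_scale_factor

where ngram_cost = -log2(prob) and prob is the length-normalized probability from the ngram model, floored at language_model_ngram_small_prob. Classifier evidence and ngram evidence are thus combined in the same log2 domain, with language_model_ngram_scale_factor (default 0.03) weighting the ngram term.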

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1390 of file language_model.cpp.

{
  if (truth_path != nullptr) {
    *truth_path =
        (blamer_bundle != nullptr &&
         vse->length == blamer_bundle->correct_segmentation_length());
  }
  BLOB_CHOICE *curr_b = vse->curr_b;
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }

  // Construct a WERD_CHOICE by tracing parent pointers.
  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length - 1); i >= 0; --i) {
    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
      *truth_path = false;
    }
    // The number of blobs used for this choice is row - col + 1.
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
      }
    }

    // Mark the word as compound if compound permuter was set for any of
    // the unichars on the path (usually this will happen for unichars
    // that are compounding operators, like "-" and "/").
    if (!compound && curr_vse->dawg_info &&
        curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;

    // Update curr_* pointers.
    curr_vse = curr_vse->parent_vse;
    if (curr_vse == nullptr) break;
    curr_b = curr_vse->curr_b;
  }
  ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
  // Re-adjust shape cost to include the updated width-to-height variance.
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }

  word->set_rating(vse->ratings_sum);
  word->set_certainty(vse->min_certainty);
  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                      vse->consistency_info.BodyMaxXHeight());
  if (vse->dawg_info != nullptr) {
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
    word->set_permuter(NGRAM_PERM);
  } else if (vse->top_choice_flags) {
    word->set_permuter(TOP_CHOICE_PERM);
  } else {
    word->set_permuter(NO_PERM);
  }
  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                            word_res->ratings));
  return word;
}

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry &  vse,
float  features[] 
)
static

Definition at line 1341 of file language_model.cpp.

{
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
  // Record dictionary match info.
  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
      vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  if (vse.dawg_info != nullptr) {
    int permuter = vse.dawg_info->permuter;
    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
      if (vse.consistency_info.num_digits == vse.length) {
        features[PTRAIN_DIGITS_SHORT+len] = 1.0;
      } else {
        features[PTRAIN_NUM_SHORT+len] = 1.0;
      }
    } else if (permuter == DOC_DAWG_PERM) {
      features[PTRAIN_DOC_SHORT+len] = 1.0;
    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
               permuter == COMPOUND_PERM) {
      features[PTRAIN_DICT_SHORT+len] = 1.0;
    } else if (permuter == FREQ_DAWG_PERM) {
      features[PTRAIN_FREQ_SHORT+len] = 1.0;
    }
  }
  // Record shape cost feature (normalized by path length).
  features[PTRAIN_SHAPE_COST_PER_CHAR] =
      vse.associate_stats.shape_cost / static_cast<float>(vse.length);
  // Record ngram cost (normalized by the path length).
  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
  if (vse.ngram_info != nullptr) {
    features[PTRAIN_NGRAM_COST_PER_CHAR] =
        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
  }
  // Record consistency-related features.
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
      vse.consistency_info.NumInconsistentChartype() : 0.0;
  features[PTRAIN_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;

  // Classifier-related features.
  features[PTRAIN_RATING_PER_CHAR] =
      vse.ratings_sum / static_cast<float>(vse.outline_length);
}

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
LMConsistencyInfo *  consistency_info 
)
protected

Definition at line 1015 of file language_model.cpp.

{
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      bool is_apos = dict_->is_apostrophe(unichar_id);
      bool prev_is_numalpha = (parent_b != nullptr &&
          (unicharset.get_isalpha(parent_b->unichar_id()) ||
           unicharset.get_isdigit(parent_b->unichar_id())));
      UNICHAR_ID pattern_unichar_id =
        (unicharset.get_isalpha(unichar_id) ||
         unicharset.get_isdigit(unichar_id) ||
         (is_apos && prev_is_numalpha)) ?
        Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
          (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
              node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  }
  else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != nullptr &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
      bool expected_gap_found = false;
      float expected_gap = 0.0f;
      int temp_gap;
      if (fontinfo_id >= 0) {  // found a common font
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
        if (fontinfo_table_->get(fontinfo_id).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap = temp_gap;
          expected_gap_found = true;
        }
      } else {
        consistency_info->inconsistent_font = true;
        // Get an average of the expected gaps in each font
        int num_addends = 0;
        int temp_fid;
        for (int i = 0; i < 4; ++i) {
          if (i == 0) {
            temp_fid = parent_b->fontinfo_id();
          } else if (i == 1) {
            temp_fid = parent_b->fontinfo_id2();
          } else if (i == 2) {
            temp_fid = b->fontinfo_id();
          } else {
            temp_fid = b->fontinfo_id2();
          }
          ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
          if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
              parent_b->unichar_id(), unichar_id, &temp_gap)) {
            expected_gap += temp_gap;
            num_addends++;
          }
        }
        if (num_addends > 0) {
          expected_gap /= static_cast<float>(num_addends);
          expected_gap_found = true;
        }
      }
      if (expected_gap_found) {
        int actual_gap = word_res->GetBlobsGap(curr_col - 1);
        if (actual_gap == 0) {
          consistency_info->num_inconsistent_spaces++;
        } else {
          float gap_ratio = expected_gap / actual_gap;
          // TODO(rays) The gaps seem to be way off most of the time, saved by
          // the error here that the ratio was compared to 1/2, when it should
          // have been 0.5f. Find the source of the gaps discrepancy and put
          // the 0.5f here in place of 0.0f.
          // Test on 2476595.sj, pages 0 to 6. (In French.)
          if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
            consistency_info->num_inconsistent_spaces++;
          }
        }
        if (language_model_debug_level > 1) {
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
                  unicharset.id_to_unichar(parent_b->unichar_id()),
                  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                  unichar_id, curr_col, expected_gap, actual_gap);
        }
      }
    }
  }
}

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 785 of file language_model.cpp.

{
  // Initialize active_dawgs from parent_vse if it is not nullptr.
  // Otherwise use very_beginning_active_dawgs_.
  if (parent_vse == nullptr) {
    dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
    dawg_args_.permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == nullptr) return nullptr;  // not a dict word path
    dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
    dawg_args_.permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
                                        b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
  }

  // Deal with compound words.
  if (dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length
    if (parent_vse == nullptr || word_end ||
        dawg_args_.permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length)
      return nullptr;

    int i;
    // Check that the path terminated before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
      const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
      const Dawg *pdawg = pos.dawg_index < 0
          ? nullptr : dict_->GetDawg(pos.dawg_index);
      if (pdawg == nullptr || pos.back_to_punc) continue;
      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
          pdawg->end_of_word(pos.dawg_ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return nullptr;

    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = nullptr;

  // Call LetterIsOkay().
  // Use the normalized IDs so that all shapes of ' can be allowed in words
  // like don't.
  const GenericVector<UNICHAR_ID>& normed_ids =
      dict_->getUnicharset().normed_ids(b.unichar_id());
  DawgPositionVector tmp_active_dawgs;
  for (int i = 0; i < normed_ids.size(); ++i) {
    if (language_model_debug_level > 2)
      tprintf("Test Letter OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
    dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_.permuter == NO_PERM) {
      break;
    } else if (i < normed_ids.size() - 1) {
      tmp_active_dawgs = *dawg_args_.updated_dawgs;
      dawg_args_.active_dawgs = &tmp_active_dawgs;
    }
    if (language_model_debug_level > 2)
      tprintf("Letter was OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
  }
  dawg_args_.active_dawgs = nullptr;
  if (dawg_args_.permuter != NO_PERM) {
    dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
                                          dawg_args_.permuter);
  } else if (language_model_debug_level > 3) {
    tprintf("Letter %s not OK!\n",
            dict_->getUnicharset().id_to_unichar(b.unichar_id()));
  }

  return dawg_info;
}

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 877 of file language_model.cpp.

{
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == nullptr) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
        parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_cost;
  float ngram_and_classifier_cost =
      ComputeNgramCost(unichar, certainty, denom,
                       pcontext_ptr, &unichar_step_len,
                       &pruned, &ngram_cost);
  // Normalize just the ngram_and_classifier_cost by outline_length.
  // The ngram_cost is used by the params_model, so it needs to be left as-is,
  // and the params model cost will be normalized by outline_length.
  ngram_and_classifier_cost *=
      outline_length / language_model_ngram_rating_factor;
  // Add the ngram_cost of the parent.
  if (parent_vse != nullptr) {
    ngram_and_classifier_cost +=
        parent_vse->ngram_info->ngram_and_classifier_cost;
    ngram_cost += parent_vse->ngram_info->ngram_cost;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;

  // Construct and return the new LanguageModelNgramInfo.
  auto *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
      ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry *  new_vse,
const ViterbiStateEntry *  parent_vse,
LanguageModelState *  lms 
)
protected

Definition at line 769 of file language_model.cpp.

{
  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
       new_vse->cost >= vit.data()->cost; vit.forward()) {
    // Clear the appropriate flags if the list already contains
    // a top choice entry with a lower cost.
    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
  }
  if (language_model_debug_level > 2) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            new_vse->top_choice_flags);
  }
}

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE *  bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET &  unicharset,
WERD_RES *  word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType *  top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 500 of file language_model.cpp.

{
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    ViterbiStateEntry* parent_vse = vse_it->data();
    // Only consider the parent if it has been updated or
    // if the current ratings cell has just been classified.
    if (!just_classified && !parent_vse->updated) continue;
    if (language_model_debug_level > 2)
      parent_vse->Print("Considering");
    // If the parent is non-alnum, then upper counts as lower.
    *top_choice_flags = blob_choice_flags;
    if ((blob_choice_flags & kUpperCaseFlag) &&
        !parent_vse->HasAlnumChoice(unicharset)) {
      *top_choice_flags |= kLowerCaseFlag;
    }
    *top_choice_flags &= parent_vse->top_choice_flags;
    UNICHAR_ID unichar_id = bc->unichar_id();
    const BLOB_CHOICE* parent_b = parent_vse->curr_b;
    UNICHAR_ID parent_id = parent_b->unichar_id();
    // Digits do not bind to alphas if there is a mix in both parent and current
    // or if the alpha is not the top choice.
    if (unicharset.get_isdigit(unichar_id) &&
        unicharset.get_isalpha(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Digits don't bind to alphas.
    // Likewise alphas do not bind to digits if there is a mix in both or if
    // the digit is not the top choice.
    if (unicharset.get_isalpha(unichar_id) &&
        unicharset.get_isdigit(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Alphas don't bind to digits.
    // If there is a case mix of the same alpha in the parent list, then
    // competing_vse is non-null and will be used to determine whether
    // or not to bind the current blob choice.
    if (parent_vse->competing_vse != nullptr) {
      const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
      UNICHAR_ID other_id = competing_b->unichar_id();
      if (language_model_debug_level >= 5) {
        tprintf("Parent %s has competition %s\n",
                unicharset.id_to_unichar(parent_id),
                unicharset.id_to_unichar(other_id));
      }
      if (unicharset.SizesDistinct(parent_id, other_id)) {
        // If other_id matches bc wrt position and size, and parent_id doesn't,
        // don't bind to the current parent.
        if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
                                language_model_debug_level >= 5) &&
            !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
                                 language_model_debug_level >= 5))
          continue;  // Competing blobchoice has a better vertical match.
      }
    }
    vse_it->forward();
    return parent_vse;  // This one is good!
  }
  return nullptr;  // Ran out of possibilities.
}

◆ getParamsModel()

ParamsModel & tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 108 of file language_model.h.

{ return params_model_; }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 383 of file language_model.cpp.

{
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = nullptr;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == nullptr) first_unichar = c_it.data();
    if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
        !unicharset.get_islower(unichar_id)) {
      *first_upper = c_it.data();
    }
    if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != nullptr);
  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
      *first_digit != nullptr;
  if (*first_lower == nullptr) *first_lower = first_unichar;
  if (*first_upper == nullptr) *first_upper = first_unichar;
  if (*first_digit == nullptr) *first_digit = first_unichar;
  return mixed;
}

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 136 of file language_model.cpp.

{
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // Initialize vectors with beginning DawgInfos.
  very_beginning_active_dawgs_.clear();
  dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
  beginning_active_dawgs_.clear();
  dict_->default_dawgs(&beginning_active_dawgs_, false);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ = " ";
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }
}

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry &  vse)
inline protected

Definition at line 299 of file language_model.h.

299 {
300 if (vse.top_choice_flags) return false;
301 if (vse.dawg_info != nullptr &&
302 (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
303 vse.dawg_info->permuter == USER_DAWG_PERM ||
304 vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
305 return true;
306 }
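
In other words, a path is protected from pruning if it carries any top-choice flag or was accepted by a system, user, or frequent-word dawg. A sketch of how the predicate gates the beam (cf. the parent loop in UpdateState below; parent_vse and vit_counter are the loop variables used there):

if (PrunablePath(*parent_vse) &&
    (++vit_counter > language_model_viterbi_list_max_num_prunable ||
     (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
  continue;  // beam already holds enough prunable entries
}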

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 104 of file language_model.h.

104 {
105 acceptable_choice_found_ = val;
106 }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState *  parent_node) const
protected

Forces there to be at least one entry among the viterbi_state_entries of parent_node with the top_choice_flag set for each of lower, upper, and digit, using the same rules as GetTopLowerUpperDigit: the flag is set on the first suitable candidate found, whether or not it is already set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 423 of file language_model.cpp.

424 {
425 if (parent_node == nullptr) return -1;
426 UNICHAR_ID top_id = INVALID_UNICHAR_ID;
427 ViterbiStateEntry* top_lower = nullptr;
428 ViterbiStateEntry* top_upper = nullptr;
429 ViterbiStateEntry* top_digit = nullptr;
430 ViterbiStateEntry* top_choice = nullptr;
431 float lower_rating = 0.0f;
432 float upper_rating = 0.0f;
433 float digit_rating = 0.0f;
434 float top_rating = 0.0f;
435 const UNICHARSET &unicharset = dict_->getUnicharset();
436 ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
437 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
438 ViterbiStateEntry* vse = vit.data();
439 // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
440 // back to the real character if needed.
441 ViterbiStateEntry* unichar_vse = vse;
442 UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
443 float rating = unichar_vse->curr_b->rating();
444 while (unichar_id == INVALID_UNICHAR_ID &&
445 unichar_vse->parent_vse != nullptr) {
446 unichar_vse = unichar_vse->parent_vse;
447 unichar_id = unichar_vse->curr_b->unichar_id();
448 rating = unichar_vse->curr_b->rating();
449 }
450 if (unichar_id != INVALID_UNICHAR_ID) {
451 if (unicharset.get_islower(unichar_id)) {
452 if (top_lower == nullptr || lower_rating > rating) {
453 top_lower = vse;
454 lower_rating = rating;
455 }
456 } else if (unicharset.get_isalpha(unichar_id)) {
457 if (top_upper == nullptr || upper_rating > rating) {
458 top_upper = vse;
459 upper_rating = rating;
460 }
461 } else if (unicharset.get_isdigit(unichar_id)) {
462 if (top_digit == nullptr || digit_rating > rating) {
463 top_digit = vse;
464 digit_rating = rating;
465 }
466 }
467 }
468 if (top_choice == nullptr || top_rating > rating) {
469 top_choice = vse;
470 top_rating = rating;
471 top_id = unichar_id;
472 }
473 }
474 if (top_choice == nullptr) return -1;
475 bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
476 top_digit != nullptr;
477 if (top_lower == nullptr) top_lower = top_choice;
478 top_lower->top_choice_flags |= kLowerCaseFlag;
479 if (top_upper == nullptr) top_upper = top_choice;
480 top_upper->top_choice_flags |= kUpperCaseFlag;
481 if (top_digit == nullptr) top_digit = top_choice;
482 top_digit->top_choice_flags |= kDigitFlag;
483 top_choice->top_choice_flags |= kSmallestRatingFlag;
484 if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
485 (top_choice->top_choice_flags &
486 (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
487 // If the compound marker top choice carries any of the top alnum flags,
488 // then give it all of them, allowing words like I-295 to be chosen.
489 top_choice->top_choice_flags |=
490 kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
491 }
492 return mixed ? 1 : 0;
493}
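
The return value is consumed as a tri-state (assumed caller-side handling, as in UpdateState):

int result = SetTopParentLowerUpperDigit(parent_node);
if (result < 0) {
  // No parent entries at all: nothing to extend at this column.
} else if (result > 0) {
  // Parents mix alpha and digit paths, so alnum-mix handling applies.
}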

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry *  vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 1241 of file language_model.cpp.

1246 {
1247 bool truth_path;
1248 WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1249 blamer_bundle, &truth_path);
1250 ASSERT_HOST(word != nullptr);
1251 if (dict_->stopper_debug_level >= 1) {
1252 STRING word_str;
1253 word->string_and_lengths(&word_str, nullptr);
1254 vse->Print(word_str.string());
1255 }
1256 if (language_model_debug_level > 0) {
1257 word->print("UpdateBestChoice() constructed word");
1258 }
1259 // Record features from the current path if necessary.
1260 ParamsTrainingHypothesis curr_hyp;
1261 if (blamer_bundle != nullptr) {
1262 if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
1263 static_cast<PermuterType>(word->permuter());
1264 ExtractFeaturesFromPath(*vse, curr_hyp.features);
1265 word->string_and_lengths(&(curr_hyp.str), nullptr);
1266 curr_hyp.cost = vse->cost; // record cost for error rate computations
1267 if (language_model_debug_level > 0) {
1268 tprintf("Raw features extracted from %s (cost=%g) [ ",
1269 curr_hyp.str.string(), curr_hyp.cost);
1270 for (float feature : curr_hyp.features) {
1271 tprintf("%g ", feature);
1272 }
1273 tprintf("]\n");
1274 }
1275 // Record the current hypothesis in params_training_bundle.
1276 blamer_bundle->AddHypothesis(curr_hyp);
1277 if (truth_path)
1278 blamer_bundle->UpdateBestRating(word->rating());
1279 }
1280 if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1281 // The word was constructed solely for blamer_bundle->AddHypothesis, so
1282 // we no longer need it.
1283 delete word;
1284 return;
1285 }
1286 if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty())
1287 word->SetScriptPositions(false, word_res->chopped_word);
1288 // Update and log new raw_choice if needed.
1289 if (word_res->raw_choice == nullptr ||
1290 word->rating() < word_res->raw_choice->rating()) {
1291 if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1292 tprintf("Updated raw choice\n");
1293 }
1294 // Set the modified rating for best choice to vse->cost and log best choice.
1295 word->set_rating(vse->cost);
1296 // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1297 // computes adjust_factor that is used by the adaption code (e.g. by
1298 // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1299 // Note: the rating of the word is not adjusted.
1300 dict_->adjust_word(word, vse->dawg_info == nullptr,
1301 vse->consistency_info.xht_decision, 0.0,
1302 false, language_model_debug_level > 0);
1303 // Hand ownership of the word over to the word_res.
1304 if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
1305 dict_->stopper_debug_level >= 1, word)) {
1306 // The word was so bad that it was deleted.
1307 return;
1308 }
1309 if (word_res->best_choice == word) {
1310 // Word was the new best.
1311 if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1312 AcceptablePath(*vse)) {
1313 acceptable_choice_found_ = true;
1314 }
1315 // Update best_choice_bundle.
1316 best_choice_bundle->updated = true;
1317 best_choice_bundle->best_vse = vse;
1318 if (language_model_debug_level > 0) {
1319 tprintf("Updated best choice\n");
1320 word->print_state("New state ");
1321 }
1322 // Update hyphen state if we are dealing with a dictionary word.
1323 if (vse->dawg_info != nullptr) {
1324 if (dict_->has_hyphen_end(*word)) {
1325 dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
1326 } else {
1327 dict_->reset_hyphen_vars(true);
1328 }
1329 }
1330
1331 if (blamer_bundle != nullptr) {
1332 blamer_bundle->set_best_choice_is_dict_and_top_choice(
1333 vse->dawg_info != nullptr && vse->top_choice_flags);
1334 }
1335 }
1336 if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1337 word->DisplaySegmentation(word_res->chopped_word);
1338 }
1339}

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState *  parent_node,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices in parent_node with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper-case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple: Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 253 of file language_model.cpp.

261 {
262 if (language_model_debug_level > 0) {
263 tprintf("\nUpdateState: col=%d row=%d %s",
264 curr_col, curr_row, just_classified ? "just_classified" : "");
265 if (language_model_debug_level > 5)
266 tprintf("(parent=%p)\n", parent_node);
267 else
268 tprintf("\n");
269 }
270 // Initialize helper variables.
271 bool word_end = (curr_row+1 >= word_res->ratings->dimension());
272 bool new_changed = false;
273 float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
274 const UNICHARSET& unicharset = dict_->getUnicharset();
275 BLOB_CHOICE *first_lower = nullptr;
276 BLOB_CHOICE *first_upper = nullptr;
277 BLOB_CHOICE *first_digit = nullptr;
278 bool has_alnum_mix = false;
279 if (parent_node != nullptr) {
280 int result = SetTopParentLowerUpperDigit(parent_node);
281 if (result < 0) {
282 if (language_model_debug_level > 0)
283 tprintf("No parents found to process\n");
284 return false;
285 }
286 if (result > 0)
287 has_alnum_mix = true;
288 }
289 if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
290 &first_digit))
291 has_alnum_mix = false;
292 ScanParentsForCaseMix(unicharset, parent_node);
293 if (language_model_debug_level > 3 && parent_node != nullptr) {
294 parent_node->Print("Parent viterbi list");
295 }
296 LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
297
298 // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
299 ViterbiStateEntry_IT vit;
300 BLOB_CHOICE_IT c_it(curr_list);
301 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
302 BLOB_CHOICE* choice = c_it.data();
303 // TODO(antonova): make sure commenting this out is ok for ngram
304 // model scoring (I think this was introduced to fix ngram model quirks).
305 // Skip nullptr unichars unless it is the only choice.
306 //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
307 UNICHAR_ID unichar_id = choice->unichar_id();
308 if (unicharset.get_fragment(unichar_id)) {
309 continue; // Skip fragments.
310 }
311 // Set top choice flags.
312 LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
313 if (c_it.at_first() || !new_changed)
314 blob_choice_flags |= kSmallestRatingFlag;
315 if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
316 if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
317 if (first_digit == choice) blob_choice_flags |= kDigitFlag;
318
319 if (parent_node == nullptr) {
320 // Process the beginning of a word.
321 // If there is a better case variant that is not distinguished by size,
322 // skip this blob choice, as we have no choice but to accept the result
323 // of the character classifier to distinguish between them, even if
324 // followed by an upper case.
325 // With words like iPoc, and other CamelBackWords, the lower-upper
326 // transition can only be achieved if the classifier has the correct case
327 // as the top choice, and leaving an initial I lower down the list
328 // increases the chances of choosing IPoc simply because it doesn't
329 // include such a transition. iPoc will beat iPOC and ipoc because
330 // the other words are baseline/x-height inconsistent.
331 if (HasBetterCaseVariant(unicharset, choice, curr_list))
332 continue;
333 // Upper counts as lower at the beginning of a word.
334 if (blob_choice_flags & kUpperCaseFlag)
335 blob_choice_flags |= kLowerCaseFlag;
336 new_changed |= AddViterbiStateEntry(
337 blob_choice_flags, denom, word_end, curr_col, curr_row,
338 choice, curr_state, nullptr, pain_points,
339 word_res, best_choice_bundle, blamer_bundle);
340 } else {
341 // Get viterbi entries from each parent ViterbiStateEntry.
342 vit.set_to_list(&parent_node->viterbi_state_entries);
343 int vit_counter = 0;
344 vit.mark_cycle_pt();
345 ViterbiStateEntry* parent_vse = nullptr;
346 LanguageModelFlagsType top_choice_flags;
347 while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
348 c_it.data(), blob_choice_flags,
349 unicharset, word_res, &vit,
350 &top_choice_flags)) != nullptr) {
351 // Skip pruned entries and do not look at prunable entries if already
352 // examined language_model_viterbi_list_max_num_prunable of those.
353 if (PrunablePath(*parent_vse) &&
354 (++vit_counter > language_model_viterbi_list_max_num_prunable ||
355 (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
356 continue;
357 }
358 // If the parent has no alnum choice, (ie choice is the first in a
359 // string of alnum), and there is a better case variant that is not
360 // distinguished by size, skip this blob choice/parent, as with the
361 // initial blob treatment above.
362 if (!parent_vse->HasAlnumChoice(unicharset) &&
363 HasBetterCaseVariant(unicharset, choice, curr_list))
364 continue;
365 // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
366 // looks good according to the Dawgs or character ngram model.
367 new_changed |= AddViterbiStateEntry(
368 top_choice_flags, denom, word_end, curr_col, curr_row,
369 c_it.data(), curr_state, parent_vse, pain_points,
370 word_res, best_choice_bundle, blamer_bundle);
371 }
372 }
373 }
374 return new_changed;
375}

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_ = false
protected

Definition at line 416 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 404 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_ = false
protected

Definition at line 418 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 372 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_ = nullptr
protected

Definition at line 383 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_ = false
protected

Definition at line 390 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_ = nullptr
protected

Definition at line 379 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 56 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 54 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 61 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 53 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 55 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 57 of file language_model.h.
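
These five constants form a bitmask over LanguageModelFlagsType (an unsigned char, per lm_state.h). A minimal, self-contained sketch of how such flags combine and test; the literal values mirror the constants documented above:

#include <cstdio>

using LanguageModelFlagsType = unsigned char;  // as in lm_state.h

int main() {
  LanguageModelFlagsType flags = 0x1;  // kSmallestRatingFlag
  flags |= 0x2 | 0x10;                 // kLowerCaseFlag | kXhtConsistentFlag
  std::printf("lower=%d digit=%d\n",
              (flags & 0x2) != 0,      // 1: lower-case flag is set
              (flags & 0x8) != 0);     // 0: digit flag is not set
  return 0;
}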

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 316 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 343 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 330 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 318 of file language_model.h.
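
These public members are registered as ordinary Tesseract parameters, so they can be set by name at runtime. A sketch using the public TessBaseAPI (assumptions: the installed header path, a findable "eng" traineddata, and, for the ngram parameters, a traineddata that actually contains an ngram model; these language-model parameters affect the legacy, non-LSTM recognizer):

#include <tesseract/baseapi.h>

int main() {
  tesseract::TessBaseAPI api;
  if (api.Init(nullptr, "eng") != 0) return 1;
  // Parameter names match the member names documented on this page.
  api.SetVariable("language_model_ngram_on", "1");
  api.SetVariable("language_model_debug_level", "2");
  api.End();
  return 0;
}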

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 320 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "

Definition at line 339 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 336 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 328 of file language_model.h.
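
That is, the raw model probability is clamped from below before it enters any denominator. A one-function sketch of the rule (not the exact call site; the parameter name stands in for the member):

#include <algorithm>

double FlooredNgramProb(double raw_prob, double small_prob_floor) {
  // small_prob_floor corresponds to language_model_ngram_small_prob.
  return std::max(raw_prob, small_prob_floor);
}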

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 341 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 333 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 352 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 356 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 358 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 361 of file language_model.h.
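
The increment is applied once per problem beyond the first when a consistency penalty is computed (cf. ComputeAdjustment among the protected members). A minimal sketch of that schedule, with parameter names standing in for the members:

float PenaltySchedule(int num_problems, float penalty, float penalty_increment) {
  // 0 problems -> no adjustment; 1 problem -> the base penalty;
  // each additional problem adds penalty_increment.
  if (num_problems == 0) return 0.0f;
  if (num_problems == 1) return penalty;
  return penalty + penalty_increment * static_cast<float>(num_problems - 1);
}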

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 348 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 346 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 350 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 354 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 360 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 364 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 323 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 325 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_ = 0.0f
protected

Definition at line 393 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 421 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 400 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_ = 0
protected

Definition at line 401 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_ = 0.0f
protected

Definition at line 374 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 403 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 362 of file language_model.h.


The documentation for this class was generated from the following files:

language_model.h
language_model.cpp