///////////////////////////////////////////////////////////////////////
// File:        language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help recognition.
// Author:      Daria Antonova
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "language_model.h"
#include <cassert>                    // for assert
#include <cmath>                      // for log2, pow
#include "blamer.h"                   // for BlamerBundle
#include "ccutil.h"                   // for CCUtil
#include "dawg.h"                     // for NO_EDGE, Dawg, Dawg::kPatternUn...
#include "errcode.h"                  // for ASSERT_HOST
#include "lm_state.h"                 // for ViterbiStateEntry, ViterbiState...
#include "matrix.h"                   // for MATRIX_COORD
#include "pageres.h"                  // for WERD_RES
#include "params.h"                   // for IntParam, BoolParam, DoubleParam
#include "params_training_featdef.h"  // for ParamsTrainingHypothesis, PTRAI...
#include "tprintf.h"                  // for tprintf
#include "unichar.h"                  // for UNICHAR_ID, INVALID_UNICHAR_ID
#include "unicharset.h"               // for UNICHARSET
#include "unicity_table.h"            // for UnicityTable

template <typename T> class GenericVector;
template <typename T> class UnicityTable;

namespace tesseract {

class LMPainPoints;
struct FontInfo;

#if defined(ANDROID)
static inline double log2(double n) {
  return log(n) / log(2.0);
}
#endif  // ANDROID

const float LanguageModel::kMaxAvgNgramCost = 25.0f;

LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
                             Dict *dict)
    : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
                 dict->getCCUtil()->params()),
      BOOL_INIT_MEMBER(language_model_ngram_on, false,
                       "Turn on/off the use of character ngram model",
                       dict->getCCUtil()->params()),
      INT_MEMBER(language_model_ngram_order, 8,
                 "Maximum order of the character ngram model",
                 dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
                 "Maximum number of prunable (those for which"
                 " PrunablePath() is true) entries in each viterbi list"
                 " recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      INT_MEMBER(language_model_viterbi_list_max_size, 500,
                 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_small_prob, 0.000001,
                    "To avoid overly small denominators use this as the "
                    "floor of the probability returned by the ngram model.",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                    "Average classifier score of a non-matching unichar.",
                    dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_scale_factor, 0.03,
                    "Strength of the character ngram model relative to the"
                    " character classifier ",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_ngram_rating_factor, 16.0,
                    "Factor to bring log-probs into the same range as ratings"
                    " when multiplied by outline length ",
                    dict->getCCUtil()->params()),
      BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                  "Words are delimited by space", dict->getCCUtil()->params()),
      INT_MEMBER(language_model_min_compound_length, 3,
                 "Minimum length of compound words",
                 dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                    "Penalty for words not in the frequent word dictionary",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                    "Penalty for non-dictionary words",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_punc, 0.2,
                    "Penalty for inconsistent punctuation",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_case, 0.1,
                    "Penalty for inconsistent case",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_script, 0.5,
                    "Penalty for inconsistent script",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_chartype, 0.3,
                    "Penalty for inconsistent character type",
                    dict->getCCUtil()->params()),
      // TODO(daria, rays): enable font consistency checking
      // after improving font analysis.
      double_MEMBER(language_model_penalty_font, 0.00,
                    "Penalty for inconsistent font",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_spacing, 0.05,
                    "Penalty for inconsistent spacing",
                    dict->getCCUtil()->params()),
      double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
                    dict->getCCUtil()->params()),
      INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
                 dict->getCCUtil()->params()),
      BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                       "Use sigmoidal score for certainty",
                       dict->getCCUtil()->params()),
      dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
      fontinfo_table_(fontinfo_table),
      dict_(dict) {
  ASSERT_HOST(dict_ != nullptr);
}

LanguageModel::~LanguageModel() { delete dawg_args_.updated_dawgs; }

void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
                                bool fixed_pitch, float max_char_wh_ratio,
                                float rating_cert_scale) {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // Initialize vectors with beginning DawgInfos.
  very_beginning_active_dawgs_.clear();
  dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
  beginning_active_dawgs_.clear();
  dict_->default_dawgs(&beginning_active_dawgs_, false);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ = " ";
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }
}

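// Scans the parent viterbi list and, for each upper- or lower-case entry,
// records a pointer to the entry holding the other-case version of the same
// unichar (if present) in competing_vse, so that the case pair can later be
// compared on position and size.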
static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
                                  LanguageModelState* parent_node) {
  if (parent_node == nullptr) return;
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry* vse = vit.data();
    vse->competing_vse = nullptr;
    UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
    if (unicharset.get_isupper(unichar_id) ||
        unicharset.get_islower(unichar_id)) {
      UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
      if (other_case == unichar_id) continue;  // Not in unicharset.
      // Find other case in same list. There could be multiple entries with
      // the same unichar_id, but in theory, they should all point to the
      // same BLOB_CHOICE, and that is what we will be using to decide
      // which to keep.
      ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
      for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
           vit2.data()->curr_b->unichar_id() != other_case;
           vit2.forward()) {}
      if (!vit2.cycled_list()) {
        vse->competing_vse = vit2.data();
      }
    }
  }
}

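// Returns true if choices contains a better-rated choice for the other case
// of choice's unichar, and the two cases cannot be told apart by position
// and size, so the classifier's higher-ranked case should win.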
static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
                                 const BLOB_CHOICE* choice,
                                 BLOB_CHOICE_LIST* choices) {
  UNICHAR_ID choice_id = choice->unichar_id();
  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
    return false;  // Not upper or lower or not in unicharset.
  if (unicharset.SizesDistinct(choice_id, other_case))
    return false;  // Can be separated by size.
  BLOB_CHOICE_IT bc_it(choices);
  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
    BLOB_CHOICE* better_choice = bc_it.data();
    if (better_choice->unichar_id() == other_case)
      return true;  // Found an earlier instance of other_case.
    else if (better_choice == choice)
      return false;  // Reached the original choice.
  }
  return false;  // Should never happen, but just in case.
}

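// UpdateState is the main entry point of the language model: it combines
// each parent ViterbiStateEntry in parent_node with each BLOB_CHOICE in
// curr_list and calls AddViterbiStateEntry() for every parent+child pair
// worth keeping. Returns true if any new entry was added to the beam at
// curr_row.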
bool LanguageModel::UpdateState(
    bool just_classified,
    int curr_col, int curr_row,
    BLOB_CHOICE_LIST *curr_list,
    LanguageModelState *parent_node,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d %s",
            curr_col, curr_row, just_classified ? "just_classified" : "");
    if (language_model_debug_level > 5)
      tprintf("(parent=%p)\n", parent_node);
    else
      tprintf("\n");
  }
  // Initialize helper variables.
  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
  bool new_changed = false;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
  const UNICHARSET& unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_lower = nullptr;
  BLOB_CHOICE *first_upper = nullptr;
  BLOB_CHOICE *first_digit = nullptr;
  bool has_alnum_mix = false;
  if (parent_node != nullptr) {
    int result = SetTopParentLowerUpperDigit(parent_node);
    if (result < 0) {
      if (language_model_debug_level > 0)
        tprintf("No parents found to process\n");
      return false;
    }
    if (result > 0)
      has_alnum_mix = true;
  }
  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
                             &first_digit))
    has_alnum_mix = false;
  ScanParentsForCaseMix(unicharset, parent_node);
  if (language_model_debug_level > 3 && parent_node != nullptr) {
    parent_node->Print("Parent viterbi list");
  }
  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];

  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    BLOB_CHOICE* choice = c_it.data();
    // TODO(antonova): make sure commenting this out is ok for ngram
    // model scoring (I think this was introduced to fix ngram model quirks).
    // Skip nullptr unichars unless it is the only choice.
    //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    UNICHAR_ID unichar_id = choice->unichar_id();
    if (unicharset.get_fragment(unichar_id)) {
      continue;  // Skip fragments.
    }
    // Set top choice flags.
    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
    if (c_it.at_first() || !new_changed)
      blob_choice_flags |= kSmallestRatingFlag;
    if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
    if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
    if (first_digit == choice) blob_choice_flags |= kDigitFlag;

    if (parent_node == nullptr) {
      // Process the beginning of a word.
      // If there is a better case variant that is not distinguished by size,
      // skip this blob choice, as we have no choice but to accept the result
      // of the character classifier to distinguish between them, even if
      // followed by an upper case.
      // With words like iPoc, and other CamelBackWords, the lower-upper
      // transition can only be achieved if the classifier has the correct case
      // as the top choice, and leaving an initial I lower down the list
      // increases the chances of choosing IPoc simply because it doesn't
      // include such a transition. iPoc will beat iPOC and ipoc because
      // the other words are baseline/x-height inconsistent.
      if (HasBetterCaseVariant(unicharset, choice, curr_list))
        continue;
      // Upper counts as lower at the beginning of a word.
      if (blob_choice_flags & kUpperCaseFlag)
        blob_choice_flags |= kLowerCaseFlag;
      new_changed |= AddViterbiStateEntry(
          blob_choice_flags, denom, word_end, curr_col, curr_row,
          choice, curr_state, nullptr, pain_points,
          word_res, best_choice_bundle, blamer_bundle);
    } else {
      // Get viterbi entries from each parent ViterbiStateEntry.
      vit.set_to_list(&parent_node->viterbi_state_entries);
      int vit_counter = 0;
      vit.mark_cycle_pt();
      ViterbiStateEntry* parent_vse = nullptr;
      LanguageModelFlagsType top_choice_flags;
      while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
                                            c_it.data(), blob_choice_flags,
                                            unicharset, word_res, &vit,
                                            &top_choice_flags)) != nullptr) {
        // Skip pruned entries and do not look at prunable entries if already
        // examined language_model_viterbi_list_max_num_prunable of those.
        if (PrunablePath(*parent_vse) &&
            (++vit_counter > language_model_viterbi_list_max_num_prunable ||
             (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
          continue;
        }
        // If the parent has no alnum choice, (i.e. choice is the first in a
        // string of alnum), and there is a better case variant that is not
        // distinguished by size, skip this blob choice/parent, as with the
        // initial blob treatment above.
        if (!parent_vse->HasAlnumChoice(unicharset) &&
            HasBetterCaseVariant(unicharset, choice, curr_list))
          continue;
        // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
        // looks good according to the Dawgs or character ngram model.
        new_changed |= AddViterbiStateEntry(
            top_choice_flags, denom, word_end, curr_col, curr_row,
            c_it.data(), curr_state, parent_vse, pain_points,
            word_res, best_choice_bundle, blamer_bundle);
      }
    }
  }
  return new_changed;
}

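// Finds the first lower-case, first upper-case and first digit choice in
// curr_list (falling back to the first unichar overall for any that are
// missing) and returns true if the list mixes alphas with digits.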
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
                                          BLOB_CHOICE **first_lower,
                                          BLOB_CHOICE **first_upper,
                                          BLOB_CHOICE **first_digit) const {
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = nullptr;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == nullptr) first_unichar = c_it.data();
    if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
        !unicharset.get_islower(unichar_id)) {
      *first_upper = c_it.data();
    }
    if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != nullptr);
  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
      *first_digit != nullptr;
  if (*first_lower == nullptr) *first_lower = first_unichar;
  if (*first_upper == nullptr) *first_upper = first_unichar;
  if (*first_digit == nullptr) *first_digit = first_unichar;
  return mixed;
}

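// Marks the best (lowest-rating) lower-case, upper-case and digit entries
// in the parent viterbi list with the corresponding top-choice flags.
// Returns -1 if there are no parents to process, 1 if the parents mix
// alphas with digits, and 0 otherwise.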
int LanguageModel::SetTopParentLowerUpperDigit(
    LanguageModelState *parent_node) const {
  if (parent_node == nullptr) return -1;
  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
  ViterbiStateEntry* top_lower = nullptr;
  ViterbiStateEntry* top_upper = nullptr;
  ViterbiStateEntry* top_digit = nullptr;
  ViterbiStateEntry* top_choice = nullptr;
  float lower_rating = 0.0f;
  float upper_rating = 0.0f;
  float digit_rating = 0.0f;
  float top_rating = 0.0f;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry* vse = vit.data();
    // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
    // back to the real character if needed.
    ViterbiStateEntry* unichar_vse = vse;
    UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
    float rating = unichar_vse->curr_b->rating();
    while (unichar_id == INVALID_UNICHAR_ID &&
           unichar_vse->parent_vse != nullptr) {
      unichar_vse = unichar_vse->parent_vse;
      unichar_id = unichar_vse->curr_b->unichar_id();
      rating = unichar_vse->curr_b->rating();
    }
    if (unichar_id != INVALID_UNICHAR_ID) {
      if (unicharset.get_islower(unichar_id)) {
        if (top_lower == nullptr || lower_rating > rating) {
          top_lower = vse;
          lower_rating = rating;
        }
      } else if (unicharset.get_isalpha(unichar_id)) {
        if (top_upper == nullptr || upper_rating > rating) {
          top_upper = vse;
          upper_rating = rating;
        }
      } else if (unicharset.get_isdigit(unichar_id)) {
        if (top_digit == nullptr || digit_rating > rating) {
          top_digit = vse;
          digit_rating = rating;
        }
      }
    }
    if (top_choice == nullptr || top_rating > rating) {
      top_choice = vse;
      top_rating = rating;
      top_id = unichar_id;
    }
  }
  if (top_choice == nullptr) return -1;
  bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
      top_digit != nullptr;
  if (top_lower == nullptr) top_lower = top_choice;
  top_lower->top_choice_flags |= kLowerCaseFlag;
  if (top_upper == nullptr) top_upper = top_choice;
  top_upper->top_choice_flags |= kUpperCaseFlag;
  if (top_digit == nullptr) top_digit = top_choice;
  top_digit->top_choice_flags |= kDigitFlag;
  top_choice->top_choice_flags |= kSmallestRatingFlag;
  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
      (top_choice->top_choice_flags &
       (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
    // If the compound marker top choice carries any of the top alnum flags,
    // then give it all of them, allowing words like I-295 to be chosen.
    top_choice->top_choice_flags |=
        (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag);
  }
  return mixed ? 1 : 0;
}

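// Returns the next parent ViterbiStateEntry in *vse_it that is worth
// considering for the blob choice bc, skipping parents that would create
// implausible digit/alpha or case transitions, and computes the top choice
// flags the combined path would carry. Returns nullptr when the list of
// parents is exhausted.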
ViterbiStateEntry* LanguageModel::GetNextParentVSE(
    bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
    LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
    WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
    LanguageModelFlagsType* top_choice_flags) const {
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    ViterbiStateEntry* parent_vse = vse_it->data();
    // Only consider the parent if it has been updated or
    // if the current ratings cell has just been classified.
    if (!just_classified && !parent_vse->updated) continue;
    if (language_model_debug_level > 2)
      parent_vse->Print("Considering");
    // If the parent is non-alnum, then upper counts as lower.
    *top_choice_flags = blob_choice_flags;
    if ((blob_choice_flags & kUpperCaseFlag) &&
        !parent_vse->HasAlnumChoice(unicharset)) {
      *top_choice_flags |= kLowerCaseFlag;
    }
    *top_choice_flags &= parent_vse->top_choice_flags;
    UNICHAR_ID unichar_id = bc->unichar_id();
    const BLOB_CHOICE* parent_b = parent_vse->curr_b;
    UNICHAR_ID parent_id = parent_b->unichar_id();
    // Digits do not bind to alphas if there is a mix in both parent and
    // current or if the alpha is not the top choice.
    if (unicharset.get_isdigit(unichar_id) &&
        unicharset.get_isalpha(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Digits don't bind to alphas.
    // Likewise alphas do not bind to digits if there is a mix in both or if
    // the digit is not the top choice.
    if (unicharset.get_isalpha(unichar_id) &&
        unicharset.get_isdigit(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Alphas don't bind to digits.
    // If there is a case mix of the same alpha in the parent list, then
    // competing_vse is non-null and will be used to determine whether
    // or not to bind the current blob choice.
    if (parent_vse->competing_vse != nullptr) {
      const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
      UNICHAR_ID other_id = competing_b->unichar_id();
      if (language_model_debug_level >= 5) {
        tprintf("Parent %s has competition %s\n",
                unicharset.id_to_unichar(parent_id),
                unicharset.id_to_unichar(other_id));
      }
      if (unicharset.SizesDistinct(parent_id, other_id)) {
        // If other_id matches bc wrt position and size, and parent_id
        // doesn't, don't bind to the current parent.
        if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
                                language_model_debug_level >= 5) &&
            !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
                                 language_model_debug_level >= 5)) {
          continue;  // Competing blobchoice has a better vertical match.
        }
      }
    }
    vse_it->forward();
    return parent_vse;  // This one is good!
  }
  return nullptr;  // Ran out of possibilities.
}

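// Creates a ViterbiStateEntry for the given parent entry and blob choice,
// scores it with the dawg, ngram, consistency and association components,
// and adds it to curr_state if the language model likes it (or if it is a
// top choice). Returns true if an entry was added.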
bool LanguageModel::AddViterbiStateEntry(
    LanguageModelFlagsType top_choice_flags,
    float denom,
    bool word_end,
    int curr_col, int curr_row,
    BLOB_CHOICE *b,
    LanguageModelState *curr_state,
    ViterbiStateEntry *parent_vse,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 1) {
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags);
    if (language_model_debug_level > 5)
      tprintf(" parent_vse=%p\n", parent_vse);
    else
      tprintf("\n");
  }
  ASSERT_HOST(curr_state != nullptr);
  // Check whether the list is full.
  if (curr_state->viterbi_state_entries_length >=
      language_model_viterbi_list_max_size) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return false;
  }

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
      GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);

  float outline_length =
      AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = nullptr;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, outline_length, parent_vse);
    ASSERT_HOST(ngram_info != nullptr);
  }
  bool liked_by_language_model = dawg_info != nullptr ||
      (ngram_info != nullptr && !ngram_info->pruned);
  // Quick escape if not liked by the language model, can't be consistent
  // xheight, and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components very early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LMConsistencyInfo consistency_info(
      parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
  // Start with just the x-height consistency, as it provides significant
  // pruning opportunity.
  consistency_info.ComputeXheightConsistency(
      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
  // Turn off xheight consistent flag if not consistent.
  if (consistency_info.InconsistentXHeight()) {
    top_choice_flags &= ~kXhtConsistentFlag;
  }

  // Quick escape if not liked by the language model, not consistent xheight,
  // and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Compute the rest of the consistency info.
  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
                      word_res, &consistency_info);
  if (dawg_info != nullptr && consistency_info.invalid_punc) {
    consistency_info.invalid_punc = false;  // do not penalize dict words
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, word_res, &associate_stats);
  if (parent_vse != nullptr) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
  auto *new_vse = new ViterbiStateEntry(
      parent_vse, b, 0.0, outline_length,
      consistency_info, associate_stats, top_choice_flags, dawg_info,
      ngram_info, (language_model_debug_level > 0) ?
          dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);
  if (language_model_debug_level >= 3)
    tprintf("Adjusted cost = %g\n", new_vse->cost);

  // Invoke Top Choice language model component to make the final adjustments
  // to new_vse->top_choice_flags.
  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
    GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
  }

  // If language model components did not like this unichar - return.
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
  if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
      consistency_info.inconsistent_script) {       // with inconsistent script
    keep = false;
  }
  if (!keep) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components did not like this entry\n");
    }
    delete new_vse;
    return false;
  }

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(*new_vse) &&
      (curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              new_vse->cost,
              curr_state->viterbi_state_entries_prunable_max_cost);
    }
    delete new_vse;
    return false;
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(new_vse, pain_points, word_res,
                     best_choice_bundle, blamer_bundle);
    // Discard the entry if UpdateBestChoice() found flaws in it.
    if (new_vse->cost >= WERD_CHOICE::kBadRating &&
        new_vse != best_choice_bundle->best_vse) {
      if (language_model_debug_level > 1) {
        tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      }
      delete new_vse;
      return false;
    }
  }

  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                               false, new_vse);
  curr_state->viterbi_state_entries_length++;
  if (PrunablePath(*new_vse)) {
    curr_state->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with cost higher than new_vse->cost.
  if ((curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) ||
      new_vse->top_choice_flags) {
    ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(curr_state->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have cost higher than new_entry->cost
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          curr_vse->cost > new_vse->cost) {
        curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
      }
      if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
      // Update curr_state->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                  curr_state->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    new_vse->Print("New");
    if (language_model_debug_level > 5)
      curr_state->Print("Updated viterbi list");
  }

  return true;
}

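// Clears from new_vse->top_choice_flags any flags already carried by a
// cheaper entry in the given viterbi list.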
void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
                                          const ViterbiStateEntry *parent_vse,
                                          LanguageModelState *lms) {
  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
       new_vse->cost >= vit.data()->cost; vit.forward()) {
    // Clear the appropriate flags if the list already contains
    // a top choice entry with a lower cost.
    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
  }
  if (language_model_debug_level > 2) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            new_vse->top_choice_flags);
  }
}

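// Runs the dawg component of the language model: advances the active dawgs
// of the parent entry (or the beginning-of-word dawgs) by the unichar in b,
// handling hyphenated and compound words. Returns a new
// LanguageModelDawgInfo if the resulting path is a valid dictionary prefix,
// nullptr otherwise.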
LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    bool word_end,
    int curr_col, int curr_row,
    const BLOB_CHOICE &b,
    const ViterbiStateEntry *parent_vse) {
  // Initialize active_dawgs from parent_vse if it is not nullptr.
  // Otherwise use very_beginning_active_dawgs_.
  if (parent_vse == nullptr) {
    dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
    dawg_args_.permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == nullptr) return nullptr;  // not a dict word path
    dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
    dawg_args_.permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
                                        b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
  }

  // Deal with compound words.
  if (dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length.
    if (parent_vse == nullptr || word_end ||
        dawg_args_.permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length)
      return nullptr;

    int i;
    // Check that the path terminated before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
      const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
      const Dawg *pdawg = pos.dawg_index < 0
          ? nullptr : dict_->GetDawg(pos.dawg_index);
      if (pdawg == nullptr || pos.back_to_punc) continue;
      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
          pdawg->end_of_word(pos.dawg_ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return nullptr;

    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = nullptr;

  // Call LetterIsOkay().
  // Use the normalized IDs so that all shapes of ' can be allowed in words
  // like don't.
  const GenericVector<UNICHAR_ID>& normed_ids =
      dict_->getUnicharset().normed_ids(b.unichar_id());
  DawgPositionVector tmp_active_dawgs;
  for (int i = 0; i < normed_ids.size(); ++i) {
    if (language_model_debug_level > 2)
      tprintf("Test Letter OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
    dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_.permuter == NO_PERM) {
      break;
    } else if (i < normed_ids.size() - 1) {
      tmp_active_dawgs = *dawg_args_.updated_dawgs;
      dawg_args_.active_dawgs = &tmp_active_dawgs;
    }
    if (language_model_debug_level > 2)
      tprintf("Letter was OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
  }
  dawg_args_.active_dawgs = nullptr;
  if (dawg_args_.permuter != NO_PERM) {
    dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
                                          dawg_args_.permuter);
  } else if (language_model_debug_level > 3) {
    tprintf("Letter %s not OK!\n",
            dict_->getUnicharset().id_to_unichar(b.unichar_id()));
  }

  return dawg_info;
}

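// Runs the character ngram component of the language model: computes the
// probability of unichar given the context accumulated along the parent
// path and returns a new LanguageModelNgramInfo with the updated context
// and costs.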
LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row, float outline_length,
    const ViterbiStateEntry *parent_vse) {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == nullptr) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
        parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_cost;
  float ngram_and_classifier_cost =
      ComputeNgramCost(unichar, certainty, denom,
                       pcontext_ptr, &unichar_step_len,
                       &pruned, &ngram_cost);
  // Normalize just the ngram_and_classifier_cost by outline_length.
  // The ngram_cost is used by the params_model, so it needs to be left as-is,
  // and the params model cost will be normalized by outline_length.
  ngram_and_classifier_cost *=
      outline_length / language_model_ngram_rating_factor;
  // Add the ngram_cost of the parent.
  if (parent_vse != nullptr) {
    ngram_and_classifier_cost +=
        parent_vse->ngram_info->ngram_and_classifier_cost;
    ngram_cost += parent_vse->ngram_info->ngram_cost;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;

  // Construct and return the new LanguageModelNgramInfo.
  auto *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
      ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

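// Computes -log2 p(unichar | context) averaged over the UTF8 steps of
// unichar, combined with the normalized classifier certainty. Sets
// *found_small_prob if the ngram probability had to be clipped to
// language_model_ngram_small_prob.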
float LanguageModel::ComputeNgramCost(const char *unichar,
                                      float certainty,
                                      float denom,
                                      const char *context,
                                      int *unichar_step_len,
                                      bool *found_small_prob,
                                      float *ngram_cost) {
  const char *context_ptr = context;
  char *modified_context = nullptr;
  char *modified_context_end = nullptr;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from str,
    // unless use_only_first_uft8_step is true.
    if (unichar_ptr < unichar_end) {
      if (modified_context == nullptr) {
        size_t context_len = strlen(context);
        modified_context =
            new char[context_len + strlen(unichar_ptr) + step + 1];
        memcpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
    prob = language_model_ngram_small_prob;
  }
  *ngram_cost = -1.0 * log2(prob);
  float ngram_and_classifier_cost =
      -1.0 * log2(CertaintyScore(certainty) / denom) +
      *ngram_cost * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty) / denom, prob,
            ngram_and_classifier_cost);
  }
  delete[] modified_context;
  return ngram_and_classifier_cost;
}

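// Computes the normalization denominator for the ngram model: the sum of
// CertaintyScore() over the choices in curr_list plus a crude estimate for
// the unicharset entries that were not classified.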
float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != nullptr);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
      CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}

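// Fills consistency_info with punctuation, case, script, character type,
// font and spacing consistency measurements for appending b to the path
// ending at parent_vse.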
void LanguageModel::FillConsistencyInfo(
    int curr_col,
    bool word_end,
    BLOB_CHOICE *b,
    ViterbiStateEntry *parent_vse,
    WERD_RES *word_res,
    LMConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      bool is_apos = dict_->is_apostrophe(unichar_id);
      bool prev_is_numalpha = (parent_b != nullptr &&
          (unicharset.get_isalpha(parent_b->unichar_id()) ||
           unicharset.get_isdigit(parent_b->unichar_id())));
      UNICHAR_ID pattern_unichar_id =
          (unicharset.get_isalpha(unichar_id) ||
           unicharset.get_isdigit(unichar_id) ||
           (is_apos && prev_is_numalpha)) ?
          Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
            (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
                node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  }
  else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != nullptr &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont2 %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
      bool expected_gap_found = false;
      float expected_gap = 0.0f;
      int temp_gap;
      if (fontinfo_id >= 0) {  // found a common font
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
        if (fontinfo_table_->get(fontinfo_id).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap = temp_gap;
          expected_gap_found = true;
        }
      } else {
        consistency_info->inconsistent_font = true;
        // Get an average of the expected gaps in each font
        int num_addends = 0;
        int temp_fid;
        for (int i = 0; i < 4; ++i) {
          if (i == 0) {
            temp_fid = parent_b->fontinfo_id();
          } else if (i == 1) {
            temp_fid = parent_b->fontinfo_id2();
          } else if (i == 2) {
            temp_fid = b->fontinfo_id();
          } else {
            temp_fid = b->fontinfo_id2();
          }
          ASSERT_HOST(temp_fid < 0 || temp_fid < fontinfo_table_->size());
          if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
              parent_b->unichar_id(), unichar_id, &temp_gap)) {
            expected_gap += temp_gap;
            num_addends++;
          }
        }
        if (num_addends > 0) {
          expected_gap /= static_cast<float>(num_addends);
          expected_gap_found = true;
        }
      }
      if (expected_gap_found) {
        int actual_gap = word_res->GetBlobsGap(curr_col-1);
        if (actual_gap == 0) {
          consistency_info->num_inconsistent_spaces++;
        } else {
          float gap_ratio = expected_gap / actual_gap;
          // TODO(rays) The gaps seem to be way off most of the time, saved by
          // the error here that the ratio was compared to 1/2, when it should
          // have been 0.5f. Find the source of the gaps discrepancy and put
          // the 0.5f here in place of 0.0f.
          // Test on 2476595.sj, pages 0 to 6. (In French.)
          if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
            consistency_info->num_inconsistent_spaces++;
          }
        }
        if (language_model_debug_level > 1) {
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
                  unicharset.id_to_unichar(parent_b->unichar_id()),
                  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                  unichar_id, curr_col, expected_gap, actual_gap);
        }
      }
    }
  }
}

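// Returns the adjusted cost of the path in vse: the params_model cost if a
// trained model is loaded, otherwise the ngram cost or ratings sum scaled
// by dictionary, shape and consistency penalties.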
float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
  ASSERT_HOST(vse != nullptr);
  if (params_model_.Initialized()) {
    float features[PTRAIN_NUM_FEATURE_TYPES];
    ExtractFeaturesFromPath(*vse, features);
    float cost = params_model_.ComputeCost(features);
    if (language_model_debug_level > 3) {
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      if (language_model_debug_level >= 5) {
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
        }
      }
    }
    return cost * vse->outline_length;
  } else {
    float adjustment = 1.0f;
    if (vse->dawg_info == nullptr ||
        vse->dawg_info->permuter != FREQ_DAWG_PERM) {
      adjustment += language_model_penalty_non_freq_dict_word;
    }
    if (vse->dawg_info == nullptr) {
      adjustment += language_model_penalty_non_dict_word;
      if (vse->length > language_model_min_compound_length) {
        adjustment += ((vse->length - language_model_min_compound_length) *
                       language_model_penalty_increment);
      }
    }
    if (vse->associate_stats.shape_cost > 0) {
      adjustment += vse->associate_stats.shape_cost /
          static_cast<float>(vse->length);
    }
    if (language_model_ngram_on) {
      ASSERT_HOST(vse->ngram_info != nullptr);
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
    } else {
      adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                                 vse->consistency_info);
      return vse->ratings_sum * adjustment;
    }
  }
}

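// Constructs a WERD_CHOICE for the word ending at vse, records blamer
// hypotheses if needed, and installs it as the new raw/best choice of
// word_res when it beats the current one, updating hyphen state and
// best_choice_bundle accordingly.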
void LanguageModel::UpdateBestChoice(
    ViterbiStateEntry *vse,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  bool truth_path;
  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
                                    blamer_bundle, &truth_path);
  ASSERT_HOST(word != nullptr);
  if (dict_->stopper_debug_level >= 1) {
    STRING word_str;
    word->string_and_lengths(&word_str, nullptr);
    vse->Print(word_str.string());
  }
  if (language_model_debug_level > 0) {
    word->print("UpdateBestChoice() constructed word");
  }
  // Record features from the current path if necessary.
  ParamsTrainingHypothesis curr_hyp;
  if (blamer_bundle != nullptr) {
    if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
        static_cast<PermuterType>(word->permuter());
    ExtractFeaturesFromPath(*vse, curr_hyp.features);
    word->string_and_lengths(&(curr_hyp.str), nullptr);
    curr_hyp.cost = vse->cost;  // record cost for error rate computations
    if (language_model_debug_level > 0) {
      tprintf("Raw features extracted from %s (cost=%g) [ ",
              curr_hyp.str.string(), curr_hyp.cost);
      for (float feature : curr_hyp.features) {
        tprintf("%g ", feature);
      }
      tprintf("]\n");
    }
    // Record the current hypothesis in params_training_bundle.
    blamer_bundle->AddHypothesis(curr_hyp);
    if (truth_path)
      blamer_bundle->UpdateBestRating(word->rating());
  }
  if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
    // The word was constructed solely for blamer_bundle->AddHypothesis, so
    // we no longer need it.
    delete word;
    return;
  }
  if (word_res->chopped_word != nullptr &&
      !word_res->chopped_word->blobs.empty())
    word->SetScriptPositions(false, word_res->chopped_word);
  // Update and log new raw_choice if needed.
  if (word_res->raw_choice == nullptr ||
      word->rating() < word_res->raw_choice->rating()) {
    if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
      tprintf("Updated raw choice\n");
  }
  // Set the modified rating for best choice to vse->cost and log best choice.
  word->set_rating(vse->cost);
  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
  // computes adjust_factor that is used by the adaption code (e.g. by
  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
  // Note: the rating of the word is not adjusted.
  dict_->adjust_word(word, vse->dawg_info == nullptr,
                     vse->consistency_info.xht_decision, 0.0,
                     false, language_model_debug_level > 0);
  // Hand ownership of the word over to the word_res.
  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
                                    dict_->stopper_debug_level >= 1, word)) {
    // The word was so bad that it was deleted.
    return;
  }
  if (word_res->best_choice == word) {
    // Word was the new best.
    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
        AcceptablePath(*vse)) {
      acceptable_choice_found_ = true;
    }
    // Update best_choice_bundle.
    best_choice_bundle->updated = true;
    best_choice_bundle->best_vse = vse;
    if (language_model_debug_level > 0) {
      tprintf("Updated best choice\n");
      word->print_state("New state ");
    }
    // Update hyphen state if we are dealing with a dictionary word.
    if (vse->dawg_info != nullptr) {
      if (dict_->has_hyphen_end(*word)) {
        dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
      } else {
        dict_->reset_hyphen_vars(true);
      }
    }

    if (blamer_bundle != nullptr) {
      blamer_bundle->set_best_choice_is_dict_and_top_choice(
          vse->dawg_info != nullptr && vse->top_choice_flags);
    }
  }
  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
    word->DisplaySegmentation(word_res->chopped_word);
  }
}

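// Extracts from vse the features used by the params_model: dictionary match
// type bucketed by word length, shape cost, ngram cost, consistency
// counters and the rating per unit of outline length.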
void LanguageModel::ExtractFeaturesFromPath(
    const ViterbiStateEntry &vse, float features[]) {
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
  // Record dictionary match info.
  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
      vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  if (vse.dawg_info != nullptr) {
    int permuter = vse.dawg_info->permuter;
    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
      if (vse.consistency_info.num_digits == vse.length) {
        features[PTRAIN_DIGITS_SHORT+len] = 1.0;
      } else {
        features[PTRAIN_NUM_SHORT+len] = 1.0;
      }
    } else if (permuter == DOC_DAWG_PERM) {
      features[PTRAIN_DOC_SHORT+len] = 1.0;
    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
               permuter == COMPOUND_PERM) {
      features[PTRAIN_DICT_SHORT+len] = 1.0;
    } else if (permuter == FREQ_DAWG_PERM) {
      features[PTRAIN_FREQ_SHORT+len] = 1.0;
    }
  }
  // Record shape cost feature (normalized by path length).
  features[PTRAIN_SHAPE_COST_PER_CHAR] =
      vse.associate_stats.shape_cost / static_cast<float>(vse.length);
  // Record ngram cost (normalized by the path length).
  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
  if (vse.ngram_info != nullptr) {
    features[PTRAIN_NGRAM_COST_PER_CHAR] =
        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
  }
  // Record consistency-related features.
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
      vse.consistency_info.NumInconsistentChartype() : 0.0;
  features[PTRAIN_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;

  // Classifier-related features.
  features[PTRAIN_RATING_PER_CHAR] =
      vse.ratings_sum / static_cast<float>(vse.outline_length);
}

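// Constructs a WERD_CHOICE by tracing the parent pointers from vse back to
// the beginning of the word, updating the width-to-height ratio variance
// along the way; sets *truth_path to whether the path matches the blamer's
// correct segmentation.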
WERD_CHOICE *LanguageModel::ConstructWord(
    ViterbiStateEntry *vse,
    WERD_RES *word_res,
    DANGERR *fixpt,
    BlamerBundle *blamer_bundle,
    bool *truth_path) {
  if (truth_path != nullptr) {
    *truth_path =
        (blamer_bundle != nullptr &&
         vse->length == blamer_bundle->correct_segmentation_length());
  }
  BLOB_CHOICE *curr_b = vse->curr_b;
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }

  // Construct a WERD_CHOICE by tracing parent pointers.
  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length-1); i >= 0; --i) {
    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
      *truth_path = false;
    }
    // The number of blobs used for this choice is row - col + 1.
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
      }
    }

    // Mark the word as compound if compound permuter was set for any of
    // the unichars on the path (usually this will happen for unichars
    // that are compounding operators, like "-" and "/").
    if (!compound && curr_vse->dawg_info &&
        curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;

    // Update curr_* pointers.
    curr_vse = curr_vse->parent_vse;
    if (curr_vse == nullptr) break;
    curr_b = curr_vse->curr_b;
  }
  ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
  // Re-adjust shape cost to include the updated width-to-height variance.
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }

  word->set_rating(vse->ratings_sum);
  word->set_certainty(vse->min_certainty);
  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                      vse->consistency_info.BodyMaxXHeight());
  if (vse->dawg_info != nullptr) {
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
    word->set_permuter(NGRAM_PERM);
  } else if (vse->top_choice_flags) {
    word->set_permuter(TOP_CHOICE_PERM);
  } else {
    word->set_permuter(NO_PERM);
  }
  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                            word_res->ratings));
  return word;
}

}  // namespace tesseract
PermuterType
Definition: ratngs.h:232
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
@ USER_PATTERN_PERM
Definition: ratngs.h:240
@ DOC_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NGRAM_PERM
Definition: ratngs.h:238
@ NUMBER_PERM
Definition: ratngs.h:239
@ COMPOUND_PERM
Definition: ratngs.h:245
@ NO_PERM
Definition: ratngs.h:233
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ mixed
Definition: cluster.h:44
int64_t NODE_REF
Definition: dawg.h:52
@ DAWG_TYPE_WORD
Definition: dawg.h:70
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:145
void UpdateBestRating(float rating)
Definition: blamer.h:136
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:514
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:169
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:149
int correct_segmentation_length() const
Definition: blamer.h:140
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int dimension() const
Definition: matrix.h:536
const UNICHARSET * uch_set
Definition: pageres.h:203
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:740
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
GenericVector< int > blob_widths
Definition: pageres.h:216
WERD_CHOICE * best_choice
Definition: pageres.h:241
float x_height
Definition: pageres.h:316
WERD_CHOICE * raw_choice
Definition: pageres.h:246
TWERD * chopped_word
Definition: pageres.h:212
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620
MATRIX * ratings
Definition: pageres.h:237
float features[PTRAIN_NUM_FEATURE_TYPES]
float certainty() const
Definition: ratngs.h:83
int16_t fontinfo_id2() const
Definition: ratngs.h:89
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:117
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:156
int16_t fontinfo_id() const
Definition: ratngs.h:86
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:554
const STRING & unichar_string() const
Definition: ratngs.h:531
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
void print_state(const char *msg) const
Definition: ratngs.cpp:756
uint8_t permuter() const
Definition: ratngs.h:336
void set_rating(float new_val)
Definition: ratngs.h:359
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:765
static const float kBadRating
Definition: ratngs.h:265
void print() const
Definition: ratngs.h:570
float rating() const
Definition: ratngs.h:317
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
int hiragana_sid() const
Definition: unicharset.h:890
int katakana_sid() const
Definition: unicharset.h:891
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:486
int han_sid() const
Definition: unicharset.h:889
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int null_sid() const
Definition: unicharset.h:884
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
int size() const
Definition: unicharset.h:341
int common_sid() const
Definition: unicharset.h:885
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
virtual bool end_of_word(EDGE_REF edge_ref) const =0
DawgType type() const
Definition: dawg.h:124
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge out of this node that corresponds to the given letter.
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
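The Dawg virtuals above support a simple word lookup; the sketch below is illustrative, assumes node 0 is the root, and also uses Dawg::next_node(), a virtual declared in dawg.h but not shown in this excerpt.
#include "dawg.h"
// Hedged sketch: test whether a sequence of UNICHAR_IDs spells a word
// in a Dawg by following one labeled edge per character.
static bool IsWordInDawg(const tesseract::Dawg *dawg,
                         const UNICHAR_ID *ids, int length) {
  NODE_REF node = 0;  // assumed root node
  for (int i = 0; i < length; ++i) {
    const bool word_end = (i + 1 == length);
    EDGE_REF edge = dawg->edge_char_of(node, ids[i], word_end);
    if (edge == NO_EDGE) return false;  // no matching outgoing edge
    if (word_end) return dawg->end_of_word(edge);
    node = dawg->next_node(edge);
  }
  return false;  // empty input
}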
EDGE_REF dawg_ref
Definition: dawg.h:367
DawgPositionVector * updated_dawgs
Definition: dict.h:85
DawgPositionVector * active_dawgs
Definition: dict.h:84
PermuterType permuter
Definition: dict.h:86
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
int tessedit_truncate_wordchoice_log
Definition: dict.h:642
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls the letter_is_okay_ member function.
Definition: dict.h:376
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:152
const Dawg * GetDawg(int index) const
Return the i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:432
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:42
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:113
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
int stopper_debug_level
Definition: dict.h:638
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:124
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:434
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls the probability_in_context_ member function.
Definition: dict.h:390
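A hedged sketch of calling this wrapper; the Dict pointer, the helper name, and the context/character strings are all illustrative.
#include "dict.h"
// Query the character ngram model for P("d" | "worl") under whatever
// model the Dict has loaded; byte counts are passed explicitly.
static double NextCharProb(tesseract::Dict *dict) {
  return dict->ProbabilityInContext("worl", /*context_bytes=*/4,
                                    "d", /*character_bytes=*/1);
}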
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:144
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:45
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
DawgPositionVector beginning_active_dawgs_
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kXhtConsistentFlag
static const LanguageModelFlagsType kSmallestRatingFlag
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
static const LanguageModelFlagsType kDigitFlag
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
bool AcceptablePath(const ViterbiStateEntry &vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
static const LanguageModelFlagsType kLowerCaseFlag
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double language_model_ngram_nonmatch_score
double language_model_penalty_non_dict_word
bool language_model_ngram_use_only_first_uft8_step
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
static const float kMaxAvgNgramCost
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
bool language_model_ngram_space_delimited_language
DawgPositionVector very_beginning_active_dawgs_
static const LanguageModelFlagsType kUpperCaseFlag
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
float CertaintyScore(float cert)
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
const UnicityTable< FontInfo > * fontinfo_table_
double language_model_penalty_non_freq_dict_word
int language_model_viterbi_list_max_num_prunable
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
XHeightConsistencyEnum xht_decision
DawgPositionVector active_dawgs
Definition: lm_state.h:64
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:86
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:84
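A minimal sketch of how the two cost fields above relate, directly mirroring the formulas in the comments; p_classifier, p_ngram, and scale_factor are illustrative stand-ins for the path probabilities and language_model_ngram_scale_factor.
#include <cmath>
// Negative log-likelihood of a path, mixing classifier and ngram
// model scores exactly as the comment above describes.
static float CombinedPathCost(double p_classifier, double p_ngram,
                              double scale_factor) {
  const double classifier_cost = -std::log(p_classifier);
  const double ngram_cost = -std::log(p_ngram);  // the ngram_cost field
  return static_cast<float>(classifier_cost +
                            scale_factor * ngram_cost);
}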
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:166
float outline_length
length of the outline so far
Definition: lm_state.h:186
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:158
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:188
ViterbiStateEntry * competing_vse
Definition: lm_state.h:162
int length
number of characters on the path
Definition: lm_state.h:185
void Print(const char *msg) const
Definition: lm_state.cpp:27
ViterbiStateEntry * parent_vse
Definition: lm_state.h:159
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:170
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:192
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:132
float ratings_sum
sum of the ratings of the characters on the path
Definition: lm_state.h:182
bool updated
set to true if the entry has just been created/updated
Definition: lm_state.h:194
LMConsistencyInfo consistency_info
path consistency info
Definition: lm_state.h:187
float min_certainty
minimum certainty on the path
Definition: lm_state.h:183
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:147
Struct to store information maintained by various language model components.
Definition: lm_state.h:200
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:216
void Print(const char *msg)
Definition: lm_state.cpp:70
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:218
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:215
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:213
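A speculative reading of how the prunable-path counters above could gate new entries; this helper is purely illustrative and not tesseract's own pruning logic.
// Illustrative only: a new prunable Viterbi entry is still worth
// considering while the list is under the configured maximum, or when
// its cost beats the recorded maximum cost of the prunable entries.
static bool PrunableEntryWorthAdding(int prunable_length,
                                     float prunable_max_cost,
                                     int max_num_prunable,
                                     float new_cost) {
  return prunable_length < max_num_prunable ||
         new_cost < prunable_max_cost;
}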
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:222
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:234
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:240
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:232
PointerVector< LanguageModelState > beam
Definition: lm_state.h:238
float ComputeCost(const float features[]) const