tesseract 4.1.1
Loading...
Searching...
No Matches
permdawg.cpp
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: permdawg.cpp (Formerly permdawg.c)
5 * Description: Scale word choices by a dictionary
6 * Author: Mark Seaman, OCR Technology
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20/*----------------------------------------------------------------------
21 I n c l u d e s
22----------------------------------------------------------------------*/
23
24#include "dawg.h"
25#include "stopper.h"
26#include "tprintf.h"
27#include "params.h"
28
29#include <algorithm>
30#include <cctype>
31#include "dict.h"
32
33/*----------------------------------------------------------------------
34 F u n c t i o n s
35----------------------------------------------------------------------*/
36namespace tesseract {
37
45 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
46 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
47 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
48 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
49 auto *more_args = static_cast<DawgArgs *>(void_more_args);
50 word_ending = (char_choice_index == char_choices.size()-1);
51 int word_index = word->length() - 1;
52 if (best_choice->rating() < *limit) return;
53 // Look up char in DAWG
54
55 // If the current unichar is an ngram first try calling
56 // letter_is_okay() for each unigram it contains separately.
57 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
58 bool checked_unigrams = false;
59 if (getUnicharset().get_isngram(orig_uch_id)) {
60 if (dawg_debug_level) {
61 tprintf("checking unigrams in an ngram %s\n",
62 getUnicharset().debug_str(orig_uch_id).string());
63 }
64 int num_unigrams = 0;
67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68 // Since the string came out of the unicharset, failure is impossible.
69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
70 nullptr));
71 bool unigrams_ok = true;
72 // Construct DawgArgs that reflect the current state.
73 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
74 DawgPositionVector unigram_updated_dawgs;
75 DawgArgs unigram_dawg_args(&unigram_active_dawgs,
76 &unigram_updated_dawgs,
77 more_args->permuter);
78 // Check unigrams in the ngram with letter_is_okay().
79 for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
80 UNICHAR_ID uch_id = encoding[i];
81 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
82 ++num_unigrams;
83 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
84 unigrams_ok = (this->*letter_is_okay_)(
85 &unigram_dawg_args, *word->unicharset(),
86 word->unichar_id(word_index+num_unigrams-1),
87 word_ending && i == encoding.size() - 1);
88 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
89 if (dawg_debug_level) {
90 tprintf("unigram %s is %s\n",
91 getUnicharset().debug_str(uch_id).string(),
92 unigrams_ok ? "OK" : "not OK");
93 }
94 }
95 // Restore the word and copy the updated dawg state if needed.
96 while (num_unigrams-- > 0) word->remove_last_unichar_id();
97 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
98 if (unigrams_ok) {
99 checked_unigrams = true;
100 more_args->permuter = unigram_dawg_args.permuter;
101 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
102 }
103 }
104
105 // Check which dawgs from the dawgs_ vector contain the word
106 // up to and including the current unichar.
107 if (checked_unigrams || (this->*letter_is_okay_)(
108 more_args, *word->unicharset(), word->unichar_id(word_index),
109 word_ending)) {
110 // Add a new word choice
111 if (word_ending) {
112 if (dawg_debug_level) {
113 tprintf("found word = %s\n", word->debug_string().string());
114 }
115 if (strcmp(output_ambig_words_file.string(), "") != 0) {
116 if (output_ambig_words_file_ == nullptr) {
117 output_ambig_words_file_ =
118 fopen(output_ambig_words_file.string(), "wb+");
119 if (output_ambig_words_file_ == nullptr) {
120 tprintf("Failed to open output_ambig_words_file %s\n",
121 output_ambig_words_file.string());
122 exit(1);
123 }
124 STRING word_str;
125 word->string_and_lengths(&word_str, nullptr);
126 word_str += " ";
127 fprintf(output_ambig_words_file_, "%s", word_str.string());
128 }
129 STRING word_str;
130 word->string_and_lengths(&word_str, nullptr);
131 word_str += " ";
132 fprintf(output_ambig_words_file_, "%s", word_str.string());
133 }
134 WERD_CHOICE *adjusted_word = word;
135 adjusted_word->set_permuter(more_args->permuter);
136 update_best_choice(*adjusted_word, best_choice);
137 } else { // search the next letter
138 // Make updated_* point to the next entries in the DawgPositionVector
139 // arrays (that were originally created in dawg_permute_and_select)
140 ++(more_args->updated_dawgs);
141 // Make active_dawgs and constraints point to the updated ones.
142 ++(more_args->active_dawgs);
143 permute_choices(debug, char_choices, char_choice_index + 1,
144 prev_char_frag_info, word, certainties, limit,
145 best_choice, attempts_left, more_args);
146 // Restore previous state to explore another letter in this position.
147 --(more_args->updated_dawgs);
148 --(more_args->active_dawgs);
149 }
150 } else {
151 if (dawg_debug_level) {
152 tprintf("last unichar not OK at index %d in %s\n",
153 word_index, word->debug_string().string());
154 }
155 }
156}
157
158
169 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
170 auto *best_choice = new WERD_CHOICE(&getUnicharset());
171 best_choice->make_bad();
172 best_choice->set_rating(rating_limit);
173 if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
174 return best_choice;
175 auto *active_dawgs =
176 new DawgPositionVector[char_choices.length() + 1];
177 init_active_dawgs(&(active_dawgs[0]), true);
178 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
180
181 float certainties[MAX_WERD_LENGTH];
183 int attempts_left = max_permuter_attempts;
184 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
185 char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
186 &attempts_left, &dawg_args);
187 delete[] active_dawgs;
188 return best_choice;
189}
190
198 const char *debug,
199 const BLOB_CHOICE_LIST_VECTOR &char_choices,
200 int char_choice_index,
201 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
202 WERD_CHOICE *word,
203 float certainties[],
204 float *limit,
205 WERD_CHOICE *best_choice,
206 int *attempts_left,
207 void *more_args) {
208 if (debug) {
209 tprintf("%s permute_choices: char_choice_index=%d"
210 " limit=%g rating=%g, certainty=%g word=%s\n",
211 debug, char_choice_index, *limit, word->rating(),
212 word->certainty(), word->debug_string().string());
213 }
214 if (char_choice_index < char_choices.length()) {
215 BLOB_CHOICE_IT blob_choice_it;
216 blob_choice_it.set_to_list(char_choices.get(char_choice_index));
217 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
218 blob_choice_it.forward()) {
219 (*attempts_left)--;
220 append_choices(debug, char_choices, *(blob_choice_it.data()),
221 char_choice_index, prev_char_frag_info, word,
222 certainties, limit, best_choice, attempts_left, more_args);
223 if (*attempts_left <= 0) {
224 if (debug) tprintf("permute_choices(): attempts_left is 0\n");
225 break;
226 }
227 }
228 }
229}
230
240 const char *debug,
241 const BLOB_CHOICE_LIST_VECTOR &char_choices,
242 const BLOB_CHOICE &blob_choice,
243 int char_choice_index,
244 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
245 WERD_CHOICE *word,
246 float certainties[],
247 float *limit,
248 WERD_CHOICE *best_choice,
249 int *attempts_left,
250 void *more_args) {
251 int word_ending = (char_choice_index == char_choices.length() - 1);
252
253 // Deal with fragments.
254 CHAR_FRAGMENT_INFO char_frag_info;
255 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
256 blob_choice.certainty(), prev_char_frag_info, debug,
257 word_ending, &char_frag_info)) {
258 return; // blob_choice must be an invalid fragment
259 }
260 // Search the next letter if this character is a fragment.
261 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
262 permute_choices(debug, char_choices, char_choice_index + 1,
263 &char_frag_info, word, certainties, limit,
264 best_choice, attempts_left, more_args);
265 return;
266 }
267
268 // Add the next unichar.
269 float old_rating = word->rating();
270 float old_certainty = word->certainty();
271 uint8_t old_permuter = word->permuter();
272 certainties[word->length()] = char_frag_info.certainty;
274 char_frag_info.unichar_id, char_frag_info.num_fragments,
275 char_frag_info.rating, char_frag_info.certainty);
276
277 // Explore the next unichar.
278 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
279 &char_frag_info, word_ending, word, certainties,
280 limit, best_choice, attempts_left, more_args);
281
282 // Remove the unichar we added to explore other choices in it's place.
284 word->set_rating(old_rating);
285 word->set_certainty(old_certainty);
286 word->set_permuter(old_permuter);
287}
288
315 float curr_rating, float curr_certainty,
316 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
317 const char *debug, int word_ending,
318 CHAR_FRAGMENT_INFO *char_frag_info) {
319 const CHAR_FRAGMENT *this_fragment =
320 getUnicharset().get_fragment(curr_unichar_id);
321 const CHAR_FRAGMENT *prev_fragment =
322 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
323
324 // Print debug info for fragments.
325 if (debug && (prev_fragment || this_fragment)) {
326 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
327 getUnicharset().debug_str(curr_unichar_id).string(),
328 word_ending);
329 if (prev_fragment) {
330 tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
331 }
332 if (this_fragment) {
333 tprintf("this_fragment %s\n", this_fragment->to_string().string());
334 }
335 }
336
337 char_frag_info->unichar_id = curr_unichar_id;
338 char_frag_info->fragment = this_fragment;
339 char_frag_info->rating = curr_rating;
340 char_frag_info->certainty = curr_certainty;
341 char_frag_info->num_fragments = 1;
342 if (prev_fragment && !this_fragment) {
343 if (debug) tprintf("Skip choice with incomplete fragment\n");
344 return false;
345 }
346 if (this_fragment) {
347 // We are dealing with a fragment.
348 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
349 if (prev_fragment) {
350 if (!this_fragment->is_continuation_of(prev_fragment)) {
351 if (debug) tprintf("Non-matching fragment piece\n");
352 return false;
353 }
354 if (this_fragment->is_ending()) {
355 char_frag_info->unichar_id =
356 getUnicharset().unichar_to_id(this_fragment->get_unichar());
357 char_frag_info->fragment = nullptr;
358 if (debug) {
359 tprintf("Built character %s from fragments\n",
360 getUnicharset().debug_str(
361 char_frag_info->unichar_id).string());
362 }
363 } else {
364 if (debug) tprintf("Record fragment continuation\n");
365 char_frag_info->fragment = this_fragment;
366 }
367 // Update certainty and rating.
368 char_frag_info->rating =
369 prev_char_frag_info->rating + curr_rating;
370 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
371 char_frag_info->certainty =
372 std::min(curr_certainty, prev_char_frag_info->certainty);
373 } else {
374 if (this_fragment->is_beginning()) {
375 if (debug) tprintf("Record fragment beginning\n");
376 } else {
377 if (debug) {
378 tprintf("Non-starting fragment piece with no prev_fragment\n");
379 }
380 return false;
381 }
382 }
383 }
384 if (word_ending && char_frag_info->fragment) {
385 if (debug) tprintf("Word can not end with a fragment\n");
386 return false;
387 }
388 return true;
389}
390
391} // namespace tesseract
@ NO_PERM
Definition: ratngs.h:233
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
#define MAX_WERD_LENGTH
Definition: dict.h:39
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:86
T & get(int index) const
float certainty() const
Definition: ratngs.h:83
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void set_certainty(float new_val)
Definition: ratngs.h:362
const STRING debug_string() const
Definition: ratngs.h:495
void remove_last_unichar_id()
Definition: ratngs.h:473
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
void set_rating(float new_val)
Definition: ratngs.h:359
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
float rating() const
Definition: ratngs.h:317
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
bool is_beginning() const
Definition: unicharset.h:105
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:70
bool is_ending() const
Definition: unicharset.h:108
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:526
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const CHAR_FRAGMENT * fragment
Definition: dict.h:45
UNICHAR_ID unichar_id
Definition: dict.h:44
int num_fragments
Definition: dict.h:46
float rating
Definition: dict.h:47
float certainty
Definition: dict.h:48
DawgPositionVector * updated_dawgs
Definition: dict.h:85
DawgPositionVector * active_dawgs
Definition: dict.h:84
PermuterType permuter
Definition: dict.h:86
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
int dawg_debug_level
Definition: dict.h:622
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:239
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
int max_permuter_attempts
Definition: dict.h:658
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:168
char * output_ambig_words_file
Definition: dict.h:620
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:44
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:314