tesseract 4.1.1
Loading...
Searching...
No Matches
dict.cpp
Go to the documentation of this file.
1
2// File: dict.cpp
3// Description: dict class.
4// Author: Samuel Charron
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include <cstdio>
20
21#include "dict.h"
22#include "unicodes.h"
23
24#include "tprintf.h"
25
26namespace tesseract {
27
28class Image;
29
31 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
32 probability_in_context_(&tesseract::Dict::def_probability_in_context),
33 params_model_classify_(nullptr),
34 ccutil_(ccutil),
35 wildcard_unichar_id_(INVALID_UNICHAR_ID),
36 apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37 question_unichar_id_(INVALID_UNICHAR_ID),
38 slash_unichar_id_(INVALID_UNICHAR_ID),
39 hyphen_unichar_id_(INVALID_UNICHAR_ID),
40 STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41 getCCUtil()->params()),
42 STRING_INIT_MEMBER(user_words_suffix, "",
43 "A suffix of user-provided words located in tessdata.",
44 getCCUtil()->params()),
45 STRING_MEMBER(user_patterns_file, "",
46 "A filename of user-provided patterns.",
47 getCCUtil()->params()),
48 STRING_INIT_MEMBER(user_patterns_suffix, "",
49 "A suffix of user-provided patterns located in "
50 "tessdata.",
51 getCCUtil()->params()),
52 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53 getCCUtil()->params()),
54 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55 getCCUtil()->params()),
56 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57 getCCUtil()->params()),
58 BOOL_INIT_MEMBER(load_punc_dawg, true,
59 "Load dawg with punctuation"
60 " patterns.",
61 getCCUtil()->params()),
62 BOOL_INIT_MEMBER(load_number_dawg, true,
63 "Load dawg with number"
64 " patterns.",
65 getCCUtil()->params()),
66 BOOL_INIT_MEMBER(load_bigram_dawg, true,
67 "Load dawg with special word "
68 "bigrams.",
69 getCCUtil()->params()),
70 double_MEMBER(xheight_penalty_subscripts, 0.125,
71 "Score penalty (0.1 = 10%) added if there are subscripts "
72 "or superscripts in a word, but it is otherwise OK.",
73 getCCUtil()->params()),
74 double_MEMBER(xheight_penalty_inconsistent, 0.25,
75 "Score penalty (0.1 = 10%) added if an xheight is "
76 "inconsistent.",
77 getCCUtil()->params()),
78 double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
79 "Score multiplier for word matches which have good case and"
80 " are frequent in the given language (lower is better).",
81 getCCUtil()->params()),
82 double_MEMBER(segment_penalty_dict_case_ok, 1.1,
83 "Score multiplier for word matches that have good case "
84 "(lower is better).",
85 getCCUtil()->params()),
86 double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
87 "Default score multiplier for word matches, which may have "
88 "case issues (lower is better).",
89 getCCUtil()->params()),
90 double_MEMBER(segment_penalty_dict_nonword, 1.25,
91 "Score multiplier for glyph fragment segmentations which "
92 "do not match a dictionary word (lower is better).",
93 getCCUtil()->params()),
94 double_MEMBER(segment_penalty_garbage, 1.50,
95 "Score multiplier for poorly cased strings that are not in"
96 " the dictionary and generally look like garbage (lower is"
97 " better).",
98 getCCUtil()->params()),
99 STRING_MEMBER(output_ambig_words_file, "",
100 "Output file for ambiguities found in the dictionary",
101 getCCUtil()->params()),
102 INT_MEMBER(dawg_debug_level, 0,
103 "Set to 1 for general debug info"
104 ", to 2 for more details, to 3 to see all the debug messages",
105 getCCUtil()->params()),
106 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107 getCCUtil()->params()),
108 BOOL_MEMBER(use_only_first_uft8_step, false,
109 "Use only the first UTF8 step of the given string"
110 " when computing log probabilities.",
111 getCCUtil()->params()),
112 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
113 getCCUtil()->params()),
114 double_MEMBER(stopper_nondict_certainty_base, -2.50,
115 "Certainty threshold for non-dict words",
116 getCCUtil()->params()),
117 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
118 "Reject certainty offset", getCCUtil()->params()),
119 INT_MEMBER(stopper_smallword_size, 2,
120 "Size of dict word to be treated as non-dict word",
121 getCCUtil()->params()),
122 double_MEMBER(stopper_certainty_per_char, -0.50,
123 "Certainty to add"
124 " for each dict char above small word size.",
125 getCCUtil()->params()),
126 double_MEMBER(stopper_allowable_character_badness, 3.0,
127 "Max certaintly variation allowed in a word (in sigma)",
128 getCCUtil()->params()),
129 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
130 getCCUtil()->params()),
131 BOOL_MEMBER(stopper_no_acceptable_choices, false,
132 "Make AcceptableChoice() always return false. Useful"
133 " when there is a need to explore all segmentations",
134 getCCUtil()->params()),
135 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
136 "Max words to keep in list", getCCUtil()->params()),
137 STRING_MEMBER(word_to_debug, "",
138 "Word for which stopper debug"
139 " information should be printed to stdout",
140 getCCUtil()->params()),
141 BOOL_MEMBER(segment_nonalphabetic_script, false,
142 "Don't use any alphabetic-specific tricks."
143 " Set to true in the traineddata config file for"
144 " scripts that are cursive or inherently fixed-pitch",
145 getCCUtil()->params()),
146 BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
147 getCCUtil()->params()),
148 double_MEMBER(doc_dict_pending_threshold, 0.0,
149 "Worst certainty for using pending dictionary",
150 getCCUtil()->params()),
151 double_MEMBER(doc_dict_certainty_threshold, -2.25,
152 "Worst certainty for words that can be inserted into the"
153 " document dictionary",
154 getCCUtil()->params()),
155 INT_MEMBER(max_permuter_attempts, 10000,
156 "Maximum number of different"
157 " character choices to consider during permutation."
158 " This limit is especially useful when user patterns"
159 " are specified, since overly generic patterns can result in"
160 " dawg search exploring an overly large number of options.",
161 getCCUtil()->params()) {
162 reject_offset_ = 0.0;
163 go_deeper_fxn_ = nullptr;
164 hyphen_word_ = nullptr;
165 last_word_on_line_ = false;
166 document_words_ = nullptr;
167 dawg_cache_ = nullptr;
168 dawg_cache_is_ours_ = false;
169 pending_words_ = nullptr;
170 bigram_dawg_ = nullptr;
171 freq_dawg_ = nullptr;
172 punc_dawg_ = nullptr;
173 unambig_dawg_ = nullptr;
174 wordseg_rating_adjust_factor_ = -1.0f;
175 output_ambig_words_file_ = nullptr;
176}
177
179 End();
180 delete hyphen_word_;
181 if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
182}
183
185 // This global cache (a singleton) will outlive every Tesseract instance
186 // (even those that someone else might declare as global statics).
187 static DawgCache cache;
188 return &cache;
189}
190
191// Sets up ready for a Load or LoadLSTM.
192void Dict::SetupForLoad(DawgCache* dawg_cache) {
193 if (dawgs_.length() != 0) this->End();
194
195 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
196 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
197 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
198 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
199
200 if (dawg_cache != nullptr) {
201 dawg_cache_ = dawg_cache;
202 dawg_cache_is_ours_ = false;
203 } else {
204 dawg_cache_ = new DawgCache();
205 dawg_cache_is_ours_ = true;
206 }
207}
208
209// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
210void Dict::Load(const STRING& lang, TessdataManager* data_file) {
211 // Load dawgs_.
212 if (load_punc_dawg) {
213 punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
214 dawg_debug_level, data_file);
215 if (punc_dawg_) dawgs_ += punc_dawg_;
216 }
217 if (load_system_dawg) {
218 Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
219 lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
220 if (system_dawg) dawgs_ += system_dawg;
221 }
222 if (load_number_dawg) {
223 Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
224 lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
225 if (number_dawg) dawgs_ += number_dawg;
226 }
227 if (load_bigram_dawg) {
228 bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
229 dawg_debug_level, data_file);
230 // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
231 // dawgs_!!
232 }
233 if (load_freq_dawg) {
234 freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
235 dawg_debug_level, data_file);
236 if (freq_dawg_) dawgs_ += freq_dawg_;
237 }
238 if (load_unambig_dawg) {
239 unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
240 dawg_debug_level, data_file);
241 if (unambig_dawg_) dawgs_ += unambig_dawg_;
242 }
243
244 STRING name;
245 if (!user_words_suffix.empty() || !user_words_file.empty()) {
246 Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
248 if (!user_words_file.empty()) {
249 name = user_words_file;
250 } else {
252 name += user_words_suffix;
253 }
254 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
256 tprintf("Error: failed to load %s\n", name.string());
257 delete trie_ptr;
258 } else {
259 dawgs_ += trie_ptr;
260 }
261 }
262
263 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264 Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
266 trie_ptr->initialize_patterns(&(getUnicharset()));
267 if (!user_patterns_file.empty()) {
268 name = user_patterns_file;
269 } else {
271 name += user_patterns_suffix;
272 }
273 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
274 tprintf("Error: failed to load %s\n", name.string());
275 delete trie_ptr;
276 } else {
277 dawgs_ += trie_ptr;
278 }
279 }
280
281 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
283 dawgs_ += document_words_;
284
285 // This dawg is temporary and should not be searched by letter_is_ok.
286 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
288}
289
290// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
291void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
292 // Load dawgs_.
293 if (load_punc_dawg) {
294 punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
295 dawg_debug_level, data_file);
296 if (punc_dawg_) dawgs_ += punc_dawg_;
297 }
298 if (load_system_dawg) {
299 Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
301 if (system_dawg) dawgs_ += system_dawg;
302 }
303 if (load_number_dawg) {
304 Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
306 if (number_dawg) dawgs_ += number_dawg;
307 }
308
309 // stolen from Dict::Load (but needs params_ from Tesseract
310 // langdata/config/api):
311 STRING name;
312 if (!user_words_suffix.empty() || !user_words_file.empty()) {
313 Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
315 if (!user_words_file.empty()) {
316 name = user_words_file;
317 } else {
319 name += user_words_suffix;
320 }
321 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
323 tprintf("Error: failed to load %s\n", name.string());
324 delete trie_ptr;
325 } else {
326 dawgs_ += trie_ptr;
327 }
328 }
329
330 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
331 Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
333 trie_ptr->initialize_patterns(&(getUnicharset()));
334 if (!user_patterns_file.empty()) {
335 name = user_patterns_file;
336 } else {
338 name += user_patterns_suffix;
339 }
340 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
341 tprintf("Error: failed to load %s\n", name.string());
342 delete trie_ptr;
343 } else {
344 dawgs_ += trie_ptr;
345 }
346 }
347}
348
349// Completes the loading process after Load() and/or LoadLSTM().
350// Returns false if no dictionaries were loaded.
352 if (dawgs_.empty()) return false;
353 // Construct a list of corresponding successors for each dawg. Each entry, i,
354 // in the successors_ vector is a vector of integers that represent the
355 // indices into the dawgs_ vector of the successors for dawg i.
356 successors_.reserve(dawgs_.length());
357 for (int i = 0; i < dawgs_.length(); ++i) {
358 const Dawg* dawg = dawgs_[i];
359 auto* lst = new SuccessorList();
360 for (int j = 0; j < dawgs_.length(); ++j) {
361 const Dawg* other = dawgs_[j];
362 if (dawg != nullptr && other != nullptr &&
363 (dawg->lang() == other->lang()) &&
364 kDawgSuccessors[dawg->type()][other->type()])
365 *lst += j;
366 }
367 successors_ += lst;
368 }
369 return true;
370}
371
372void Dict::End() {
373 if (dawgs_.length() == 0) return; // Not safe to call twice.
374 for (int i = 0; i < dawgs_.size(); i++) {
375 if (!dawg_cache_->FreeDawg(dawgs_[i])) {
376 delete dawgs_[i];
377 }
378 }
379 dawg_cache_->FreeDawg(bigram_dawg_);
380 if (dawg_cache_is_ours_) {
381 delete dawg_cache_;
382 dawg_cache_ = nullptr;
383 }
384 successors_.delete_data_pointers();
385 dawgs_.clear();
386 successors_.clear();
387 document_words_ = nullptr;
388 delete pending_words_;
389 pending_words_ = nullptr;
390}
391
392// Returns true if in light of the current state unichar_id is allowed
393// according to at least one of the dawgs in the dawgs_ vector.
394// See more extensive comments in dict.h where this function is declared.
395int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
396 UNICHAR_ID unichar_id, bool word_end) const {
397 auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
398
399 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
400
401 if (dawg_debug_level >= 3) {
402 tprintf(
403 "def_letter_is_okay: current unichar=%s word_end=%d"
404 " num active dawgs=%d\n",
405 getUnicharset().debug_str(unichar_id).string(), word_end,
406 dawg_args->active_dawgs->length());
407 }
408
409 // Do not accept words that contain kPatternUnicharID.
410 // (otherwise pattern dawgs would not function correctly).
411 // Do not accept words containing INVALID_UNICHAR_IDs.
412 if (unichar_id == Dawg::kPatternUnicharID ||
413 unichar_id == INVALID_UNICHAR_ID) {
414 dawg_args->permuter = NO_PERM;
415 return NO_PERM;
416 }
417
418 // Initialization.
419 PermuterType curr_perm = NO_PERM;
420 dawg_args->updated_dawgs->clear();
421 dawg_args->valid_end = false;
422
423 // Go over the active_dawgs vector and insert DawgPosition records
424 // with the updated ref (an edge with the corresponding unichar id) into
425 // dawg_args->updated_pos.
426 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
427 const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428 const Dawg* punc_dawg =
429 pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
430 const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
431
432 if (!dawg && !punc_dawg) {
433 // shouldn't happen.
434 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
435 continue;
436 }
437 if (!dawg) {
438 // We're in the punctuation dawg. A core dawg has not been chosen.
439 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
440 EDGE_REF punc_transition_edge =
441 punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
442 if (punc_transition_edge != NO_EDGE) {
443 // Find all successors, and see which can transition.
444 const SuccessorList& slist = *(successors_[pos.punc_index]);
445 for (int s = 0; s < slist.length(); ++s) {
446 int sdawg_index = slist[s];
447 const Dawg* sdawg = dawgs_[sdawg_index];
448 UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
449 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
450 if (dawg_edge != NO_EDGE) {
451 if (dawg_debug_level >= 3) {
452 tprintf("Letter found in dawg %d\n", sdawg_index);
453 }
454 dawg_args->updated_dawgs->add_unique(
455 DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
456 punc_transition_edge, false),
458 "Append transition from punc dawg to current dawgs: ");
459 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
460 if (sdawg->end_of_word(dawg_edge) &&
461 punc_dawg->end_of_word(punc_transition_edge))
462 dawg_args->valid_end = true;
463 }
464 }
465 }
466 EDGE_REF punc_edge =
467 punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
468 if (punc_edge != NO_EDGE) {
469 if (dawg_debug_level >= 3) {
470 tprintf("Letter found in punctuation dawg\n");
471 }
472 dawg_args->updated_dawgs->add_unique(
473 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
474 dawg_debug_level > 0, "Extend punctuation dawg: ");
475 if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
476 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
477 }
478 continue;
479 }
480
481 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
482 // We can end the main word here.
483 // If we can continue on the punc ref, add that possibility.
484 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
485 EDGE_REF punc_edge =
486 punc_node == NO_EDGE
487 ? NO_EDGE
488 : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
489 if (punc_edge != NO_EDGE) {
490 dawg_args->updated_dawgs->add_unique(
492 punc_edge, true),
493 dawg_debug_level > 0, "Return to punctuation dawg: ");
494 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
495 if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
496 }
497 }
498
499 if (pos.back_to_punc) continue;
500
501 // If we are dealing with the pattern dawg, look up all the
502 // possible edges, not only for the exact unichar_id, but also
503 // for all its character classes (alpha, digit, etc).
504 if (dawg->type() == DAWG_TYPE_PATTERN) {
505 ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
506 &curr_perm);
507 // There can't be any successors to dawg that is of type
508 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
509 continue;
510 }
511
512 // Find the edge out of the node for the unichar_id.
513 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
514 EDGE_REF edge =
515 (node == NO_EDGE)
516 ? NO_EDGE
517 : dawg->edge_char_of(
518 node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
519
520 if (dawg_debug_level >= 3) {
521 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
522 pos.dawg_index, node, edge);
523 }
524
525 if (edge != NO_EDGE) { // the unichar was found in the current dawg
526 if (dawg_debug_level >= 3) {
527 tprintf("Letter found in dawg %d\n", pos.dawg_index);
528 }
529 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
530 if (dawg_debug_level >= 3) {
531 tprintf("Punctuation constraint not satisfied at end of word.\n");
532 }
533 continue;
534 }
535 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
536 if (dawg->end_of_word(edge) &&
537 (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
538 dawg_args->valid_end = true;
539 dawg_args->updated_dawgs->add_unique(
540 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
541 false),
543 "Append current dawg to updated active dawgs: ");
544 }
545 } // end for
546 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
547 // or if we found the current letter in a non-punctuation dawg. This
548 // allows preserving information on which dawg the "core" word came from.
549 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
550 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
551 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
552 dawg_args->permuter = curr_perm;
553 }
554 if (dawg_debug_level >= 2) {
555 tprintf("Returning %d for permuter code for this character.\n",
556 dawg_args->permuter);
557 }
558 return dawg_args->permuter;
559}
560
561void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
562 UNICHAR_ID unichar_id, bool word_end,
563 DawgArgs* dawg_args,
564 PermuterType* curr_perm) const {
565 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
566 // Try to find the edge corresponding to the exact unichar_id and to all the
567 // edges corresponding to the character class of unichar_id.
568 GenericVector<UNICHAR_ID> unichar_id_patterns;
569 unichar_id_patterns.push_back(unichar_id);
570 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
571 &unichar_id_patterns);
572 for (int i = 0; i < unichar_id_patterns.size(); ++i) {
573 // On the first iteration check all the outgoing edges.
574 // On the second iteration check all self-loops.
575 for (int k = 0; k < 2; ++k) {
576 EDGE_REF edge =
577 (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
578 : dawg->pattern_loop_edge(pos.dawg_ref,
579 unichar_id_patterns[i], word_end);
580 if (edge == NO_EDGE) continue;
581 if (dawg_debug_level >= 3) {
582 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
583 pos.dawg_index, node, edge);
584 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
585 }
586 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
587 if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
588 dawg_args->updated_dawgs->add_unique(
589 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
590 pos.back_to_punc),
592 "Append current dawg to updated active dawgs: ");
593 }
594 }
595}
596
597// Fill the given active_dawgs vector with dawgs that could contain the
598// beginning of the word. If hyphenated() returns true, copy the entries
599// from hyphen_active_dawgs_ instead.
601 bool ambigs_mode) const {
602 int i;
603 if (hyphenated()) {
604 *active_dawgs = hyphen_active_dawgs_;
605 if (dawg_debug_level >= 3) {
606 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
607 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
608 hyphen_active_dawgs_[i].dawg_index,
609 hyphen_active_dawgs_[i].dawg_ref);
610 }
611 }
612 } else {
613 default_dawgs(active_dawgs, ambigs_mode);
614 }
615}
616
618 bool suppress_patterns) const {
619 bool punc_dawg_available =
620 (punc_dawg_ != nullptr) &&
621 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
622
623 for (int i = 0; i < dawgs_.length(); i++) {
624 if (dawgs_[i] != nullptr &&
625 !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
626 int dawg_ty = dawgs_[i]->type();
627 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
628 if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
629 *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
630 if (dawg_debug_level >= 3) {
631 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
632 NO_EDGE);
633 }
634 } else if (!punc_dawg_available || !subsumed_by_punc) {
635 *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
636 if (dawg_debug_level >= 3) {
637 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
638 }
639 }
640 }
641 }
642}
643
644void Dict::add_document_word(const WERD_CHOICE& best_choice) {
645 // Do not add hyphenated word parts to the document dawg.
646 // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
647 // called when the first part of the hyphenated word is
648 // discovered and while the second part of the word is recognized.
649 // hyphen_word_ is cleared in cc_recg() before the next word on
650 // the line is recognized.
651 if (hyphen_word_) return;
652
653 int stringlen = best_choice.length();
654
655 if (valid_word(best_choice) || stringlen < 2) return;
656
657 // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
658 if (best_choice.length() >= kDocDictMaxRepChars) {
659 int num_rep_chars = 1;
660 UNICHAR_ID uch_id = best_choice.unichar_id(0);
661 for (int i = 1; i < best_choice.length(); ++i) {
662 if (best_choice.unichar_id(i) != uch_id) {
663 num_rep_chars = 1;
664 uch_id = best_choice.unichar_id(i);
665 } else {
666 ++num_rep_chars;
667 if (num_rep_chars == kDocDictMaxRepChars) return;
668 }
669 }
670 }
671
672 if (best_choice.certainty() < doc_dict_certainty_threshold ||
673 stringlen == 2) {
674 if (best_choice.certainty() < doc_dict_pending_threshold) return;
675
676 if (!pending_words_->word_in_dawg(best_choice)) {
677 if (stringlen > 2 ||
678 (stringlen == 2 &&
679 getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
680 getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
681 pending_words_->add_word_to_dawg(best_choice);
682 }
683 return;
684 }
685 }
686
687 if (save_doc_words) {
688 STRING filename(getCCUtil()->imagefile);
689 filename += ".doc";
690 FILE* doc_word_file = fopen(filename.string(), "a");
691 if (doc_word_file == nullptr) {
692 tprintf("Error: Could not open file %s\n", filename.string());
693 ASSERT_HOST(doc_word_file);
694 }
695 fprintf(doc_word_file, "%s\n", best_choice.debug_string().string());
696 fclose(doc_word_file);
697 }
698 document_words_->add_word_to_dawg(best_choice);
699}
700
701void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
702 XHeightConsistencyEnum xheight_consistency,
703 float additional_adjust, bool modify_rating,
704 bool debug) {
705 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
706 word->GetTopScriptID() == getUnicharset().han_sid());
707 bool case_is_ok = (is_han || case_ok(*word));
708 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
709
710 float adjust_factor = additional_adjust;
711 float new_rating = word->rating();
712 new_rating += kRatingPad;
713 const char* xheight_triggered = "";
714 if (word->length() > 1) {
715 // Calculate x-height and y-offset consistency penalties.
716 switch (xheight_consistency) {
717 case XH_INCONSISTENT:
718 adjust_factor += xheight_penalty_inconsistent;
719 xheight_triggered = ", xhtBAD";
720 break;
721 case XH_SUBNORMAL:
722 adjust_factor += xheight_penalty_subscripts;
723 xheight_triggered = ", xhtSUB";
724 break;
725 case XH_GOOD:
726 // leave the factor alone - all good!
727 break;
728 }
729 // TODO(eger): if nonword is true, but there is a "core" that is a dict
730 // word, negate nonword status.
731 } else {
732 if (debug) {
733 tprintf("Consistency could not be calculated.\n");
734 }
735 }
736 if (debug) {
737 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
738 word->unichar_string().string(), word->rating(), xheight_triggered);
739 }
740
741 if (nonword) { // non-dictionary word
742 if (case_is_ok && punc_is_ok) {
743 adjust_factor += segment_penalty_dict_nonword;
744 new_rating *= adjust_factor;
745 if (debug) tprintf(", W");
746 } else {
747 adjust_factor += segment_penalty_garbage;
748 new_rating *= adjust_factor;
749 if (debug) {
750 if (!case_is_ok) tprintf(", C");
751 if (!punc_is_ok) tprintf(", P");
752 }
753 }
754 } else { // dictionary word
755 if (case_is_ok) {
756 if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
758 adjust_factor += segment_penalty_dict_frequent_word;
759 new_rating *= adjust_factor;
760 if (debug) tprintf(", F");
761 } else {
762 adjust_factor += segment_penalty_dict_case_ok;
763 new_rating *= adjust_factor;
764 if (debug) tprintf(", ");
765 }
766 } else {
767 adjust_factor += segment_penalty_dict_case_bad;
768 new_rating *= adjust_factor;
769 if (debug) tprintf(", C");
770 }
771 }
772 new_rating -= kRatingPad;
773 if (modify_rating) word->set_rating(new_rating);
774 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
775 word->set_adjust_factor(adjust_factor);
776}
777
778int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
779 const WERD_CHOICE* word_ptr = &word;
780 WERD_CHOICE temp_word(word.unicharset());
781 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
782 copy_hyphen_info(&temp_word);
783 temp_word += word;
784 word_ptr = &temp_word;
785 }
786 if (word_ptr->length() == 0) return NO_PERM;
787 // Allocate vectors for holding current and updated
788 // active_dawgs and initialize them.
789 auto* active_dawgs = new DawgPositionVector[2];
790 init_active_dawgs(&(active_dawgs[0]), false);
791 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
792 int last_index = word_ptr->length() - 1;
793 // Call letter_is_okay for each letter in the word.
794 for (int i = hyphen_base_size(); i <= last_index; ++i) {
795 if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
796 word_ptr->unichar_id(i), i == last_index)))
797 break;
798 // Swap active_dawgs, constraints with the corresponding updated vector.
799 if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
800 dawg_args.updated_dawgs = &(active_dawgs[0]);
801 ++(dawg_args.active_dawgs);
802 } else {
803 ++(dawg_args.updated_dawgs);
804 dawg_args.active_dawgs = &(active_dawgs[0]);
805 }
806 }
807 delete[] active_dawgs;
808 return valid_word_permuter(dawg_args.permuter, numbers_ok)
809 ? dawg_args.permuter
810 : NO_PERM;
811}
812
814 const WERD_CHOICE& word2) const {
815 if (bigram_dawg_ == nullptr) return false;
816
817 // Extract the core word from the middle of each word with any digits
818 // replaced with question marks.
819 int w1start, w1end, w2start, w2end;
820 word1.punct_stripped(&w1start, &w1end);
821 word2.punct_stripped(&w2start, &w2end);
822
823 // We don't want to penalize a single guillemet, hyphen, etc.
824 // But our bigram list doesn't have any information about punctuation.
825 if (w1start >= w1end) return word1.length() < 3;
826 if (w2start >= w2end) return word2.length() < 3;
827
828 const UNICHARSET& uchset = getUnicharset();
829 GenericVector<UNICHAR_ID> bigram_string;
830 bigram_string.reserve(w1end + w2end + 1);
831 for (int i = w1start; i < w1end; i++) {
832 const GenericVector<UNICHAR_ID>& normed_ids =
834 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
835 bigram_string.push_back(question_unichar_id_);
836 else
837 bigram_string += normed_ids;
838 }
839 bigram_string.push_back(UNICHAR_SPACE);
840 for (int i = w2start; i < w2end; i++) {
841 const GenericVector<UNICHAR_ID>& normed_ids =
843 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
844 bigram_string.push_back(question_unichar_id_);
845 else
846 bigram_string += normed_ids;
847 }
848 WERD_CHOICE normalized_word(&uchset, bigram_string.size());
849 for (int i = 0; i < bigram_string.size(); ++i) {
850 normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
851 0.0f);
852 }
853 return bigram_dawg_->word_in_dawg(normalized_word);
854}
855
857 if (word.length() == 0) return NO_PERM;
858 int i;
859 WERD_CHOICE new_word(word.unicharset());
860 int last_index = word.length() - 1;
861 int new_len = 0;
862 for (i = 0; i <= last_index; ++i) {
863 UNICHAR_ID unichar_id = (word.unichar_id(i));
864 if (getUnicharset().get_ispunctuation(unichar_id)) {
865 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
866 } else if (!getUnicharset().get_isalpha(unichar_id) &&
867 !getUnicharset().get_isdigit(unichar_id)) {
868 return false; // neither punc, nor alpha, nor digit
869 } else if ((new_len = new_word.length()) == 0 ||
870 new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
871 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
872 }
873 }
874 for (i = 0; i < dawgs_.size(); ++i) {
875 if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
876 dawgs_[i]->word_in_dawg(new_word))
877 return true;
878 }
879 return false;
880}
881
884 const UNICHARSET& u_set = getUnicharset();
885 if (u_set.han_sid() > 0) return false;
886 if (u_set.katakana_sid() > 0) return false;
887 if (u_set.thai_sid() > 0) return false;
888 return true;
889}
890
891} // namespace tesseract
PermuterType
Definition: ratngs.h:232
@ USER_PATTERN_PERM
Definition: ratngs.h:240
@ DOC_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ PUNC_PERM
Definition: ratngs.h:234
@ COMPOUND_PERM
Definition: ratngs.h:245
@ NO_PERM
Definition: ratngs.h:233
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:333
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_SPACE
Definition: unicharset.h:34
int64_t NODE_REF
Definition: dawg.h:52
#define REFFORMAT
Definition: dawg.h:89
int64_t EDGE_REF
Definition: dawg.h:51
@ DAWG_TYPE_PATTERN
Definition: dawg.h:72
@ DAWG_TYPE_WORD
Definition: dawg.h:70
@ DAWG_TYPE_PUNCTUATION
Definition: dawg.h:69
XHeightConsistencyEnum
Definition: dict.h:78
@ XH_GOOD
Definition: dict.h:78
@ XH_SUBNORMAL
Definition: dict.h:78
@ XH_INCONSISTENT
Definition: dict.h:78
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG
@ TESSDATA_SYSTEM_DAWG
GenericVector< int > SuccessorList
Definition: dawg.h:65
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:86
void delete_data_pointers()
void reserve(int size)
const STRING debug_string() const
Definition: ratngs.h:495
const STRING & unichar_string() const
Definition: ratngs.h:531
void set_adjust_factor(float factor)
Definition: ratngs.h:299
int GetTopScriptID() const
Definition: ratngs.cpp:671
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:387
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
void set_rating(float new_val)
Definition: ratngs.h:359
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
float rating() const
Definition: ratngs.h:317
STRING language_data_path_prefix
Definition: ccutil.h:72
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
int katakana_sid() const
Definition: unicharset.h:891
int han_sid() const
Definition: unicharset.h:889
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int null_sid() const
Definition: unicharset.h:884
int thai_sid() const
Definition: unicharset.h:892
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const STRING & lang() const
Definition: dawg.h:125
DawgType type() const
Definition: dawg.h:124
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:192
PermuterType permuter() const
Definition: dawg.h:126
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:181
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:122
EDGE_REF punc_ref
Definition: dawg.h:368
EDGE_REF dawg_ref
Definition: dawg.h:367
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:383
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
DawgPositionVector * updated_dawgs
Definition: dict.h:85
DawgPositionVector * active_dawgs
Definition: dict.h:84
PermuterType permuter
Definition: dict.h:86
bool valid_end
Definition: dict.h:88
bool save_doc_words
Definition: dict.h:649
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:561
double xheight_penalty_subscripts
Definition: dict.h:595
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
int dawg_debug_level
Definition: dict.h:622
double doc_dict_certainty_threshold
Definition: dict.h:653
double segment_penalty_dict_case_ok
Definition: dict.h:605
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. its unicharset has no Han, Katakana, or Thai script).
Definition: dict.cpp:883
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372
const CCUtil * getCCUtil() const
Definition: dict.h:95
double segment_penalty_dict_case_bad
Definition: dict.h:609
bool load_punc_dawg
Definition: dict.h:589
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
double segment_penalty_dict_nonword
Definition: dict.h:613
char * user_patterns_suffix
Definition: dict.h:584
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
double segment_penalty_garbage
Definition: dict.h:618
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
double segment_penalty_dict_frequent_word
Definition: dict.h:601
bool load_number_dawg
Definition: dict.h:590
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:448
bool load_freq_dawg
Definition: dict.h:586
bool load_unambig_dawg
Definition: dict.h:587
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778
double xheight_penalty_inconsistent
Definition: dict.h:598
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
void End()
Definition: dict.cpp:372
Dict(CCUtil *image_ptr)
Definition: dict.cpp:30
bool FinishLoad()
Definition: dict.cpp:351
char * user_words_file
Definition: dict.h:578
bool load_system_dawg
Definition: dict.h:585
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:856
double doc_dict_pending_threshold
Definition: dict.h:651
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:644
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
char * user_words_suffix
Definition: dict.h:580
bool load_bigram_dawg
Definition: dict.h:592
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:395
char * user_patterns_file
Definition: dict.h:582
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:281
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:337
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:394
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:60
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:169