tesseract 4.1.1
Loading...
Searching...
No Matches
tessedit.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: tessedit.cpp (Formerly tessedit.c)
3 * Description: (Previously) Main program for merge of tess and editor.
4 * Now just code to load the language model and various
5 * engine-specific data files.
6 * Author: Ray Smith
7 *
8 * (C) Copyright 1992, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21// Include automatically generated configuration file if running autoconf.
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h"
24#endif
25
26#include "control.h"
27# include "matchdefs.h"
28#include "pageres.h"
29#include "params.h"
30#include "stopper.h"
31#include "tesseractclass.h"
32#include "tessvars.h"
33#include "tprintf.h"
34#ifndef DISABLED_LEGACY_ENGINE
35# include "chop.h"
36# include "intmatcher.h"
37# include "reject.h"
38#endif
39#ifndef ANDROID_BUILD
40# include "lstmrecognizer.h"
41#endif
42
43namespace tesseract {
44
45// Read a "config" file containing a set of variable, value pairs.
46// Searches the standard places: tessdata/configs, tessdata/tessconfigs
47// and also accepts a relative or absolute path name.
48void Tesseract::read_config_file(const char* filename,
49 SetParamConstraint constraint) {
50 STRING path = datadir;
51 path += "configs/";
52 path += filename;
53 FILE* fp;
54 if ((fp = fopen(path.string(), "rb")) != nullptr) {
55 fclose(fp);
56 } else {
57 path = datadir;
58 path += "tessconfigs/";
59 path += filename;
60 if ((fp = fopen(path.string(), "rb")) != nullptr) {
61 fclose(fp);
62 } else {
63 path = filename;
64 }
65 }
66 ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
67}
68
69// Returns false if a unicharset file for the specified language was not found
70// or was invalid.
71// This function initializes TessdataManager. After TessdataManager is
72// no longer needed, TessdataManager::End() should be called.
73//
74// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
75// it is OEM_DEFAULT, in which case the value of the variable will be obtained
76// from the language-specific config file (stored in [lang].traineddata), from
77// the config files specified on the command line or left as the default
78// OEM_TESSERACT_ONLY if none of the configs specify this variable.
80 const char* arg0, const char* textbase, const char* language,
81 OcrEngineMode oem, char** configs, int configs_size,
82 const GenericVector<STRING>* vars_vec,
83 const GenericVector<STRING>* vars_values, bool set_only_non_debug_params,
84 TessdataManager* mgr) {
85 // Set the basename, compute the data directory.
86 main_setup(arg0, textbase);
87
88 // Set the language data path prefix
89 lang = language != nullptr ? language : "eng";
93
94 // Initialize TessdataManager.
95 STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
96 if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
97 tprintf("Error opening data file %s\n", tessdata_path.string());
98 tprintf(
99 "Please make sure the TESSDATA_PREFIX environment variable is set"
100 " to your \"tessdata\" directory.\n");
101 return false;
102 }
103#ifndef DISABLED_LEGACY_ENGINE
104 if (oem == OEM_DEFAULT) {
105 // Set the engine mode from availability, which can then be overridden by
106 // the config file when we read it below.
107 if (!mgr->IsLSTMAvailable()) {
109 } else if (!mgr->IsBaseAvailable()) {
111 } else {
113 }
114 }
115#endif // ndef DISABLED_LEGACY_ENGINE
116
117 // If a language specific config file (lang.config) exists, load it in.
118 TFile fp;
119 if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
121 this->params());
122 }
123
124 SetParamConstraint set_params_constraint =
125 set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
127 // Load tesseract variables from config files. This is done after loading
128 // language-specific variables from [lang].traineddata file, so that custom
129 // config files can override values in [lang].traineddata file.
130 for (int i = 0; i < configs_size; ++i) {
131 read_config_file(configs[i], set_params_constraint);
132 }
133
134 // Set params specified in vars_vec (done after setting params from config
135 // files, so that params in vars_vec can override those from files).
136 if (vars_vec != nullptr && vars_values != nullptr) {
137 for (int i = 0; i < vars_vec->size(); ++i) {
138 if (!ParamUtils::SetParam((*vars_vec)[i].string(),
139 (*vars_values)[i].string(),
140 set_params_constraint, this->params())) {
141 tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].string());
142 }
143 }
144 }
145
146 if (!tessedit_write_params_to_file.empty()) {
147 FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb");
148 if (params_file != nullptr) {
149 ParamUtils::PrintParams(params_file, this->params());
150 fclose(params_file);
151 } else {
152 tprintf("Failed to open %s for writing params.\n",
154 }
155 }
156
157 // Determine which ocr engine(s) should be loaded and used for recognition.
158 if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
159
160 // If we are only loading the config file (and so not planning on doing any
161 // recognition) then there's nothing else to do here.
163 return true;
164 }
165
166// The various OcrEngineMode settings (see publictypes.h) determine which
167// engine-specific data files need to be loaded.
168// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
169#ifndef ANDROID_BUILD
170# ifdef DISABLED_LEGACY_ENGINE
172# else
175# endif // ndef DISABLED_LEGACY_ENGINE
177 lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix);
178 ASSERT_HOST(lstm_recognizer_->Load(
179 this->params(), lstm_use_matrix ? language : nullptr, mgr));
180 } else {
181 tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
183 }
184 }
185#endif // ndef ANDROID_BUILD
186
187 // Load the unicharset
189 // Avoid requiring a unicharset when we aren't running base tesseract.
190#ifndef ANDROID_BUILD
191 unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
192#endif // ndef ANDROID_BUILD
193 }
194#ifndef DISABLED_LEGACY_ENGINE
195 else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
196 !unicharset.load_from_file(&fp, false)) {
197 tprintf("Error: Tesseract (legacy) engine requested, but components are "
198 "not present in %s!!\n", tessdata_path.c_str());
199 return false;
200 }
201#endif // ndef DISABLED_LEGACY_ENGINE
203 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204 return false;
205 }
206 right_to_left_ = unicharset.major_right_to_left();
207
208#ifndef DISABLED_LEGACY_ENGINE
209
210 // Setup initial unichar ambigs table and read universal ambigs.
211 UNICHARSET encoder_unicharset;
212 encoder_unicharset.CopyFrom(unicharset);
214 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
215
217 unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
220 }
221
222 // Init ParamsModel.
223 // Load pass1 and pass2 weights (for now these two sets are the same, but in
224 // the future separate sets of weights can be generated).
226 ++p) {
227 language_model_->getParamsModel().SetPass(
228 static_cast<ParamsModel::PassEnum>(p));
229 if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
230 if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
231 return false;
232 }
233 }
234 }
235#endif // ndef DISABLED_LEGACY_ENGINE
236
237 return true;
238}
239
240// Helper returns true if the given string is in the vector of strings.
241static bool IsStrInList(const STRING& str,
242 const GenericVector<STRING>& str_list) {
243 for (int i = 0; i < str_list.size(); ++i) {
244 if (str_list[i] == str) return true;
245 }
246 return false;
247}
248
249// Parse a string of the form [~]<lang>[+[~]<lang>]*.
250// Langs with no prefix get appended to to_load, provided they
251// are not in there already.
252// Langs with ~ prefix get appended to not_to_load, provided they are not in
253// there already.
254void Tesseract::ParseLanguageString(const char* lang_str,
255 GenericVector<STRING>* to_load,
256 GenericVector<STRING>* not_to_load) {
257 STRING remains(lang_str);
258 while (remains.length() > 0) {
259 // Find the start of the lang code and which vector to add to.
260 const char* start = remains.string();
261 while (*start == '+') ++start;
262 GenericVector<STRING>* target = to_load;
263 if (*start == '~') {
264 target = not_to_load;
265 ++start;
266 }
267 // Find the index of the end of the lang code in string start.
268 int end = strlen(start);
269 const char* plus = strchr(start, '+');
270 if (plus != nullptr && plus - start < end) end = plus - start;
271 STRING lang_code(start);
272 lang_code.truncate_at(end);
273 STRING next(start + end);
274 remains = next;
275 // Check whether lang_code is already in the target vector and add.
276 if (!IsStrInList(lang_code, *target)) {
277 target->push_back(lang_code);
278 }
279 }
280}
281
282// Initialize for potentially a set of languages defined by the language
283// string and recursively any additional languages required by any language
284// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
285// See init_tesseract_internal for args.
// The first language that initializes successfully is loaded into this
// instance; every later one gets its own Tesseract object in sub_langs_.
// Languages that appear in the not-to-load list (~ prefix) are skipped.
// Returns 0 once a primary language has loaded, or -1 if none could be
// loaded. A sub-language that fails to load only produces a message.
286int Tesseract::init_tesseract(const char* arg0, const char* textbase,
287 const char* language, OcrEngineMode oem,
288 char** configs, int configs_size,
289 const GenericVector<STRING>* vars_vec,
290 const GenericVector<STRING>* vars_values,
291 bool set_only_non_debug_params,
292 TessdataManager* mgr) {
293 GenericVector<STRING> langs_to_load;
294 GenericVector<STRING> langs_not_to_load;
295 ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
 // Start from an empty sub-language list on every call.
297 sub_langs_.delete_data_pointers();
298 sub_langs_.clear();
299 // Find the first loadable lang and load into this.
300 // Add any languages that this language requires
301 bool loaded_primary = false;
302 // Load the rest into sub_langs_.
 // langs_to_load can grow during the loop: the ParseLanguageString calls
 // below append the sub-languages requested by each loaded language, which
 // is why size() is re-evaluated on every pass.
303 for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
304 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
305 const char* lang_str = langs_to_load[lang_index].string();
306 Tesseract* tess_to_init;
307 if (!loaded_primary) {
308 tess_to_init = this;
309 } else {
310 tess_to_init = new Tesseract;
311 }
312
313 int result = tess_to_init->init_tesseract_internal(
314 arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
315 vars_values, set_only_non_debug_params, mgr);
316 // Forget that language, but keep any reader we were given.
317 mgr->Clear();
318
319 if (!loaded_primary) {
 // Still looking for a primary: on failure just report it and let the
 // next language try to become the primary.
320 if (result < 0) {
321 tprintf("Failed loading language '%s'\n", lang_str);
322 } else {
323 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
324 &langs_to_load, &langs_not_to_load);
325 loaded_primary = true;
326 }
327 } else {
328 if (result < 0) {
329 tprintf("Failed loading language '%s'\n", lang_str);
 // Only sub-language instances are allocated with new in this function,
 // so only they are deleted on failure.
330 delete tess_to_init;
331 } else {
332 sub_langs_.push_back(tess_to_init);
333 // Add any languages that this language requires
334 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
335 &langs_to_load, &langs_not_to_load);
336 }
337 }
338 }
339 }
340 if (!loaded_primary) {
341 tprintf("Tesseract couldn't load any languages!\n");
342 return -1; // Couldn't load any language!
343 }
344#ifndef DISABLED_LEGACY_ENGINE
345 if (!sub_langs_.empty()) {
346 // In multilingual mode word ratings have to be directly comparable,
347 // so use the same language model weights for all languages:
348 // use the primary language's params model if
349 // tessedit_use_primary_params_model is set,
350 // otherwise use default language model weights.
 // NOTE(review): this listing jumps from line 350 to 352, so the statement
 // that opens the branch closed by "} else {" at line 357 is not visible
 // here -- confirm against the real file.
352 for (int s = 0; s < sub_langs_.size(); ++s) {
353 sub_langs_[s]->language_model_->getParamsModel().Copy(
354 this->language_model_->getParamsModel());
355 }
356 tprintf("Using params model of the primary language\n");
357 } else {
358 this->language_model_->getParamsModel().Clear();
359 for (int s = 0; s < sub_langs_.size(); ++s) {
360 sub_langs_[s]->language_model_->getParamsModel().Clear();
361 }
362 }
363 }
364
 // NOTE(review): line 365 of the original file is also absent from this
 // listing -- confirm what runs here before editing this function.
366#endif // ndef DISABLED_LEGACY_ENGINE
367 return 0;
368}
369
370// Common initialization for a single language.
371// arg0 is the datapath for the tessdata directory, which could be the
372// path of the tessdata directory with no trailing /, or (if tessdata
373// lives in the same directory as the executable) the path of the executable,
374// hence the name arg0.
375// textbase is an optional output file basename (used only for training)
376// language is the language code to load.
377// oem controls which engine(s) will operate on the image
378// configs (argv) is an array of config filenames to load variables from.
379// May be nullptr.
380// configs_size (argc) is the number of elements in configs.
381// vars_vec is an optional vector of variables to set.
382// vars_values is an optional corresponding vector of values for the variables
383// in vars_vec.
384// If set_only_non_debug_params is true, params are applied under the
385// SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY constraint by init_tesseract_lang_data.
// mgr is the TessdataManager passed through to init_tesseract_lang_data.
// Returns 0 on success, or -1 if init_tesseract_lang_data fails.
386int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase,
387 const char* language, OcrEngineMode oem,
388 char** configs, int configs_size,
389 const GenericVector<STRING>* vars_vec,
390 const GenericVector<STRING>* vars_values,
391 bool set_only_non_debug_params,
392 TessdataManager* mgr) {
393 if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
394 configs_size, vars_vec, vars_values,
395 set_only_non_debug_params, mgr)) {
396 return -1;
397 }
 // NOTE(review): line 398 is missing from this listing; it must hold the
 // condition guarding the early "return 0" below -- confirm against the
 // real file.
399 return 0;
400 }
401 // If only LSTM will be used, skip loading Tesseract classifier's
402 // pre-trained templates and dictionary.
 // NOTE(review): the declaration of the local `init_tesseract` flag used
 // below (line 403) is missing from this listing. When the flag is false,
 // nullptr is passed for both manager arguments of program_editup.
404 program_editup(textbase, init_tesseract ? mgr : nullptr,
405 init_tesseract ? mgr : nullptr);
406 return 0; // Normal exit
407}
408
409#ifndef DISABLED_LEGACY_ENGINE
410
411// Helper builds the all_fonts table by adding new fonts from new_fonts.
412static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
413 UnicityTable<FontInfo>* all_fonts) {
414 for (int i = 0; i < new_fonts.size(); ++i) {
415 // UnicityTable uniques as we go.
416 all_fonts->push_back(new_fonts.get(i));
417 }
418}
419
420// Helper assigns an id to lang_fonts using the index in all_fonts table.
421static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
422 UnicityTable<FontInfo>* lang_fonts) {
423 for (int i = 0; i < lang_fonts->size(); ++i) {
424 int index = all_fonts.get_id(lang_fonts->get(i));
425 lang_fonts->get_mutable(i)->universal_id = index;
426 }
427}
428
429// Set the universal_id member of each font to be unique among all
430// instances of the same font loaded.
432 // Note that we can get away with bitwise copying FontInfo in
433 // all_fonts, as it is a temporary structure and we avoid setting the
434 // delete callback.
435 UnicityTable<FontInfo> all_fonts;
437
438 // Create the universal ID table.
439 CollectFonts(get_fontinfo_table(), &all_fonts);
440 for (int i = 0; i < sub_langs_.size(); ++i) {
441 CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
442 }
443 // Assign ids from the table to each font table.
444 AssignIds(all_fonts, &get_fontinfo_table());
445 for (int i = 0; i < sub_langs_.size(); ++i) {
446 AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
447 }
448 font_table_size_ = all_fonts.size();
449}
450
451// init the LM component
// Loads the language data for `language` with the engine mode fixed to
// OEM_TESSERACT_ONLY, no config files and no variable overrides, then loads
// the dictionary for the current lang from mgr.
// Returns 0 on success, or -1 if init_tesseract_lang_data fails.
452int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase,
453 const char* language, TessdataManager* mgr) {
454 if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
455 nullptr, 0, nullptr, nullptr, false, mgr))
456 return -1;
 // NOTE(review): lines 457 and 459 of the original file are absent from
 // this listing, so there may be dictionary setup/teardown calls around the
 // Load below -- confirm against the real file.
458 getDict().Load(lang, mgr);
460 return 0;
461}
462
463#endif // ndef DISABLED_LEGACY_ENGINE
464
466
467/* Define command type identifiers */
468
475} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:271
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:269
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
SetParamConstraint
Definition: params.h:35
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:38
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:36
@ ACTION_2_CMD_EVENT
Definition: tessedit.cpp:473
@ RECOG_PSEUDO
Definition: tessedit.cpp:472
@ ACTION_1_CMD_EVENT
Definition: tessedit.cpp:470
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
int push_back(T object)
int size() const
Definition: genericvector.h:72
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:286
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:386
void SetupUniversalFontIds()
Definition: tessedit.cpp:431
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:48
Dict & getDict() override
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:79
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:452
char * tessedit_write_params_to_file
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:254
int get_id(T object) const
int push_back(T object)
Add an element in the table.
T * get_mutable(int id)
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
int size() const
Return the size used.
const T & get(int id) const
Return the object from an id.
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:68
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:54
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:75
STRING language_data_path_prefix
Definition: ccutil.h:72
ParamsVectors * params()
Definition: ccutil.h:67
STRING datadir
Definition: ccutil.h:69
UNICHARSET unicharset
Definition: ccutil.h:73
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:44
STRING lang
Definition: ccutil.h:71
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:75
int ambigs_debug_level
Definition: ccutil.h:87
bool use_ambigs_for_adaption
Definition: ccutil.h:89
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:50
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:168
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:79
Definition: strngs.h:45
void truncate_at(int32_t index)
Definition: strngs.cpp:265
const char * c_str() const
Definition: strngs.cpp:205
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
bool GetComponent(TessdataType type, TFile *fp)
bool IsComponentAvailable(TessdataType type) const
bool Init(const char *data_file_name)
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
bool major_right_to_left() const
Definition: unicharset.cpp:992
int size() const
Definition: unicharset.h:341
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
bool FinishLoad()
Definition: dict.cpp:351
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
const UNICHARSET & GetUnicharset() const
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:471