tesseract 4.1.1
Loading...
Searching...
No Matches
combine_tessdata.cpp
Go to the documentation of this file.
1
2// File: combine_tessdata.cpp
3// Description: Creates a unified traineddata file from several
4// data files produced by the training process.
5// Author: Daria Antonova
6// Created: Wed Jun 03 11:26:43 PST 2009
7//
8// (C) Copyright 2009, Google Inc.
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12// http://www.apache.org/licenses/LICENSE-2.0
13// Unless required by applicable law or agreed to in writing, software
14// distributed under the License is distributed on an "AS IS" BASIS,
15// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16// See the License for the specific language governing permissions and
17// limitations under the License.
18//
20
21#include <cerrno>
22#include "commontraining.h" // CheckSharedLibraryVersion
23#include "lstmrecognizer.h"
24#include "tessdatamanager.h"
25
26// Main program to combine/extract/overwrite tessdata components
27// in [lang].traineddata files.
28//
29// To combine all the individual tessdata components (unicharset, DAWGs,
30// classifier templates, ambiguities, language configs) located at, say,
31// /home/$USER/temp/eng.* run:
32//
33// combine_tessdata /home/$USER/temp/eng.
34//
35// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
36//
37// Specify option -e if you would like to extract individual components
38// from a combined traineddata file. For example, to extract language config
39// file and the unicharset from tessdata/eng.traineddata run:
40//
41// combine_tessdata -e tessdata/eng.traineddata
42// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
43//
44// The desired config file and unicharset will be written to
45// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
46//
47// Specify option -o to overwrite individual components of the given
48// [lang].traineddata file. For example, to overwrite language config
49// and unichar ambiguities files in tessdata/eng.traineddata use:
50//
51// combine_tessdata -o tessdata/eng.traineddata
52// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
53//
54// As a result, tessdata/eng.traineddata will contain the new language config
55// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
56//
57// Note: the file names of the files to extract to and to overwrite from should
58// have the appropriate file suffixes (extensions) indicating their tessdata
59// component type (.unicharset for the unicharset, .unicharambigs for unichar
60// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
61//
62// Specify option -u to unpack all the components to the specified path:
63//
64// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
65//
66// This will create /home/$USER/temp/eng.* files with individual tessdata
67// components from tessdata/eng.traineddata.
68//
69int main(int argc, char **argv) {
70 tesseract::CheckSharedLibraryVersion();
71
72 int i;
74 if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
75 printf("%s\n", tesseract::TessBaseAPI::Version());
76 return EXIT_SUCCESS;
77 } else if (argc == 2) {
78 printf("Combining tessdata files\n");
79 STRING lang = argv[1];
80 char* last = &argv[1][strlen(argv[1])-1];
81 if (*last != '.')
82 lang += '.';
83 STRING output_file = lang;
84 output_file += kTrainedDataSuffix;
85 if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
86 printf("Error combining tessdata files into %s\n",
87 output_file.string());
88 } else {
89 printf("Output %s created successfully.\n", output_file.string());
90 }
91 } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
92 strcmp(argv[1], "-u") == 0)) {
93 // Initialize TessdataManager with the data in the given traineddata file.
94 if (!tm.Init(argv[2])) {
95 tprintf("Failed to read %s\n", argv[2]);
96 return EXIT_FAILURE;
97 }
98 printf("Extracting tessdata components from %s\n", argv[2]);
99 if (strcmp(argv[1], "-e") == 0) {
100 for (i = 3; i < argc; ++i) {
101 errno = 0;
102 if (tm.ExtractToFile(argv[i])) {
103 printf("Wrote %s\n", argv[i]);
104 } else if (errno == 0) {
105 printf("Not extracting %s, since this component"
106 " is not present\n", argv[i]);
107 return EXIT_FAILURE;
108 } else {
109 printf("Error, could not extract %s: %s\n",
110 argv[i], strerror(errno));
111 return EXIT_FAILURE;
112 }
113 }
114 } else { // extract all the components
115 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
116 STRING filename = argv[3];
117 char* last = &argv[3][strlen(argv[3])-1];
118 if (*last != '.')
119 filename += '.';
120 filename += tesseract::kTessdataFileSuffixes[i];
121 errno = 0;
122 if (tm.ExtractToFile(filename.string())) {
123 printf("Wrote %s\n", filename.string());
124 } else if (errno != 0) {
125 printf("Error, could not extract %s: %s\n",
126 filename.string(), strerror(errno));
127 return EXIT_FAILURE;
128 }
129 }
130 }
131 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
132 // Rename the current traineddata file to a temporary name.
133 const char *new_traineddata_filename = argv[2];
134 STRING traineddata_filename = new_traineddata_filename;
135 traineddata_filename += ".__tmp__";
136 if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
137 tprintf("Failed to create a temporary file %s\n",
138 traineddata_filename.string());
139 return EXIT_FAILURE;
140 }
141
142 // Initialize TessdataManager with the data in the given traineddata file.
143 tm.Init(traineddata_filename.string());
144
145 // Write the updated traineddata file.
146 tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
147 } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
148 if (!tm.Init(argv[2])) {
149 tprintf("Failed to read %s\n", argv[2]);
150 return EXIT_FAILURE;
151 }
154 tprintf("No LSTM Component found in %s!\n", argv[2]);
155 return EXIT_FAILURE;
156 }
157 tesseract::LSTMRecognizer recognizer;
158 if (!recognizer.DeSerialize(&tm, &fp)) {
159 tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
160 return EXIT_FAILURE;
161 }
162 recognizer.ConvertToInt();
163 GenericVector<char> lstm_data;
164 fp.OpenWrite(&lstm_data);
165 ASSERT_HOST(recognizer.Serialize(&tm, &fp));
167 lstm_data.size());
168 if (!tm.SaveFile(argv[2], nullptr)) {
169 tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
170 return EXIT_FAILURE;
171 }
172 } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
173 // Initialize TessdataManager with the data in the given traineddata file.
174 tm.Init(argv[2]);
175 } else {
176 printf("Usage for combining tessdata components:\n"
177 " %s language_data_path_prefix\n"
178 " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
179 printf("Usage for extracting tessdata components:\n"
180 " %s -e traineddata_file [output_component_file...]\n"
181 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
182 argv[0], argv[0]);
183 printf("Usage for overwriting tessdata components:\n"
184 " %s -o traineddata_file [input_component_file...]\n"
185 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
186 argv[0], argv[0]);
187 printf("Usage for unpacking all tessdata components:\n"
188 " %s -u traineddata_file output_path_prefix\n"
189 " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
190 printf(
191 "Usage for listing directory of components:\n"
192 " %s -d traineddata_file\n",
193 argv[0]);
194 printf(
195 "Usage for compacting LSTM component to int:\n"
196 " %s -c traineddata_file\n",
197 argv[0]);
198 return 1;
199 }
200 tm.Directory();
201 return EXIT_SUCCESS;
202}
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
LIST last(LIST var_list)
Definition: oldlist.cpp:190
int main(int argc, char **argv)
@ TESSDATA_NUM_ENTRIES
int size() const
Definition: genericvector.h:72
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:296
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
void OverwriteEntry(TessdataType type, const char *data, int size)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
bool SaveFile(const STRING &filename, FileWriter writer) const
bool Init(const char *data_file_name)
bool Serialize(const TessdataManager *mgr, TFile *fp) const
bool DeSerialize(const TessdataManager *mgr, TFile *fp)