tesseract 4.1.1
Loading...
Searching...
No Matches
tessdatamanager.cpp
Go to the documentation of this file.
1
2// File: tessdatamanager.cpp
3// Description: Functions to handle loading/combining tesseract data files.
4// Author: Daria Antonova
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20#include "config_auto.h"
21#endif
22
23#include "tessdatamanager.h"
24
25#include <cstdio>
26#include <string>
27
28#if defined(HAVE_LIBARCHIVE)
29#include <archive.h>
30#include <archive_entry.h>
31#endif
32
33#include "errcode.h"
34#include "helpers.h"
35#include "serialis.h"
36#include "strngs.h"
37#include "tprintf.h"
38#include "params.h"
39
40namespace tesseract {
41
42TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
43 SetVersionString(PACKAGE_VERSION);
44}
45
47 : reader_(reader),
48 is_loaded_(false),
49 swap_(false) {
50 SetVersionString(PACKAGE_VERSION);
51}
52
53// Lazily loads from the the given filename. Won't actually read the file
54// until it needs it.
55void TessdataManager::LoadFileLater(const char *data_file_name) {
56 Clear();
57 data_file_name_ = data_file_name;
58}
59
60#if defined(HAVE_LIBARCHIVE)
61bool TessdataManager::LoadArchiveFile(const char *filename) {
62 bool result = false;
63 archive *a = archive_read_new();
64 if (a != nullptr) {
65 archive_read_support_filter_all(a);
66 archive_read_support_format_all(a);
67 if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
68 archive_entry *ae;
69 while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
70 const char *component = archive_entry_pathname(ae);
71 if (component != nullptr) {
72 TessdataType type;
73 if (TessdataTypeFromFileName(component, &type)) {
74 int64_t size = archive_entry_size(ae);
75 if (size > 0) {
76 entries_[type].resize_no_init(size);
77 if (archive_read_data(a, &entries_[type][0], size) == size) {
78 is_loaded_ = true;
79 }
80 }
81 }
82 }
83 }
84 result = is_loaded_;
85#if defined(DEBUG)
86 } else {
87 tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
88 filename, strerror(archive_errno(a)));
89#endif
90 }
91 archive_read_free(a);
92 }
93 return result;
94}
95#endif
96
97bool TessdataManager::Init(const char *data_file_name) {
99 if (reader_ == nullptr) {
100#if defined(HAVE_LIBARCHIVE)
101 if (LoadArchiveFile(data_file_name)) return true;
102#endif
103 if (!LoadDataFromFile(data_file_name, &data)) return false;
104 } else {
105 if (!(*reader_)(data_file_name, &data)) return false;
106 }
107 return LoadMemBuffer(data_file_name, &data[0], data.size());
108}
109
110// Loads from the given memory buffer as if a file.
111bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
112 int size) {
113 // TODO: This method supports only the proprietary file format.
114 Clear();
115 data_file_name_ = name;
116 TFile fp;
117 fp.Open(data, size);
118 uint32_t num_entries;
119 if (!fp.DeSerialize(&num_entries)) return false;
120 swap_ = num_entries > kMaxNumTessdataEntries;
121 fp.set_swap(swap_);
122 if (swap_) ReverseN(&num_entries, sizeof(num_entries));
123 if (num_entries > kMaxNumTessdataEntries) return false;
124 GenericVector<int64_t> offset_table;
125 offset_table.resize_no_init(num_entries);
126 if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
127 for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
128 if (offset_table[i] >= 0) {
129 int64_t entry_size = size - offset_table[i];
130 unsigned j = i + 1;
131 while (j < num_entries && offset_table[j] == -1) ++j;
132 if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
133 entries_[i].resize_no_init(entry_size);
134 if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
135 }
136 }
137 if (entries_[TESSDATA_VERSION].empty()) {
138 SetVersionString("Pre-4.0.0");
139 }
140 is_loaded_ = true;
141 return true;
142}
143
144// Overwrites a single entry of the given type.
146 int size) {
147 is_loaded_ = true;
148 entries_[type].resize_no_init(size);
149 memcpy(&entries_[type][0], data, size);
150}
151
152// Saves to the given filename.
154 FileWriter writer) const {
155 // TODO: This method supports only the proprietary file format.
156 ASSERT_HOST(is_loaded_);
158 Serialize(&data);
159 if (writer == nullptr)
160 return SaveDataToFile(data, filename.c_str());
161 else
162 return (*writer)(data, filename.c_str());
163}
164
165// Serializes to the given vector.
167 // TODO: This method supports only the proprietary file format.
168 ASSERT_HOST(is_loaded_);
169 // Compute the offset_table and total size.
170 int64_t offset_table[TESSDATA_NUM_ENTRIES];
171 int64_t offset = sizeof(int32_t) + sizeof(offset_table);
172 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
173 if (entries_[i].empty()) {
174 offset_table[i] = -1;
175 } else {
176 offset_table[i] = offset;
177 offset += entries_[i].size();
178 }
179 }
180 data->init_to_size(offset, 0);
181 int32_t num_entries = TESSDATA_NUM_ENTRIES;
182 TFile fp;
183 fp.OpenWrite(data);
184 fp.Serialize(&num_entries);
185 fp.Serialize(&offset_table[0], countof(offset_table));
186 for (const auto& entry : entries_) {
187 if (!entry.empty()) {
188 fp.Serialize(&entry[0], entry.size());
189 }
190 }
191}
192
193// Resets to the initial state, keeping the reader.
195 for (auto& entry : entries_) {
196 entry.clear();
197 }
198 is_loaded_ = false;
199}
200
201// Prints a directory of contents.
// Prints the version string, then one line per non-empty entry giving its
// index, file suffix, size and a running offset.
// NOTE(review): the function signature line is missing from this listing.
// NOTE(review): the running offset starts at the size of the offset table
// only, whereas Serialize also counts the leading 32-bit entry count, so
// the printed offsets look 4 bytes smaller than the serialized ones -
// confirm which is intended.
// NOTE(review): i is unsigned but is printed with %d.
203 tprintf("Version string:%s\n", VersionString().c_str());
204 int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
205 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206 if (!entries_[i].empty()) {
207 tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
208 entries_[i].size(), offset);
209 offset += entries_[i].size();
210 }
211 }
212}
213
214// Opens the given TFile pointer to the given component type.
215// Returns false in case of failure.
217 if (!is_loaded_ && !Init(data_file_name_.string())) return false;
218 const TessdataManager *const_this = this;
219 return const_this->GetComponent(type, fp);
220}
221
222// As non-const version except it can't load the component if not already
223// loaded.
225 ASSERT_HOST(is_loaded_);
226 if (entries_[type].empty()) return false;
227 fp->Open(&entries_[type][0], entries_[type].size());
228 fp->set_swap(swap_);
229 return true;
230}
231
232// Returns the current version string.
234 return std::string(&entries_[TESSDATA_VERSION][0],
235 entries_[TESSDATA_VERSION].size());
236}
237
238// Sets the version string to the given v_str.
239void TessdataManager::SetVersionString(const std::string &v_str) {
240 entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
241 memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
242}
243
245 const char *language_data_path_prefix,
246 const char *output_filename) {
247 // Load individual tessdata components from files.
248 for (auto filesuffix : kTessdataFileSuffixes) {
249 TessdataType type;
250 ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
251 STRING filename = language_data_path_prefix;
252 filename += filesuffix;
253 FILE *fp = fopen(filename.string(), "rb");
254 if (fp != nullptr) {
255 fclose(fp);
256 if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
257 tprintf("Load of file %s failed!\n", filename.string());
258 return false;
259 }
260 }
261 }
262 is_loaded_ = true;
263
264 // Make sure that the required components are present.
265 if (!IsBaseAvailable() && !IsLSTMAvailable()) {
266 tprintf(
267 "Error: traineddata file must contain at least (a unicharset file"
268 "and inttemp) OR an lstm file.\n");
269 return false;
270 }
271 // Write updated data to the output traineddata file.
272 return SaveFile(output_filename, nullptr);
273}
274
276 const char *new_traineddata_filename,
277 char **component_filenames,
278 int num_new_components) {
279 // Open the files with the new components.
280 // TODO: This method supports only the proprietary file format.
281 for (int i = 0; i < num_new_components; ++i) {
282 TessdataType type;
283 if (TessdataTypeFromFileName(component_filenames[i], &type)) {
284 if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
285 tprintf("Failed to read component file:%s\n", component_filenames[i]);
286 return false;
287 }
288 }
289 }
290
291 // Write updated data to the output traineddata file.
292 return SaveFile(new_traineddata_filename, nullptr);
293}
294
295bool TessdataManager::ExtractToFile(const char *filename) {
298 tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
299 if (entries_[type].empty()) return false;
300 return SaveDataToFile(entries_[type], filename);
301}
302
303bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
304 TessdataType *type) {
305 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
306 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
307 *type = static_cast<TessdataType>(i);
308 return true;
309 }
310 }
311#if defined(DEBUG)
312 tprintf("TessdataManager can't determine which tessdata"
313 " component is represented by %s\n", suffix);
314#endif
315 return false;
316}
317
318bool TessdataManager::TessdataTypeFromFileName(const char *filename,
319 TessdataType *type) {
320 // Get the file suffix (extension)
321 const char *suffix = strrchr(filename, '.');
322 if (suffix == nullptr || *(++suffix) == '\0') return false;
323 return TessdataTypeFromFileSuffix(suffix, type);
324}
325
326} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool(*)(const STRING &, GenericVector< char > *) FileReader
Definition: serialis.h:49
@ TESSDATA_NUM_ENTRIES
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:43
bool(*)(const GenericVector< char > &, const STRING &) FileWriter
Definition: serialis.h:52
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void init_to_size(int size, const T &t)
void resize_no_init(int size)
Definition: genericvector.h:66
int size() const
Definition: genericvector.h:72
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:296
void set_swap(bool value)
Definition: serialis.h:90
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:197
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:104
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void Serialize(GenericVector< char > *data) const
bool ExtractToFile(const char *filename)
void LoadFileLater(const char *data_file_name)
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Init(const char *data_file_name)