tesseract 4.1.1
Loading...
Searching...
No Matches
tessdatamanager.h
Go to the documentation of this file.
1
2// File: tessdatamanager.h
3// Description: Functions to handle loading/combining tesseract data files.
4// Author: Daria Antonova
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21
22#include "genericvector.h"
23#include "strngs.h" // for STRING
24
// Suffix string "traineddata".
// NOTE(review): presumably the extension of a combined data file
// (e.g. "eng.traineddata") — usage is not visible here; confirm.
25static const char kTrainedDataSuffix[] = "traineddata";
26
27// When adding new tessdata types and file suffixes, please make sure to
28// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
// NOTE(review): kTessdataFileIsText is not declared in the visible part of
// this header — confirm it still exists, otherwise drop it from the list.
// Each constant below is the suffix string for one component; the same
// constants are collected, in this order, in kTessdataFileSuffixes.
29static const char kLangConfigFileSuffix[] = "config";
30static const char kUnicharsetFileSuffix[] = "unicharset";
31static const char kAmbigsFileSuffix[] = "unicharambigs";
32static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
33static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
34static const char kNormProtoFileSuffix[] = "normproto";
35static const char kPuncDawgFileSuffix[] = "punc-dawg";
36static const char kSystemDawgFileSuffix[] = "word-dawg";
37static const char kNumberDawgFileSuffix[] = "number-dawg";
38static const char kFreqDawgFileSuffix[] = "freq-dawg";
39static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
40static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
41static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
42static const char kShapeTableFileSuffix[] = "shapetable";
43static const char kBigramDawgFileSuffix[] = "bigram-dawg";
44static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
45static const char kParamsModelFileSuffix[] = "params-model";
46static const char kLSTMModelFileSuffix[] = "lstm";
47static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
48static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
49static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
50static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
51static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
52static const char kVersionFileSuffix[] = "version";
53
54namespace tesseract {
55
67 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
68 TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
69 TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
81
83};
84
// Table of the component file suffix strings declared above.
// The number in each trailing comment is that row's position in the array
// (0 through 23). Rows marked "deprecated" are still listed, so the
// positions of the rows after them are unaffected.
// NOTE(review): presumably indexed by TessdataType, in which case the row
// order must stay in step with that enum — confirm before reordering.
89static const char *const kTessdataFileSuffixes[] = {
90 kLangConfigFileSuffix, // 0
91 kUnicharsetFileSuffix, // 1
92 kAmbigsFileSuffix, // 2
93 kBuiltInTemplatesFileSuffix, // 3
94 kBuiltInCutoffsFileSuffix, // 4
95 kNormProtoFileSuffix, // 5
96 kPuncDawgFileSuffix, // 6
97 kSystemDawgFileSuffix, // 7
98 kNumberDawgFileSuffix, // 8
99 kFreqDawgFileSuffix, // 9
100 kFixedLengthDawgsFileSuffix, // 10 // deprecated
101 kCubeUnicharsetFileSuffix, // 11 // deprecated
102 kCubeSystemDawgFileSuffix, // 12 // deprecated
103 kShapeTableFileSuffix, // 13
104 kBigramDawgFileSuffix, // 14
105 kUnambigDawgFileSuffix, // 15
106 kParamsModelFileSuffix, // 16
107 kLSTMModelFileSuffix, // 17
108 kLSTMPuncDawgFileSuffix, // 18
109 kLSTMSystemDawgFileSuffix, // 19
110 kLSTMNumberDawgFileSuffix, // 20
111 kLSTMUnicharsetFileSuffix, // 21
112 kLSTMRecoderFileSuffix, // 22
113 kVersionFileSuffix, // 23
114};
115
// Maximum number of tessdata entries (1000).
// NOTE(review): presumably a sanity bound checked when reading a data file;
// its use is not visible in this header — confirm against the .cpp.
123static const int kMaxNumTessdataEntries = 1000;
124
125
127 public:
129 explicit TessdataManager(FileReader reader);
130
131 ~TessdataManager() = default;
132
// Returns swap_, i.e. true if the bytes need swapping.
133 bool swap() const { return swap_; }
// Returns is_loaded_, i.e. true if the file has been loaded.
134 bool is_loaded() const { return is_loaded_; }
135
136 // Lazily loads from the given filename. Won't actually read the file
137 // until it needs it.
138 void LoadFileLater(const char *data_file_name);
143 bool Init(const char *data_file_name);
144 // Loads from the given memory buffer as if a file, remembering name as some
145 // arbitrary source id for caching.
146 bool LoadMemBuffer(const char *name, const char *data, int size);
147 // Overwrites a single entry of the given type.
148 void OverwriteEntry(TessdataType type, const char *data, int size);
149
150 // Saves to the given filename.
151 bool SaveFile(const STRING &filename, FileWriter writer) const;
152 // Serializes to the given vector.
153 void Serialize(GenericVector<char> *data) const;
154 // Resets to the initial state, keeping the reader.
155 void Clear();
156
157 // Prints a directory of contents.
158 void Directory() const;
159
160 // Returns true if the component requested is present.
162 return !entries_[type].empty();
163 }
164 // Opens the given TFile pointer to the given component type.
165 // Returns false in case of failure.
166 bool GetComponent(TessdataType type, TFile *fp);
167 // As non-const version except it can't load the component if not already
168 // loaded.
169 bool GetComponent(TessdataType type, TFile *fp) const;
170
171 // Returns the current version string.
172 std::string VersionString() const;
173 // Sets the version string to the given v_str.
174 void SetVersionString(const std::string &v_str);
175
176 // Returns true if the base Tesseract components are present, i.e. when
// both the TESSDATA_UNICHARSET and the TESSDATA_INTTEMP entries are
// non-empty. Only inspects entries_; nothing is loaded by this call.
177 bool IsBaseAvailable() const {
178 return !entries_[TESSDATA_UNICHARSET].empty() &&
179 !entries_[TESSDATA_INTTEMP].empty();
180 }
181
182 // Returns true if the LSTM components are present, i.e. when the
// TESSDATA_LSTM entry is non-empty. The other LSTM-related entries are not
// checked here.
183 bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); }
184
185 // Returns the name of the underlying data file, as a const reference to
// the data_file_name_ member (the name of the file the data came from).
186 const STRING &GetDataFileName() const { return data_file_name_; }
187
193 bool CombineDataFiles(const char *language_data_path_prefix,
194 const char *output_filename);
195
201 bool OverwriteComponents(const char *new_traineddata_filename,
202 char **component_filenames,
203 int num_new_components);
204
215 bool ExtractToFile(const char *filename);
216
217 private:
218
219 // Use libarchive.
220 bool LoadArchiveFile(const char *filename);
221
228 static bool TessdataTypeFromFileSuffix(const char *suffix,
229 TessdataType *type);
230
235 static bool TessdataTypeFromFileName(const char *filename,
236 TessdataType *type);
237
238 // Name of file it came from.
239 STRING data_file_name_;
240 // Function to load the file when we need it.
241 FileReader reader_;
242 // True if the file has been loaded.
243 bool is_loaded_;
244 // True if the bytes need swapping.
245 bool swap_;
246 // Contents of each element of the traineddata file.
248};
249
250} // namespace tesseract
251
252#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
bool(*)(const STRING &, GenericVector< char > *) FileReader
Definition: serialis.h:49
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_CUBE_SYSTEM_DAWG
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_CUBE_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_LSTM_RECODER
@ TESSDATA_LANG_CONFIG
@ TESSDATA_LSTM_NUMBER_DAWG
@ TESSDATA_NUM_ENTRIES
@ TESSDATA_SHAPE_TABLE
@ TESSDATA_FIXED_LENGTH_DAWGS
@ TESSDATA_SYSTEM_DAWG
bool(*)(const GenericVector< char > &, const STRING &) FileWriter
Definition: serialis.h:52
bool empty() const
Definition: genericvector.h:91
Definition: strngs.h:45
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
const STRING & GetDataFileName() const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void Serialize(GenericVector< char > *data) const
bool ExtractToFile(const char *filename)
void LoadFileLater(const char *data_file_name)
bool SaveFile(const STRING &filename, FileWriter writer) const
bool IsComponentAvailable(TessdataType type) const
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Init(const char *data_file_name)