tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()=default
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Detailed Description

Definition at line 126 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 42 of file tessdatamanager.cpp.

42 : reader_(nullptr), is_loaded_(false), swap_(false) {
43 SetVersionString(PACKAGE_VERSION);
44}
void SetVersionString(const std::string &v_str)

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 46 of file tessdatamanager.cpp.

47 : reader_(reader),
48 is_loaded_(false),
49 swap_(false) {
50 SetVersionString(PACKAGE_VERSION);
51}

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
default

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 194 of file tessdatamanager.cpp.

194 {
195 for (auto& entry : entries_) {
196 entry.clear();
197 }
198 is_loaded_ = false;
199}

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 244 of file tessdatamanager.cpp.

246 {
247 // Load individual tessdata components from files.
248 for (auto filesuffix : kTessdataFileSuffixes) {
249 TessdataType type;
250 ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
251 STRING filename = language_data_path_prefix;
252 filename += filesuffix;
253 FILE *fp = fopen(filename.string(), "rb");
254 if (fp != nullptr) {
255 fclose(fp);
256 if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
257 tprintf("Load of file %s failed!\n", filename.string());
258 return false;
259 }
260 }
261 }
262 is_loaded_ = true;
263
264 // Make sure that the required components are present.
265 if (!IsBaseAvailable() && !IsLSTMAvailable()) {
266 tprintf(
267 "Error: traineddata file must contain at least (a unicharset file"
268 "and inttemp) OR an lstm file.\n");
269 return false;
270 }
271 // Write updated data to the output traineddata file.
272 return SaveFile(output_filename, nullptr);
273}
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194
bool SaveFile(const STRING &filename, FileWriter writer) const

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 202 of file tessdatamanager.cpp.

202 {
203 tprintf("Version string:%s\n", VersionString().c_str());
204 int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
205 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206 if (!entries_[i].empty()) {
207 tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
208 entries_[i].size(), offset);
209 offset += entries_[i].size();
210 }
211 }
212}
@ TESSDATA_NUM_ENTRIES
int size() const
Definition: genericvector.h:72
std::string VersionString() const

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 295 of file tessdatamanager.cpp.

295 {
296 TessdataType type = TESSDATA_NUM_ENTRIES;
297 ASSERT_HOST(
298 tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
299 if (entries_[type].empty()) return false;
300 return SaveDataToFile(entries_[type], filename);
301}
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 216 of file tessdatamanager.cpp.

216 {
217 if (!is_loaded_ && !Init(data_file_name_.string())) return false;
218 const TessdataManager *const_this = this;
219 return const_this->GetComponent(type, fp);
220}
bool Init(const char *data_file_name)

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 224 of file tessdatamanager.cpp.

224 {
225 ASSERT_HOST(is_loaded_);
226 if (entries_[type].empty()) return false;
227 fp->Open(&entries_[type][0], entries_[type].size());
228 fp->set_swap(swap_);
229 return true;
230}

◆ GetDataFileName()

const STRING & tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 186 of file tessdatamanager.h.

186{ return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 97 of file tessdatamanager.cpp.

97 {
98 GenericVector<char> data;
99 if (reader_ == nullptr) {
100#if defined(HAVE_LIBARCHIVE)
101 if (LoadArchiveFile(data_file_name)) return true;
102#endif
103 if (!LoadDataFromFile(data_file_name, &data)) return false;
104 } else {
105 if (!(*reader_)(data_file_name, &data)) return false;
106 }
107 return LoadMemBuffer(data_file_name, &data[0], data.size());
108}
bool LoadMemBuffer(const char *name, const char *data, int size)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134{ return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 177 of file tessdatamanager.h.

177 {
178 return !entries_[TESSDATA_UNICHARSET].empty() &&
179 !entries_[TESSDATA_INTTEMP].empty();
180 }
bool empty() const
Definition: genericvector.h:91

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 161 of file tessdatamanager.h.

161 {
162 return !entries_[type].empty();
163 }

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 183 of file tessdatamanager.h.

183{ return !entries_[TESSDATA_LSTM].empty(); }

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 55 of file tessdatamanager.cpp.

55 {
56 Clear();
57 data_file_name_ = data_file_name;
58}

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 111 of file tessdatamanager.cpp.

112 {
113 // TODO: This method supports only the proprietary file format.
114 Clear();
115 data_file_name_ = name;
116 TFile fp;
117 fp.Open(data, size);
118 uint32_t num_entries;
119 if (!fp.DeSerialize(&num_entries)) return false;
120 swap_ = num_entries > kMaxNumTessdataEntries;
121 fp.set_swap(swap_);
122 if (swap_) ReverseN(&num_entries, sizeof(num_entries));
123 if (num_entries > kMaxNumTessdataEntries) return false;
124 GenericVector<int64_t> offset_table;
125 offset_table.resize_no_init(num_entries);
126 if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
127 for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
128 if (offset_table[i] >= 0) {
129 int64_t entry_size = size - offset_table[i];
130 unsigned j = i + 1;
131 while (j < num_entries && offset_table[j] == -1) ++j;
132 if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
133 entries_[i].resize_no_init(entry_size);
134 if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false;
135 }
136 }
137 if (entries_[TESSDATA_VERSION].empty()) {
138 SetVersionString("Pre-4.0.0");
139 }
140 is_loaded_ = true;
141 return true;
142}
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
void resize_no_init(int size)
Definition: genericvector.h:66

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 275 of file tessdatamanager.cpp.

278 {
279 // Open the files with the new components.
280 // TODO: This method supports only the proprietary file format.
281 for (int i = 0; i < num_new_components; ++i) {
282 TessdataType type;
283 if (TessdataTypeFromFileName(component_filenames[i], &type)) {
284 if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
285 tprintf("Failed to read component file:%s\n", component_filenames[i]);
286 return false;
287 }
288 }
289 }
290
291 // Write updated data to the output traineddata file.
292 return SaveFile(new_traineddata_filename, nullptr);
293}

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 145 of file tessdatamanager.cpp.

146 {
147 is_loaded_ = true;
148 entries_[type].resize_no_init(size);
149 memcpy(&entries_[type][0], data, size);
150}

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING filename,
FileWriter  writer 
) const

Definition at line 153 of file tessdatamanager.cpp.

154 {
155 // TODO: This method supports only the proprietary file format.
156 ASSERT_HOST(is_loaded_);
157 GenericVector<char> data;
158 Serialize(&data);
159 if (writer == nullptr)
160 return SaveDataToFile(data, filename.c_str());
161 else
162 return (*writer)(data, filename.c_str());
163}
void Serialize(GenericVector< char > *data) const

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 166 of file tessdatamanager.cpp.

166 {
167 // TODO: This method supports only the proprietary file format.
168 ASSERT_HOST(is_loaded_);
169 // Compute the offset_table and total size.
170 int64_t offset_table[TESSDATA_NUM_ENTRIES];
171 int64_t offset = sizeof(int32_t) + sizeof(offset_table);
172 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
173 if (entries_[i].empty()) {
174 offset_table[i] = -1;
175 } else {
176 offset_table[i] = offset;
177 offset += entries_[i].size();
178 }
179 }
180 data->init_to_size(offset, 0);
181 int32_t num_entries = TESSDATA_NUM_ENTRIES;
182 TFile fp;
183 fp.OpenWrite(data);
184 fp.Serialize(&num_entries);
185 fp.Serialize(&offset_table[0], countof(offset_table));
186 for (const auto& entry : entries_) {
187 if (!entry.empty()) {
188 fp.Serialize(&entry[0], entry.size());
189 }
190 }
191}
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:43
void init_to_size(int size, const T &t)

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 239 of file tessdatamanager.cpp.

239 {
240 entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
241 memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
242}

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 133 of file tessdatamanager.h.

133{ return swap_; }

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 233 of file tessdatamanager.cpp.

233 {
234 return std::string(&entries_[TESSDATA_VERSION][0],
235 entries_[TESSDATA_VERSION].size());
236}

The documentation for this class was generated from the following files: