tesseract 4.1.1
Loading...
Searching...
No Matches
lang_model_helpers.cpp
Go to the documentation of this file.
1// Copyright 2017 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3// Purpose: Collection of convenience functions to simplify creation of the
4// unicharset, recoder, and dawgs for an LSTM model.
5
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9// http://www.apache.org/licenses/LICENSE-2.0
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15#include "lang_model_helpers.h"
16
17#if defined(_WIN32)
18#include <direct.h>
19#endif
20#include <sys/stat.h>
21#include <sys/types.h>
22#include <cstdlib>
23#include "dawg.h"
24#include "fileio.h"
25#include "tessdatamanager.h"
26#include "trie.h"
27#include "unicharcompress.h"
28
29namespace tesseract {
30
31// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data
32// to the file, using writer if not null, otherwise, a default writer.
33// Default writer will overwrite any existing file, but a supplied writer
34// can do its own thing. If lang is empty, returns true but does nothing.
35// NOTE that suffix should contain any required . for the filename.
36bool WriteFile(const std::string& output_dir, const std::string& lang,
37 const std::string& suffix, const GenericVector<char>& data,
38 FileWriter writer) {
39 if (lang.empty()) return true;
40 std::string dirname = output_dir + "/" + lang;
41 // Attempt to make the directory, but ignore errors, as it may not be a
42 // standard filesystem, and the writer will complain if not successful.
43#if defined(_WIN32)
44 _mkdir(dirname.c_str());
45#else
46 mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
47#endif
48 std::string filename = dirname + "/" + lang + suffix;
49 if (writer == nullptr)
50 return SaveDataToFile(data, filename.c_str());
51 else
52 return (*writer)(data, filename.c_str());
53}
54
55// Helper reads a file with optional reader and returns a STRING.
56// On failure emits a warning message and returns and empty STRING.
57STRING ReadFile(const std::string& filename, FileReader reader) {
58 if (filename.empty()) return STRING();
60 bool read_result;
61 if (reader == nullptr)
62 read_result = LoadDataFromFile(filename.c_str(), &data);
63 else
64 read_result = (*reader)(filename.c_str(), &data);
65 if (read_result) return STRING(&data[0], data.size());
66 tprintf("Failed to read data from: %s\n", filename.c_str());
67 return STRING();
68}
69
70// Helper writes the unicharset to file and to the traineddata.
71bool WriteUnicharset(const UNICHARSET& unicharset, const std::string& output_dir,
72 const std::string& lang, FileWriter writer,
73 TessdataManager* traineddata) {
74 GenericVector<char> unicharset_data;
75 TFile fp;
76 fp.OpenWrite(&unicharset_data);
77 if (!unicharset.save_to_file(&fp)) return false;
78 traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],
79 unicharset_data.size());
80 return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer);
81}
82
83// Helper creates the recoder and writes it to the traineddata, and a human-
84// readable form to file.
85bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
86 const std::string& output_dir, const std::string& lang,
87 FileWriter writer, STRING* radical_table_data,
88 TessdataManager* traineddata) {
89 UnicharCompress recoder;
90 // Where the unicharset is carefully setup already to contain a good
91 // compact encoding, use a pass-through recoder that does nothing.
92 // For scripts that have a large number of unicodes (Han, Hangul) we want
93 // to use the recoder to compress the symbol space by re-encoding each
94 // unicode as multiple codes from a smaller 'alphabet' that are related to the
95 // shapes in the character. Hangul Jamo is a perfect example of this.
96 // See the Hangul Syllables section, sub-section "Equivalence" in:
97 // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf
98 if (pass_through) {
99 recoder.SetupPassThrough(unicharset);
100 } else {
101 int null_char =
102 unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size();
103 tprintf("Null char=%d\n", null_char);
104 if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {
105 tprintf("Creation of encoded unicharset failed!!\n");
106 return false;
107 }
108 }
109 TFile fp;
110 GenericVector<char> recoder_data;
111 fp.OpenWrite(&recoder_data);
112 if (!recoder.Serialize(&fp)) return false;
113 traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0],
114 recoder_data.size());
115 STRING encoding = recoder.GetEncodingAsString(unicharset);
116 recoder_data.init_to_size(encoding.length(), 0);
117 memcpy(&recoder_data[0], &encoding[0], encoding.length());
118 STRING suffix;
119 suffix.add_str_int(".charset_size=", recoder.code_range());
120 suffix += ".txt";
121 return WriteFile(output_dir, lang, suffix.string(), recoder_data, writer);
122}
123
124// Helper builds a dawg from the given words, using the unicharset as coding,
125// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
126static bool WriteDawg(const GenericVector<STRING>& words,
127 const UNICHARSET& unicharset,
128 Trie::RTLReversePolicy reverse_policy,
129 TessdataType file_type, TessdataManager* traineddata) {
130 // The first 3 arguments are not used in this case.
131 Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0);
132 trie.add_word_list(words, unicharset, reverse_policy);
133 tprintf("Reducing Trie to SquishedDawg\n");
134 std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
135 if (dawg == nullptr || dawg->NumEdges() == 0) return false;
136 TFile fp;
137 GenericVector<char> dawg_data;
138 fp.OpenWrite(&dawg_data);
139 if (!dawg->write_squished_dawg(&fp)) return false;
140 traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
141 return true;
142}
143
144// Builds and writes the dawgs, given a set of words, punctuation
145// patterns, number patterns, to the traineddata. Encoding uses the given
146// unicharset, and the punc dawgs is reversed if lang_is_rtl.
147static bool WriteDawgs(const GenericVector<STRING>& words,
148 const GenericVector<STRING>& puncs,
149 const GenericVector<STRING>& numbers, bool lang_is_rtl,
150 const UNICHARSET& unicharset,
151 TessdataManager* traineddata) {
152 if (puncs.empty()) {
153 tprintf("Must have non-empty puncs list to use language models!!\n");
154 return false;
155 }
156 // For each of the dawg types, make the dawg, and write to traineddata.
157 // Dawgs are reversed as follows:
158 // Words: According to the word content.
159 // Puncs: According to lang_is_rtl.
160 // Numbers: Never.
161 // System dawg (main wordlist).
162 if (!words.empty() &&
163 !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
164 TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {
165 return false;
166 }
167 // punc/punc-dawg.
168 Trie::RTLReversePolicy reverse_policy =
170 if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG,
171 traineddata)) {
172 return false;
173 }
174 // numbers/number-dawg.
175 if (!numbers.empty() &&
176 !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,
177 TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {
178 return false;
179 }
180 return true;
181}
182
183// The main function for combine_lang_model.cpp.
184// Returns EXIT_SUCCESS or EXIT_FAILURE for error.
185int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
186 const std::string& version_str, const std::string& output_dir,
187 const std::string& lang, bool pass_through_recoder,
188 const GenericVector<STRING>& words,
189 const GenericVector<STRING>& puncs,
190 const GenericVector<STRING>& numbers, bool lang_is_rtl,
191 FileReader reader, FileWriter writer) {
192 // Build the traineddata file.
193 TessdataManager traineddata;
194 if (!version_str.empty()) {
195 traineddata.SetVersionString(traineddata.VersionString() + ":" +
196 version_str);
197 }
198 // Unicharset and recoder.
199 if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
200 tprintf("Error writing unicharset!!\n");
201 return EXIT_FAILURE;
202 } else {
203 tprintf("Config file is optional, continuing...\n");
204 }
205 // If there is a config file, read it and add to traineddata.
206 std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
207 STRING config_file = ReadFile(config_filename, reader);
208 if (config_file.length() > 0) {
209 traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0],
210 config_file.length());
211 }
212 std::string radical_filename = script_dir + "/radical-stroke.txt";
213 STRING radical_data = ReadFile(radical_filename, reader);
214 if (radical_data.length() == 0) {
215 tprintf("Error reading radical code table %s\n", radical_filename.c_str());
216 return EXIT_FAILURE;
217 }
218 if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer,
219 &radical_data, &traineddata)) {
220 tprintf("Error writing recoder!!\n");
221 }
222 if (!words.empty() || !puncs.empty() || !numbers.empty()) {
223 if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset,
224 &traineddata)) {
225 tprintf("Error during conversion of wordlists to DAWGs!!\n");
226 return EXIT_FAILURE;
227 }
228 }
229
230 // Traineddata file.
231 GenericVector<char> traineddata_data;
232 traineddata.Serialize(&traineddata_data);
233 if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) {
234 tprintf("Error writing output traineddata file!!\n");
235 return EXIT_FAILURE;
236 }
237 return EXIT_SUCCESS;
238}
239
240} // namespace tesseract
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ UNICHAR_BROKEN
Definition: unicharset.h:36
@ DAWG_TYPE_WORD
Definition: dawg.h:70
bool(*)(const STRING &, GenericVector< char > *) FileReader
Definition: serialis.h:49
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const GenericVector< STRING > &words, const GenericVector< STRING > &puncs, const GenericVector< STRING > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, const std::string &lang, FileWriter writer, STRING *radical_table_data, TessdataManager *traineddata)
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_RECODER
@ TESSDATA_LANG_CONFIG
@ TESSDATA_LSTM_NUMBER_DAWG
bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, const GenericVector< char > &data, FileWriter writer)
bool(*)(const GenericVector< char > &, const STRING &) FileWriter
Definition: serialis.h:52
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, const std::string &lang, FileWriter writer, TessdataManager *traineddata)
STRING ReadFile(const std::string &filename, FileReader reader)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void init_to_size(int size, const T &t)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:296
Definition: strngs.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
void SetVersionString(const std::string &v_str)
void Serialize(GenericVector< char > *data) const
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
void SetupPassThrough(const UNICHARSET &unicharset)
bool Serialize(TFile *fp) const
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
int size() const
Definition: unicharset.h:341
bool has_special_codes() const
Definition: unicharset.h:722
RTLReversePolicy
Definition: trie.h:58
@ RRP_DO_NO_REVERSE
Definition: trie.h:59
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:60
@ RRP_FORCE_REVERSE
Definition: trie.h:61