tesseract 4.1.1
unicharcompress.cpp
///////////////////////////////////////////////////////////////////////
// File:        unicharcompress.cpp
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unicharcompress.h"
#include <algorithm>
#include <memory>
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
static const char* kNullChar = "<nul>";
// Radix to make unique values from the stored radical codes.
const int kRadicalRadix = 29;

33// "Hash" function for const std::vector<int> computes the sum of elements.
34// Build a unique number for each code sequence that we can use as the index in
35// a hash map of ints instead of trying to hash the vectors.
36static int RadicalPreHash(const std::vector<int>& rs) {
37 size_t result = 0;
38 for (int radical : rs) {
39 result *= kRadicalRadix;
40 result += radical;
41 }
42 return result;
43}
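
// Worked example: for the radical sequence {1, 2, 3} the pre-hash is
// ((1 * 29 + 2) * 29 + 3) == 902, so distinct short sequences get distinct
// keys unless they happen to collide in base-29 arithmetic.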

// A hash map to convert unicodes to radical encoding.
using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
// A hash map to count occurrences of each radical encoding.
using RSCounts = std::unordered_map<int, int>;

// Decodes a single line of the radical-stroke table into radical_map.
// Comment and blank lines are skipped; returns false on a parse error.
static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
  if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
    return true;
  GenericVector<STRING> entries;
  radical_data_line->split(' ', &entries);
  if (entries.size() < 2) return false;
  char* end = nullptr;
  int unicode = strtol(&entries[0][0], &end, 10);
  if (*end != '\0') return false;
  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
  for (int i = 1; i < entries.size(); ++i) {
    int radical = strtol(&entries[i][0], &end, 10);
    if (*end != '\0') return false;
    radicals->push_back(radical);
  }
  (*radical_map)[unicode] = std::move(radicals);
  return true;
}
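
// For example, a table line of the (hypothetical) form "20013 86 3" maps
// unicode 20013 to the radical code sequence {86, 3}; empty lines and lines
// starting with '#' are ignored.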

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
  GenericVector<STRING> lines;
  radical_data->split('\n', &lines);
  for (int i = 0; i < lines.size(); ++i) {
    if (!DecodeRadicalLine(&lines[i], radical_map)) {
      tprintf("Invalid format in radical table at line %d: %s\n", i,
              lines[i].string());
      return false;
    }
  }
  return true;
}

UnicharCompress::UnicharCompress() : code_range_(0) {}
UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; }
UnicharCompress::~UnicharCompress() { Cleanup(); }
UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                                      STRING* radical_stroke_table) {
  RSMap radical_map;
  if (radical_stroke_table != nullptr &&
      !DecodeRadicalTable(radical_stroke_table, &radical_map))
    return false;
  encoder_.clear();
  UNICHARSET direct_set;
  // To avoid unused codes, clear the special codes from the direct_set.
  direct_set.clear();
  // Always keep space as 0;
  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  for (int u = 0; u <= unicharset.size(); ++u) {
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size() && u != null_id) break;  // Finished.
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    std::string cleaned;
    if (u < unicharset.size())
      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
    if (u < unicharset.size() &&
        (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Use the radical codes directly.
        int num_radicals = it->second->size();
        for (int c = 0; c < num_radicals; ++c) {
          code.Set(c, han_offset + (*it->second)[c]);
        }
        int pre_hash = RadicalPreHash(*it->second);
        int num_samples = radical_counts[pre_hash]++;
        if (num_samples > 0)
          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.length() == 0) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0);  // Space.
      } else if (u == null_id || (unicharset.has_special_codes() &&
                                  u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int uni : unicodes) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s is too long to encode!!\n", u,
                    unicharset.id_to_unichar(u));
            return false;
          }
          UNICHAR unichar(uni);
          char* utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8))
            direct_set.unichar_insert(utf8);
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() >
              unicharset.size() + !unicharset.has_special_codes()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  int code_offset = 0;
  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
    int max_offset = 0;
    for (int u = 0; u < unicharset.size(); ++u) {
      RecodedCharID* code = &encoder_[u];
      if (code->length() <= i) continue;
      max_offset = std::max(max_offset, (*code)(i) - han_offset);
      code->Set(i, (*code)(i) + code_offset);
    }
    if (max_offset == 0) break;
    code_offset += max_offset + 1;
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
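
// Illustration of the initial code layout built above, with a made-up
// unicharset of 100 unichars: direct (non-Han, non-Hangul) sequences use
// values in [0, 100), Hangul jamos use the next kLCount + kVCount + kTCount
// values, and Han radical/stroke/count codes start above that; the
// renumbering loop and DefragmentCodeValues then compact the used values.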

// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) {
  GenericVector<RecodedCharID> codes;
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  if (!unicharset.has_special_codes()) {
    RecodedCharID code;
    code.Set(0, unicharset.size());
    codes.push_back(code);
  }
  SetupDirect(codes);
}

// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const GenericVector<RecodedCharID>& codes) {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

// Renumbers codes to eliminate unused values.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that
  // not all numbers of strokes are used.
  ComputeCodeRange();
  GenericVector<int> offsets;
  offsets.init_to_size(code_range_, 0);
  // Find which codes are used.
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (int i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    if (offsets[i] == 0 || i == encoded_null) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end, for the benefit of TensorFlow,
    // which is offsets.size() + offsets.back().
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (int c = 0; c < encoder_.size(); ++c) {
    RecodedCharID* code = &encoder_[c];
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  ComputeCodeRange();
}
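
// Worked example: with code_range_ == 6, used values {0, 2, 5} and
// encoded_null == 2, value 0 keeps offset 0, value 5 gets offset -4 (becoming
// 1), and the null value is finally remapped to offsets.size() +
// offsets.back() == 2, i.e. to the end of the compacted range.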

// Encodes a single unichar_id. Returns the length of the code (or zero if
// the input is invalid) and sets *code to the encoding itself.
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
  *code = encoder_[unichar_id];
  return code->length();
}

// Decodes code, returning the original unichar-id, or
// INVALID_UNICHAR_ID if the input is invalid.
int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
  auto it = decoder_.find(code);
  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
  return it->second;
}

// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile* fp) const {
  return encoder_.SerializeClasses(fp);
}

// Reads from the given file. Returns false in case of error.
bool UnicharCompress::DeSerialize(TFile* fp) {
  if (!encoder_.DeSerializeClasses(fp)) return false;
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

// Returns a STRING containing a text file that describes the encoding thus:
// <index>[,<index>]*<tab><UTF8-str><newline>
// In words, a comma-separated list of one or more indices, followed by a tab
// and the UTF-8 string that the code represents per line. Most simple scripts
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
// and the Indic scripts will contain a many-to-many mapping.
// See the class comment above for details.
STRING UnicharCompress::GetEncodingAsString(
    const UNICHARSET& unicharset) const {
  STRING encoding;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding.add_str_int("", code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding.add_str_int(",", code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                   unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}
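
// For instance (with invented code values), a line "42\tA" describes a letter
// encoded by the single code 42, while "905,113,37\t好" describes a Han
// character encoded by a three-code radical/stroke sequence.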

// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
// Note that the returned values are 0-based indices, NOT unicode Jamo.
// Returns false if the input is not in the Hangul unicode range.
/* static */
bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
                                      int* trailing) {
  if (unicode < kFirstHangul) return false;
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) return false;
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}
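
// Worked example, assuming the standard Unicode jamo counts (kVCount == 21,
// kTCount == 28) and kFirstHangul == U+AC00: the syllable U+AC01 has offset 1,
// so it decomposes to leading 0, vowel 0, trailing 1 -- 0-based indices of
// the jamos, not their codepoints.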

// Computes the value of code_range_ from the encoder_.
void UnicharCompress::ComputeCodeRange() {
  code_range_ = -1;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      if (code(i) > code_range_) code_range_ = code(i);
    }
  }
  ++code_range_;
}

// Initializes the decoding hash_map from the encoding array.
void UnicharCompress::SetupDecoder() {
  Cleanup();
  is_valid_start_.init_to_size(code_range_, false);
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      auto* code_list = new GenericVectorEqEq<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          auto* code_list = new GenericVectorEqEq<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!next_it->second->contains(code(len)))
            next_it->second->push_back(code(len));
          break;  // This prefix has been processed.
        }
      }
    } else {
      if (!final_it->second->contains(code(len)))
        final_it->second->push_back(code(len));
    }
  }
}
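
// Worked example of the decoder tables: for the three codes (5), (9,2) and
// (9,7), is_valid_start_ is set for 5 and 9, final_codes_ maps the empty
// prefix to {5} and prefix (9) to {2, 7}, and next_codes_ maps the empty
// prefix to {9}; so a leading 9 needs a second code, while 5 already
// completes a unichar.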

// Frees allocated memory.
void UnicharCompress::Cleanup() {
  decoder_.clear();
  is_valid_start_.clear();
  for (auto& next_code : next_codes_) {
    delete next_code.second;
  }
  for (auto& final_code : final_codes_) {
    delete final_code.second;
  }
  next_codes_.clear();
  final_codes_.clear();
}

}  // namespace tesseract.
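
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of unicharcompress.cpp): given an
// already-loaded UNICHARSET, build an encoding and round-trip one unichar_id
// through EncodeUnichar/DecodeUnichar. The function name RoundTripExample,
// the choice of unicharset.size() as the null_id (explicitly allowed by
// ComputeEncoding), and the nullptr radical-stroke table (which disables the
// Han radical encoding) are assumptions made only for this example.

#include "unicharcompress.h"
#include "unicharset.h"

bool RoundTripExample(const UNICHARSET& unicharset, int unichar_id) {
  tesseract::UnicharCompress compress;
  // No radical-stroke table; null takes the code slot after the unicharset.
  if (!compress.ComputeEncoding(unicharset, unicharset.size(), nullptr))
    return false;
  tesseract::RecodedCharID code;
  if (compress.EncodeUnichar(unichar_id, &code) == 0) return false;
  // Special unichar_ids may share a code with the null character; ordinary
  // ids typically decode back to themselves.
  return compress.DecodeUnichar(code) == unichar_id;
}
// ---------------------------------------------------------------------------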