tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::UnicharCompress Class Reference

#include <unicharcompress.h>

Public Member Functions

 UnicharCompress ()
 
 UnicharCompress (const UnicharCompress &src)
 
 ~UnicharCompress ()
 
UnicharCompress & operator= (const UnicharCompress &src)
 
bool ComputeEncoding (const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
 
void SetupPassThrough (const UNICHARSET &unicharset)
 
void SetupDirect (const GenericVector< RecodedCharID > &codes)
 
int code_range () const
 
int EncodeUnichar (int unichar_id, RecodedCharID *code) const
 
int DecodeUnichar (const RecodedCharID &code) const
 
bool IsValidFirstCode (int code) const
 
const GenericVector< int > * GetNextCodes (const RecodedCharID &code) const
 
const GenericVector< int > * GetFinalCodes (const RecodedCharID &code) const
 
bool Serialize (TFile *fp) const
 
bool DeSerialize (TFile *fp)
 
STRING GetEncodingAsString (const UNICHARSET &unicharset) const
 

Static Public Member Functions

static bool DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)
 

Static Public Attributes

static const int kFirstHangul = 0xac00
 
static const int kNumHangul = 11172
 
static const int kLCount = 19
 
static const int kVCount = 21
 
static const int kTCount = 28
 

Detailed Description

Definition at line 128 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 86 of file unicharcompress.cpp.

86 : code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src)

Definition at line 87 of file unicharcompress.cpp.

87{ *this = src; }

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 88 of file unicharcompress.cpp.

88{ Cleanup(); }

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const
inline

Definition at line 161 of file unicharcompress.h.

161{ return code_range_; }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding ( const UNICHARSET & unicharset,
int  null_id,
STRING * radical_stroke_table 
)

Definition at line 101 of file unicharcompress.cpp.

102 {
103 RSMap radical_map;
104 if (radical_stroke_table != nullptr &&
105 !DecodeRadicalTable(radical_stroke_table, &radical_map))
106 return false;
107 encoder_.clear();
108 UNICHARSET direct_set;
109 // To avoid unused codes, clear the special codes from the direct_set.
110 direct_set.clear();
111 // Always keep space as 0;
112 direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
113 // Null char is next if we have one.
114 if (null_id >= 0) {
115 direct_set.unichar_insert(kNullChar);
116 }
117 RSCounts radical_counts;
118 // In the initial map, codes [0, unicharset.size()) are
119 // reserved for non-han/hangul sequences of 1 or more unicodes.
120 int hangul_offset = unicharset.size();
121 // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
122 const int kTotalJamos = kLCount + kVCount + kTCount;
123 // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
124 // to measure the number of radicals and strokes, initially we use the same
125 // code range for all 3 Han code positions, and fix them after.
126 int han_offset = hangul_offset + kTotalJamos;
127 for (int u = 0; u <= unicharset.size(); ++u) {
128 // We special-case allow null_id to be equal to unicharset.size() in case
129 // there is no space in unicharset for it.
130 if (u == unicharset.size() && u != null_id) break; // Finished
131 RecodedCharID code;
132 // Convert to unicodes.
133 std::vector<char32> unicodes;
134 std::string cleaned;
135 if (u < unicharset.size())
136 cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
137 if (u < unicharset.size() &&
138 (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
139 // Check single unicodes for Hangul/Han and encode if so.
140 int unicode = unicodes[0];
141 int leading, vowel, trailing;
142 auto it = radical_map.find(unicode);
143 if (it != radical_map.end()) {
144 // This is Han. Use the radical codes directly.
145 int num_radicals = it->second->size();
146 for (int c = 0; c < num_radicals; ++c) {
147 code.Set(c, han_offset + (*it->second)[c]);
148 }
149 int pre_hash = RadicalPreHash(*it->second);
150 int num_samples = radical_counts[pre_hash]++;
151 if (num_samples > 0)
152 code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
153 } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
154 // This is Hangul. Since we know the exact size of each part at compile
155 // time, it gets the bottom set of codes.
156 code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
157 trailing + kLCount + kVCount + hangul_offset);
158 }
159 }
160 // If the code is still empty, it wasn't Han or Hangul.
161 if (code.length() == 0) {
162 // Special cases.
163 if (u == UNICHAR_SPACE) {
164 code.Set(0, 0); // Space.
165 } else if (u == null_id || (unicharset.has_special_codes() &&
166 u < SPECIAL_UNICHAR_CODES_COUNT)) {
167 code.Set(0, direct_set.unichar_to_id(kNullChar));
168 } else {
169 // Add the direct_set unichar-ids of the unicodes in sequence to the
170 // code.
171 for (int uni : unicodes) {
172 int position = code.length();
173 if (position >= RecodedCharID::kMaxCodeLen) {
174 tprintf("Unichar %d=%s is too long to encode!!\n", u,
175 unicharset.id_to_unichar(u));
176 return false;
177 }
178 UNICHAR unichar(uni);
179 char* utf8 = unichar.utf8_str();
180 if (!direct_set.contains_unichar(utf8))
181 direct_set.unichar_insert(utf8);
182 code.Set(position, direct_set.unichar_to_id(utf8));
183 delete[] utf8;
184 if (direct_set.size() >
185 unicharset.size() + !unicharset.has_special_codes()) {
186 // Code space got bigger!
187 tprintf("Code space expanded from original unicharset!!\n");
188 return false;
189 }
190 }
191 }
192 }
193 encoder_.push_back(code);
194 }
195 // Now renumber Han to make all codes unique. We already added han_offset to
196 // all Han. Now separate out the radical, stroke, and count codes for Han.
197 int code_offset = 0;
198 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
199 int max_offset = 0;
200 for (int u = 0; u < unicharset.size(); ++u) {
201 RecodedCharID* code = &encoder_[u];
202 if (code->length() <= i) continue;
203 max_offset = std::max(max_offset, (*code)(i)-han_offset);
204 code->Set(i, (*code)(i) + code_offset);
205 }
206 if (max_offset == 0) break;
207 code_offset += max_offset + 1;
208 }
209 DefragmentCodeValues(null_id >= 0 ? 1 : -1);
210 SetupDecoder();
211 return true;
212}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
@ UNICHAR_SPACE
Definition: unicharset.h:34
const int kRadicalRadix
std::unordered_map< int, std::unique_ptr< std::vector< int > > > RSMap
std::unordered_map< int, int > RSCounts
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
static const int kMaxCodeLen
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
void clear()
Definition: unicharset.h:306
int size() const
Definition: unicharset.h:341
bool has_special_codes() const
Definition: unicharset.h:722
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code) const

Definition at line 291 of file unicharcompress.cpp.

291 {
292 int len = code.length();
293 if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
294 auto it = decoder_.find(code);
295 if (it == decoder_.end()) return INVALID_UNICHAR_ID;
296 return it->second;
297}

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul ( int  unicode,
int *  leading,
int *  vowel,
int *  trailing 
)
static

Definition at line 348 of file unicharcompress.cpp.

349 {
350 if (unicode < kFirstHangul) return false;
351 int offset = unicode - kFirstHangul;
352 if (offset >= kNumHangul) return false;
353 const int kNCount = kVCount * kTCount;
354 *leading = offset / kNCount;
355 *vowel = (offset % kNCount) / kTCount;
356 *trailing = offset % kTCount;
357 return true;
358}
static const int kFirstHangul

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp)

Definition at line 305 of file unicharcompress.cpp.

305 {
306 if (!encoder_.DeSerializeClasses(fp)) return false;
307 ComputeCodeRange();
308 SetupDecoder();
309 return true;
310}

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar ( int  unichar_id,
RecodedCharID * code 
) const

Definition at line 283 of file unicharcompress.cpp.

283 {
284 if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
285 *code = encoder_[unichar_id];
286 return code->length();
287}

◆ GetEncodingAsString()

STRING tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset) const

Definition at line 319 of file unicharcompress.cpp.

320 {
321 STRING encoding;
322 for (int c = 0; c < encoder_.size(); ++c) {
323 const RecodedCharID& code = encoder_[c];
324 if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
325 // Don't show the duplicate entry.
326 continue;
327 }
328 encoding.add_str_int("", code(0));
329 for (int i = 1; i < code.length(); ++i) {
330 encoding.add_str_int(",", code(i));
331 }
332 encoding += "\t";
333 if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
334 unicharset.has_special_codes())) {
335 encoding += kNullChar;
336 } else {
337 encoding += unicharset.id_to_unichar(c);
338 }
339 encoding += "\n";
340 }
341 return encoding;
342}
STRING
Definition: strngs.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377

◆ GetFinalCodes()

const GenericVector< int > * tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code) const
inline

Definition at line 179 of file unicharcompress.h.

179 {
180 auto it = final_codes_.find(code);
181 return it == final_codes_.end() ? nullptr : it->second;
182 }

◆ GetNextCodes()

const GenericVector< int > * tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code) const
inline

Definition at line 173 of file unicharcompress.h.

173 {
174 auto it = next_codes_.find(code);
175 return it == next_codes_.end() ? nullptr : it->second;
176 }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int  code) const
inline

Definition at line 170 of file unicharcompress.h.

170{ return is_valid_start_[code]; }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src)

Definition at line 89 of file unicharcompress.cpp.

89 {
90 Cleanup();
91 encoder_ = src.encoder_;
92 code_range_ = src.code_range_;
93 SetupDecoder();
94 return *this;
95}

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp) const

Definition at line 300 of file unicharcompress.cpp.

300 {
301 return encoder_.SerializeClasses(fp);
302}

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const GenericVector< RecodedCharID > &  codes)

Definition at line 233 of file unicharcompress.cpp.

233 {
234 encoder_ = codes;
235 ComputeCodeRange();
236 SetupDecoder();
237}

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset)

Definition at line 216 of file unicharcompress.cpp.

216 {
217 GenericVector<RecodedCharID> codes;
218 for (int u = 0; u < unicharset.size(); ++u) {
219 RecodedCharID code;
220 code.Set(0, u);
221 codes.push_back(code);
222 }
223 if (!unicharset.has_special_codes()) {
224 RecodedCharID code;
225 code.Set(0, unicharset.size());
226 codes.push_back(code);
227 }
228 SetupDirect(codes);
229}
int push_back(T object)
void SetupDirect(const GenericVector< RecodedCharID > &codes)

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00
static

Definition at line 136 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19
static

Definition at line 141 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172
static

Definition at line 138 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28
static

Definition at line 143 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21
static

Definition at line 142 of file unicharcompress.h.


The documentation for this class was generated from the following files: