tesseract 4.1.1
Loading...
Searching...
No Matches
unicharcompress.h
Go to the documentation of this file.
1
2// File: unicharcompress.h
3// Description: Unicode re-encoding using a sequence of smaller numbers in
4// place of a single large code for CJK, similarly for Indic,
5// and dissection of ligatures for other scripts.
6// Author: Ray Smith
7// Created: Wed Mar 04 14:45:01 PST 2015
8//
9// (C) Copyright 2015, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
23#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
24
25#include <unordered_map>
26
27#include "serialis.h"
28#include "strngs.h"
29#include "unicharset.h"
30
31namespace tesseract {
32
33// Trivial class to hold the code for a recoded unichar-id.
35 public:
36 // The maximum length of a code.
37 static const int kMaxCodeLen = 9;
38
39 RecodedCharID() : self_normalized_(1), length_(0) {
40 memset(code_, 0, sizeof(code_));
41 }
42 void Truncate(int length) { length_ = length; }
43 // Sets the code value at the given index in the code.
44 void Set(int index, int value) {
45 code_[index] = value;
46 if (length_ <= index) length_ = index + 1;
47 }
48 // Shorthand for setting codes of length 3, as all Hangul and Han codes are
49 // length 3.
50 void Set3(int code0, int code1, int code2) {
51 length_ = 3;
52 code_[0] = code0;
53 code_[1] = code1;
54 code_[2] = code2;
55 }
56 // Accessors
57 int length() const { return length_; }
58 int operator()(int index) const { return code_[index]; }
59
60 // Writes to the given file. Returns false in case of error.
61 bool Serialize(TFile* fp) const {
62 return fp->Serialize(&self_normalized_) &&
63 fp->Serialize(&length_) &&
64 fp->Serialize(&code_[0], length_);
65 }
66 // Reads from the given file. Returns false in case of error.
67 bool DeSerialize(TFile* fp) {
68 return fp->DeSerialize(&self_normalized_) &&
69 fp->DeSerialize(&length_) &&
70 fp->DeSerialize(&code_[0], length_);
71 }
72 bool operator==(const RecodedCharID& other) const {
73 if (length_ != other.length_) return false;
74 for (int i = 0; i < length_; ++i) {
75 if (code_[i] != other.code_[i]) return false;
76 }
77 return true;
78 }
79 // Hash functor for RecodedCharID.
81 uint64_t operator()(const RecodedCharID& code) const {
82 uint64_t result = 0;
83 for (int i = 0; i < code.length_; ++i) {
84 result ^= static_cast<uint64_t>(code(i)) << (7 * i);
85 }
86 return result;
87 }
88 };
89
90 private:
91 // True if this code is self-normalizing, ie is the master entry for indices
92 // that map to the same code. Has boolean value, but int8_t for serialization.
93 int8_t self_normalized_;
94 // The number of elements in use in code_;
95 int32_t length_;
96 // The re-encoded form of the unichar-id to which this RecodedCharID relates.
97 int32_t code_[kMaxCodeLen];
98};
99
100// Class holds a "compression" of a unicharset to simplify the learning problem
101// for a neural-network-based classifier.
102// Objectives:
103// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
104// a sequence of 3 codes with much fewer values.
105// This is achieved using the Jamo coding for Hangul and the Unicode
106// Radical-Stroke-index for Han.
107// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
108// as the unicode sequence (but coded in a more compact space).
109// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
110// and not significantly distinct shapes (quotes) together, ie
111// represent the fi ligature as the f-i pair, and fold u+2019 and
112// friends all onto ascii single '
113// 4 The null character and mapping to target activations:
114// To save horizontal coding space, the compressed codes are generally mapped
115// to target network activations without intervening null characters, BUT
116// in the case of ligatures, such as ff, null characters have to be included
117// so existence of repeated codes is detected at codebook-building time, and
118// null characters are embedded directly into the codes, so the rest of the
119// system doesn't need to worry about the problem (much). There is still an
120// effect on the range of ways in which the target activations can be
121// generated.
122//
123// The computed code values are compact (no unused values), and, for CJK,
124// unique (each code position uses a disjoint set of values from each other code
125// position). For non-CJK, the same code value CAN be used in multiple
126// positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
127// is the same code as is used for the single f.
129 public:
134
135 // The 1st Hangul unicode.
136 static const int kFirstHangul = 0xac00;
137 // The number of Hangul unicodes.
138 static const int kNumHangul = 11172;
139 // The number of Jamos for each of the 3 parts of a Hangul character, being
140 // the Leading consonant, Vowel and Trailing consonant.
141 static const int kLCount = 19;
142 static const int kVCount = 21;
143 static const int kTCount = 28;
144
145 // Computes the encoding for the given unicharset. It is a requirement that
146 // the file training/langdata/radical-stroke.txt have been read into the
147 // input string radical_stroke_table.
148 // Returns false if the encoding cannot be constructed.
149 bool ComputeEncoding(const UNICHARSET& unicharset, int null_id,
150 STRING* radical_stroke_table);
151 // Sets up an encoder that doesn't change the unichars at all, so it just
152 // passes them through unchanged.
153 void SetupPassThrough(const UNICHARSET& unicharset);
154 // Sets up an encoder directly using the given encoding vector, which maps
155 // unichar_ids to the given codes.
156 void SetupDirect(const GenericVector<RecodedCharID>& codes);
157
158 // Returns the number of different values that can be used in a code, ie
159 // 1 + the maximum value that will ever be used by an RecodedCharID code in
160 // any position in its array.
161 int code_range() const { return code_range_; }
162
163 // Encodes a single unichar_id. Returns the length of the code, (or zero if
164 // invalid input), and the encoding itself in code.
165 int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
166 // Decodes code, returning the original unichar-id, or
167 // INVALID_UNICHAR_ID if the input is invalid.
168 int DecodeUnichar(const RecodedCharID& code) const;
169 // Returns true if the given code is a valid start or single code.
170 bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
171 // Returns a list of valid non-final next codes for a given prefix code,
172 // which may be empty.
174 auto it = next_codes_.find(code);
175 return it == next_codes_.end() ? nullptr : it->second;
176 }
177 // Returns a list of valid final codes for a given prefix code, which may
178 // be empty.
180 auto it = final_codes_.find(code);
181 return it == final_codes_.end() ? nullptr : it->second;
182 }
183
184 // Writes to the given file. Returns false in case of error.
185 bool Serialize(TFile* fp) const;
186 // Reads from the given file. Returns false in case of error.
187
188 bool DeSerialize(TFile* fp);
189
190 // Returns a STRING containing a text file that describes the encoding thus:
191 // <index>[,<index>]*<tab><UTF8-str><newline>
192 // In words, a comma-separated list of one or more indices, followed by a tab
193 // and the UTF-8 string that the code represents per line. Most simple scripts
194 // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
195 // and the Indic scripts will contain a many-to-many mapping.
196 // See the class comment above for details.
197 STRING GetEncodingAsString(const UNICHARSET& unicharset) const;
198
199 // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
200 // Note that the returned values are 0-based indices, NOT unicode Jamo.
201 // Returns false if the input is not in the Hangul unicode range.
202 static bool DecomposeHangul(int unicode, int* leading, int* vowel,
203 int* trailing);
204
205 private:
206 // Renumbers codes to eliminate unused values.
207 void DefragmentCodeValues(int encoded_null);
208 // Computes the value of code_range_ from the encoder_.
209 void ComputeCodeRange();
210 // Initializes the decoding hash_map from the encoder_ array.
211 void SetupDecoder();
212 // Frees allocated memory.
213 void Cleanup();
214
215 // The encoder that maps a unichar-id to a sequence of small codes.
216 // encoder_ is the only part that is serialized. The rest is computed on load.
218 // Decoder converts the output of encoder back to a unichar-id.
219 std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash>
220 decoder_;
221 // True if the index is a valid single or start code.
222 GenericVector<bool> is_valid_start_;
223 // Maps a prefix code to a list of valid next codes.
224 // The map owns the vectors.
225 std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
227 next_codes_;
228 // Maps a prefix code to a list of valid final codes.
229 // The map owns the vectors.
230 std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
232 final_codes_;
233 // Max of any value in encoder_ + 1.
234 int code_range_;
235};
236
237} // namespace tesseract.
238
239#endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:104
Definition: strngs.h:45
bool DeSerialize(TFile *fp)
bool Serialize(TFile *fp) const
void Truncate(int length)
void Set(int index, int value)
static const int kMaxCodeLen
int operator()(int index) const
void Set3(int code0, int code1, int code2)
bool operator==(const RecodedCharID &other) const
uint64_t operator()(const RecodedCharID &code) const
void SetupDirect(const GenericVector< RecodedCharID > &codes)
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
UnicharCompress & operator=(const UnicharCompress &src)
static const int kFirstHangul
bool IsValidFirstCode(int code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
const GenericVector< int > * GetFinalCodes(const RecodedCharID &code) const
void SetupPassThrough(const UNICHARSET &unicharset)
const GenericVector< int > * GetNextCodes(const RecodedCharID &code) const
int DecodeUnichar(const RecodedCharID &code) const
bool Serialize(TFile *fp) const