tesseract 4.1.1
Loading...
Searching...
No Matches
UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0 , U_RIGHT_TO_LEFT = 1 , U_EUROPEAN_NUMBER = 2 , U_EUROPEAN_NUMBER_SEPARATOR = 3 ,
  U_EUROPEAN_NUMBER_TERMINATOR = 4 , U_ARABIC_NUMBER = 5 , U_COMMON_NUMBER_SEPARATOR = 6 , U_BLOCK_SEPARATOR = 7 ,
  U_SEGMENT_SEPARATOR = 8 , U_WHITE_SPACE_NEUTRAL = 9 , U_OTHER_NEUTRAL = 10 , U_LEFT_TO_RIGHT_EMBEDDING = 11 ,
  U_LEFT_TO_RIGHT_OVERRIDE = 12 , U_RIGHT_TO_LEFT_ARABIC = 13 , U_RIGHT_TO_LEFT_EMBEDDING = 14 , U_RIGHT_TO_LEFT_OVERRIDE = 15 ,
  U_POP_DIRECTIONAL_FORMAT = 16 , U_DIR_NON_SPACING_MARK = 17 , U_BOUNDARY_NEUTRAL = 18 , U_FIRST_STRONG_ISOLATE = 19 ,
  U_LEFT_TO_RIGHT_ISOLATE = 20 , U_RIGHT_TO_LEFT_ISOLATE = 21 , U_POP_DIRECTIONAL_ISOLATE = 22 , U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, int *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
STRING debug_str (UNICHAR_ID id) const
 
STRING debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)
 
void unichar_insert (const char *const unichar_repr)
 
void unichar_insert_backwards_compatible (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
int size () const
 
void reserve (int unichars_number)
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (STRING *str) const
 
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
 
bool load_from_inmemory_file (const char *const memory, int mem_size)
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const GenericVector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static STRING debug_utf8_str (const char *str)
 
static std::string CleanupString (const char *utf8_str)
 
static std::string CleanupString (const char *utf8_str, size_t length)
 

Static Public Attributes

static TESS_API const char * kCustomLigatures [][2]
 
static TESS_API const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 145 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_FIRST_STRONG_ISOLATE 
U_LEFT_TO_RIGHT_ISOLATE 
U_RIGHT_TO_LEFT_ISOLATE 
U_POP_DIRECTIONAL_ISOLATE 
U_CHAR_DIRECTION_COUNT 

Definition at line 156 of file unicharset.h.

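156 {
157 U_LEFT_TO_RIGHT = 0,
158 U_RIGHT_TO_LEFT = 1,
159 U_EUROPEAN_NUMBER = 2,
160 U_EUROPEAN_NUMBER_SEPARATOR = 3,
161 U_EUROPEAN_NUMBER_TERMINATOR = 4,
162 U_ARABIC_NUMBER = 5,
163 U_COMMON_NUMBER_SEPARATOR = 6,
164 U_BLOCK_SEPARATOR = 7,
165 U_SEGMENT_SEPARATOR = 8,
166 U_WHITE_SPACE_NEUTRAL = 9,
167 U_OTHER_NEUTRAL = 10,
168 U_LEFT_TO_RIGHT_EMBEDDING = 11,
169 U_LEFT_TO_RIGHT_OVERRIDE = 12,
170 U_RIGHT_TO_LEFT_ARABIC = 13,
171 U_RIGHT_TO_LEFT_EMBEDDING = 14,
172 U_RIGHT_TO_LEFT_OVERRIDE = 15,
173 U_POP_DIRECTIONAL_FORMAT = 16,
174 U_DIR_NON_SPACING_MARK = 17,
175 U_BOUNDARY_NEUTRAL = 18,
176 U_FIRST_STRONG_ISOLATE = 19,
177 U_LEFT_TO_RIGHT_ISOLATE = 20,
178 U_RIGHT_TO_LEFT_ISOLATE = 21,
179 U_POP_DIRECTIONAL_ISOLATE = 22,
180#ifndef U_HIDE_DEPRECATED_API
181 U_CHAR_DIRECTION_COUNT
182#endif // U_HIDE_DEPRECATED_API
183 };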
@ U_BOUNDARY_NEUTRAL
Definition: unicharset.h:175
@ U_POP_DIRECTIONAL_ISOLATE
Definition: unicharset.h:179
@ U_SEGMENT_SEPARATOR
Definition: unicharset.h:165
@ U_ARABIC_NUMBER
Definition: unicharset.h:162
@ U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
@ U_POP_DIRECTIONAL_FORMAT
Definition: unicharset.h:173
@ U_WHITE_SPACE_NEUTRAL
Definition: unicharset.h:166
@ U_OTHER_NEUTRAL
Definition: unicharset.h:167
@ U_FIRST_STRONG_ISOLATE
Definition: unicharset.h:176
@ U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
@ U_RIGHT_TO_LEFT_EMBEDDING
Definition: unicharset.h:171
@ U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
@ U_CHAR_DIRECTION_COUNT
Definition: unicharset.h:181
@ U_LEFT_TO_RIGHT_ISOLATE
Definition: unicharset.h:177
@ U_BLOCK_SEPARATOR
Definition: unicharset.h:164
@ U_EUROPEAN_NUMBER
Definition: unicharset.h:159
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
@ U_RIGHT_TO_LEFT_ISOLATE
Definition: unicharset.h:178
@ U_LEFT_TO_RIGHT_EMBEDDING
Definition: unicharset.h:168
@ U_LEFT_TO_RIGHT_OVERRIDE
Definition: unicharset.h:169
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:157
@ U_DIR_NON_SPACING_MARK
Definition: unicharset.h:174
@ U_RIGHT_TO_LEFT_OVERRIDE
Definition: unicharset.h:172

Constructor & Destructor Documentation

◆ UNICHARSET()

UNICHARSET::UNICHARSET ( )

Definition at line 176 of file unicharset.cpp.

176 :
177 unichars(nullptr),
178 ids(),
179 size_used(0),
180 size_reserved(0),
181 script_table(nullptr),
182 script_table_size_used(0) {
183 clear();
184 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
185 unichar_insert(kSpecialUnicharCodes[i]);
186 if (i == UNICHAR_JOINED)
187 set_isngram(i, true);
188 }
189}
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
@ UNICHAR_JOINED
Definition: unicharset.h:35
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:456
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
void clear()
Definition: unicharset.h:306

◆ ~UNICHARSET()

UNICHARSET::~UNICHARSET ( )

Definition at line 191 of file unicharset.cpp.

191 {
192 clear();
193}

Member Function Documentation

◆ add_script()

int UNICHARSET::add_script ( const char *  script)

Definition at line 1060 of file unicharset.cpp.

1060 {
1061 for (int i = 0; i < script_table_size_used; ++i) {
1062 if (strcmp(script, script_table[i]) == 0)
1063 return i;
1064 }
1065 if (script_table_size_reserved == 0) {
1066 script_table_size_reserved = 8;
1067 script_table = new char*[script_table_size_reserved];
1068 } else if (script_table_size_used >= script_table_size_reserved) {
1069 assert(script_table_size_used == script_table_size_reserved);
1070 script_table_size_reserved += script_table_size_reserved;
1071 char** new_script_table = new char*[script_table_size_reserved];
1072 memcpy(new_script_table, script_table,
1073 script_table_size_used * sizeof(char*));
1074 delete[] script_table;
1075 script_table = new_script_table;
1076 }
1077 script_table[script_table_size_used] = new char[strlen(script) + 1];
1078 strcpy(script_table[script_table_size_used], script);
1079 return script_table_size_used++;
1080}

◆ AnyRepeatedUnicodes()

bool UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1047 of file unicharset.cpp.

1047 {
1048 int start_id = 0;
1049 if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050 for (int id = start_id; id < size_used; ++id) {
1051 // Convert to unicodes.
1052 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1053 for (size_t u = 1; u < unicodes.size(); ++u) {
1054 if (unicodes[u - 1] == unicodes[u]) return true;
1055 }
1056 }
1057 return false;
1058}
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:828
bool has_special_codes() const
Definition: unicharset.h:722

◆ AppendOtherUnicharset()

void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 464 of file unicharset.cpp.

464 {
465 int initial_used = size_used;
466 for (int ch = 0; ch < src.size_used; ++ch) {
467 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
468 const char* utf8 = src.id_to_unichar(ch);
469 int id = size_used;
470 if (contains_unichar(utf8)) {
471 id = unichar_to_id(utf8);
472 // Just expand current ranges.
473 unichars[id].properties.ExpandRangesFrom(src_props);
474 } else {
475 unichar_insert_backwards_compatible(utf8);
476 unichars[id].properties.SetRangesEmpty();
477 }
478 }
479 // Set properties, including mirror and other_case, WITHOUT reordering
480 // the unicharset.
481 PartialSetPropertiesFromOther(initial_used, src);
482}
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:405
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210

◆ CleanupString() [1/2]

static std::string UNICHARSET::CleanupString ( const char *  utf8_str)
inlinestatic

Definition at line 246 of file unicharset.h.

246 {
247 return CleanupString(utf8_str, strlen(utf8_str));
248 }
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246

◆ CleanupString() [2/2]

std::string UNICHARSET::CleanupString ( const char *  utf8_str,
size_t  length 
)
static

Definition at line 1150 of file unicharset.cpp.

1150 {
1151 std::string result;
1152 result.reserve(length);
1153 char ch;
1154 while ((ch = *utf8_str) != '\0' && length-- > 0) {
1155 int key_index = 0;
1156 const char* key;
1157 while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1158 int match = 0;
1159 while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1160 if (key[match] == '\0') {
1161 utf8_str += match;
1162 break;
1163 }
1164 ++key_index;
1165 }
1166 if (key == nullptr) {
1167 result.push_back(ch);
1168 ++utf8_str;
1169 } else {
1170 result.append(kCleanupMaps[key_index][1]);
1171 }
1172 }
1173 return result;
1174}

◆ clear()

void UNICHARSET::clear ( )
inline

Definition at line 306 of file unicharset.h.

306 {
307 if (script_table != nullptr) {
308 for (int i = 0; i < script_table_size_used; ++i)
309 delete[] script_table[i];
310 delete[] script_table;
311 script_table = nullptr;
312 script_table_size_used = 0;
313 }
314 if (unichars != nullptr) {
315 delete_pointers_in_unichars();
316 delete[] unichars;
317 unichars = nullptr;
318 }
319 script_table_size_reserved = 0;
320 size_reserved = 0;
321 size_used = 0;
322 ids.clear();
323 top_bottom_set_ = false;
324 script_has_upper_lower_ = false;
325 script_has_xheight_ = false;
326 old_style_included_ = false;
327 null_sid_ = 0;
328 common_sid_ = 0;
329 latin_sid_ = 0;
330 cyrillic_sid_ = 0;
331 greek_sid_ = 0;
332 han_sid_ = 0;
333 hiragana_sid_ = 0;
334 katakana_sid_ = 0;
335 thai_sid_ = 0;
336 hangul_sid_ = 0;
337 default_sid_ = 0;
338 }
void clear()
Definition: unicharmap.cpp:115
void delete_pointers_in_unichars()
Definition: unicharset.h:298

◆ common_sid()

int UNICHARSET::common_sid ( ) const
inline

Definition at line 885 of file unicharset.h.

885{ return common_sid_; }

◆ contains_unichar() [1/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 671 of file unicharset.cpp.

671 {
672 std::string cleaned =
673 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
674 return ids.contains(cleaned.data(), cleaned.size());
675}
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79

◆ contains_unichar() [2/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 677 of file unicharset.cpp.

678 {
679 if (length == 0) {
680 return false;
681 }
682 std::string cleaned(unichar_repr, length);
683 if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
684 return ids.contains(cleaned.data(), cleaned.size());
685}

◆ contains_unichar_id()

bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 284 of file unicharset.h.

284 {
285 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
286 unichar_id >= 0;
287 }

◆ CopyFrom()

void UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 448 of file unicharset.cpp.

448 {
449 clear();
450 for (int ch = 0; ch < src.size_used; ++ch) {
451 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
452 const char* utf8 = src.id_to_unichar(ch);
453 unichar_insert_backwards_compatible(utf8);
454 unichars[ch].properties.ExpandRangesFrom(src_props);
455 }
456 // Set properties, including mirror and other_case, WITHOUT reordering
457 // the unicharset.
458 PartialSetPropertiesFromOther(0, src);
459}

◆ cyrillic_sid()

int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 887 of file unicharset.h.

887{ return cyrillic_sid_; }

◆ debug_str() [1/2]

STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 254 of file unicharset.h.

254 {
255 return debug_str(unichar_to_id(unichar_repr));
256 }
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343

◆ debug_str() [2/2]

STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 343 of file unicharset.cpp.

343 {
344 if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
345 const CHAR_FRAGMENT *fragment = this->get_fragment(id);
346 if (fragment) {
347 return fragment->to_string();
348 }
349 const char* str = id_to_unichar(id);
350 STRING result = debug_utf8_str(str);
351 // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
352 if (get_isalpha(id)) {
353 if (get_islower(id))
354 result += "a";
355 else if (get_isupper(id))
356 result += "A";
357 else
358 result += "x";
359 }
360 // Append 0 if a digit.
361 if (get_isdigit(id)) {
362 result += "0";
363 }
364 // Append p is a punctuation symbol.
365 if (get_ispunctuation(id)) {
366 result += "p";
367 }
368 return result;
369}
Definition: strngs.h:45
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:319
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498

◆ debug_utf8_str()

STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 319 of file unicharset.cpp.

319 {
320 STRING result = str;
321 result += " [";
322 int step = 1;
323 // Chop into unicodes and code each as hex.
324 for (int i = 0; str[i] != '\0'; i += step) {
325 char hex[sizeof(int) * 2 + 1];
326 step = UNICHAR::utf8_step(str + i);
327 if (step == 0) {
328 step = 1;
329 sprintf(hex, "%x", str[i]);
330 } else {
331 UNICHAR ch(str + i, step);
332 sprintf(hex, "%x", ch.first_uni());
333 }
334 result += hex;
335 result += " ";
336 }
337 result += "]";
338 return result;
339}
int step(const char *str) const
Definition: unicharset.cpp:233

◆ default_sid()

int UNICHARSET::default_sid ( ) const
inline

Definition at line 894 of file unicharset.h.

894{ return default_sid_; }

◆ delete_pointers_in_unichars()

void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 298 of file unicharset.h.

298 {
299 for (int i = 0; i < size_used; ++i) {
300 delete unichars[i].properties.fragment;
301 unichars[i].properties.fragment = nullptr;
302 }
303 }

◆ encodable_string()

bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 244 of file unicharset.cpp.

245 {
246 GenericVector<UNICHAR_ID> encoding;
247 return encode_string(str, true, &encoding, nullptr, first_bad_position);
248}
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259

◆ encode_string()

bool UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
GenericVector< UNICHAR_ID > *  encoding,
GenericVector< char > *  lengths,
int *  encoded_length 
) const

Definition at line 259 of file unicharset.cpp.

262 {
263 GenericVector<UNICHAR_ID> working_encoding;
264 GenericVector<char> working_lengths;
265 GenericVector<char> best_lengths;
266 encoding->truncate(0); // Just in case str is empty.
267 int str_length = strlen(str);
268 int str_pos = 0;
269 bool perfect = true;
270 while (str_pos < str_length) {
271 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
272 &str_pos, encoding, &best_lengths);
273 if (str_pos < str_length) {
274 // This is a non-match. Skip one utf-8 character.
275 perfect = false;
276 if (give_up_on_failure) break;
277 int step = UNICHAR::utf8_step(str + str_pos);
278 if (step == 0) step = 1;
279 encoding->push_back(INVALID_UNICHAR_ID);
280 best_lengths.push_back(step);
281 str_pos += step;
282 working_encoding = *encoding;
283 working_lengths = best_lengths;
284 }
285 }
286 if (lengths != nullptr) *lengths = best_lengths;
287 if (encoded_length != nullptr) *encoded_length = str_pos;
288 return perfect;
289}
int push_back(T object)
void truncate(int size)

◆ eq()

bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 687 of file unicharset.cpp.

688 {
689 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
690}

◆ ExpandRangesFromOther()

void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 435 of file unicharset.cpp.

435 {
436 for (int ch = 0; ch < size_used; ++ch) {
437 const char* utf8 = id_to_unichar(ch);
438 UNICHAR_PROPERTIES properties;
439 if (src.GetStrProperties(utf8, &properties)) {
440 // Expand just the ranges from properties.
441 unichars[ch].properties.ExpandRangesFrom(properties);
442 }
443 }
444}

◆ get_advance_stats()

void UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 630 of file unicharset.h.

631 {
632 if (INVALID_UNICHAR_ID == unichar_id) {
633 *advance = *advance_sd = 0;
634 return;
635 }
636 ASSERT_HOST(contains_unichar_id(unichar_id));
637 *advance = unichars[unichar_id].properties.advance;
638 *advance_sd = unichars[unichar_id].properties.advance_sd;
639 }
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284

◆ get_bearing_stats()

void UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 613 of file unicharset.h.

614 {
615 if (INVALID_UNICHAR_ID == unichar_id) {
616 *bearing = *bearing_sd = 0.0f;
617 return;
618 }
619 ASSERT_HOST(contains_unichar_id(unichar_id));
620 *bearing = unichars[unichar_id].properties.bearing;
621 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
622 }

◆ get_chartype() [1/2]

char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 771 of file unicharset.h.

771 {
772 return get_chartype(unichar_to_id(unichar_repr));
773 }
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:617

◆ get_chartype() [2/2]

char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 617 of file unicharset.cpp.

617 {
618 if (this->get_isupper(id)) return 'A';
619 if (this->get_islower(id)) return 'a';
620 if (this->get_isalpha(id)) return 'x';
621 if (this->get_isdigit(id)) return '0';
622 if (this->get_ispunctuation(id)) return 'p';
623 return 0;
624}

◆ get_direction()

Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 690 of file unicharset.h.

690 {
691 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
692 ASSERT_HOST(contains_unichar_id(unichar_id));
693 return unichars[unichar_id].properties.direction;
694 }

◆ get_enabled()

bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 878 of file unicharset.h.

878 {
879 ASSERT_HOST(contains_unichar_id(unichar_id));
880 return unichars[unichar_id].properties.enabled;
881 }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT * UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 784 of file unicharset.h.

784 {
785 if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
786 !ids.contains(unichar_repr, false)) {
787 return nullptr;
788 }
789 return get_fragment(unichar_to_id(unichar_repr));
790 }

◆ get_fragment() [2/2]

const CHAR_FRAGMENT * UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 734 of file unicharset.h.

734 {
735 if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
736 ASSERT_HOST(contains_unichar_id(unichar_id));
737 return unichars[unichar_id].properties.fragment;
738 }

◆ get_isalpha() [1/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 741 of file unicharset.h.

741 {
742 return get_isalpha(unichar_to_id(unichar_repr));
743 }

◆ get_isalpha() [2/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 794 of file unicharset.h.

795 {
796 return get_isalpha(unichar_to_id(unichar_repr, length));
797 }

◆ get_isalpha() [3/3]

bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 491 of file unicharset.h.

491 {
492 if (INVALID_UNICHAR_ID == unichar_id) return false;
493 ASSERT_HOST(contains_unichar_id(unichar_id));
494 return unichars[unichar_id].properties.isalpha;
495 }

◆ get_isdigit() [1/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 756 of file unicharset.h.

756 {
757 return get_isdigit(unichar_to_id(unichar_repr));
758 }

◆ get_isdigit() [2/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 815 of file unicharset.h.

816 {
817 return get_isdigit(unichar_to_id(unichar_repr, length));
818 }

◆ get_isdigit() [3/3]

bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 512 of file unicharset.h.

512 {
513 if (INVALID_UNICHAR_ID == unichar_id) return false;
514 ASSERT_HOST(contains_unichar_id(unichar_id));
515 return unichars[unichar_id].properties.isdigit;
516 }

◆ get_islower() [1/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 746 of file unicharset.h.

746 {
747 return get_islower(unichar_to_id(unichar_repr));
748 }

◆ get_islower() [2/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 801 of file unicharset.h.

802 {
803 return get_islower(unichar_to_id(unichar_repr, length));
804 }

◆ get_islower() [3/3]

bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 498 of file unicharset.h.

498 {
499 if (INVALID_UNICHAR_ID == unichar_id) return false;
500 ASSERT_HOST(contains_unichar_id(unichar_id));
501 return unichars[unichar_id].properties.islower;
502 }

◆ get_isngram()

bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 526 of file unicharset.h.

526 {
527 if (INVALID_UNICHAR_ID == unichar_id) return false;
528 ASSERT_HOST(contains_unichar_id(unichar_id));
529 return unichars[unichar_id].properties.isngram;
530 }

◆ get_isprivate()

bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 388 of file unicharset.cpp.

388 {
389 UNICHAR uc(id_to_unichar(unichar_id), -1);
390 int uni = uc.first_uni();
391 return (uni >= 0xE000 && uni <= 0xF8FF);
392}

◆ get_ispunctuation() [1/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 761 of file unicharset.h.

761 {
762 return get_ispunctuation(unichar_to_id(unichar_repr));
763 }

◆ get_ispunctuation() [2/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 822 of file unicharset.h.

823 {
824 return get_ispunctuation(unichar_to_id(unichar_repr, length));
825 }

◆ get_ispunctuation() [3/3]

bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 519 of file unicharset.h.

519 {
520 if (INVALID_UNICHAR_ID == unichar_id) return false;
521 ASSERT_HOST(contains_unichar_id(unichar_id));
522 return unichars[unichar_id].properties.ispunctuation;
523 }

◆ get_isupper() [1/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 751 of file unicharset.h.

751 {
752 return get_isupper(unichar_to_id(unichar_repr));
753 }

◆ get_isupper() [2/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 808 of file unicharset.h.

809 {
810 return get_isupper(unichar_to_id(unichar_repr, length));
811 }

◆ get_isupper() [3/3]

bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 505 of file unicharset.h.

505 {
506 if (INVALID_UNICHAR_ID == unichar_id) return false;
507 ASSERT_HOST(contains_unichar_id(unichar_id));
508 return unichars[unichar_id].properties.isupper;
509 }

◆ get_mirror()

UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 697 of file unicharset.h.

697 {
698 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
699 ASSERT_HOST(contains_unichar_id(unichar_id));
700 return unichars[unichar_id].properties.mirror;
701 }

◆ get_normed_unichar()

const char * UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 828 of file unicharset.h.

828 {
829 if (unichar_id == UNICHAR_SPACE) return " ";
830 return unichars[unichar_id].properties.normed.string();
831 }
@ UNICHAR_SPACE
Definition: unicharset.h:34

◆ get_other_case()

UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 683 of file unicharset.h.

683 {
684 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
685 ASSERT_HOST(contains_unichar_id(unichar_id));
686 return unichars[unichar_id].properties.other_case;
687 }

◆ get_properties() [1/2]

unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 767 of file unicharset.h.

767 {
768 return get_properties(unichar_to_id(unichar_repr));
769 }
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:602

◆ get_properties() [2/2]

unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 602 of file unicharset.cpp.

602 {
603 unsigned int properties = 0;
604 if (this->get_isalpha(id))
605 properties |= ISALPHA_MASK;
606 if (this->get_islower(id))
607 properties |= ISLOWER_MASK;
608 if (this->get_isupper(id))
609 properties |= ISUPPER_MASK;
610 if (this->get_isdigit(id))
611 properties |= ISDIGIT_MASK;
612 if (this->get_ispunctuation(id))
613 properties |= ISPUNCTUATION_MASK;
614 return properties;
615}

◆ get_script() [1/3]

int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 778 of file unicharset.h.

778 {
779 return get_script(unichar_to_id(unichar_repr));
780 }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663

◆ get_script() [2/3]

int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 843 of file unicharset.h.

844 {
845 return get_script(unichar_to_id(unichar_repr, length));
846 }

◆ get_script() [3/3]

int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 663 of file unicharset.h.

663 {
664 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
665 ASSERT_HOST(contains_unichar_id(unichar_id));
666 return unichars[unichar_id].properties.script_id;
667 }

◆ get_script_from_script_id()

const char * UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 854 of file unicharset.h.

854 {
855 if (id >= script_table_size_used || id < 0)
856 return null_script;
857 return script_table[id];
858 }

◆ get_script_id_from_name()

int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1139 of file unicharset.cpp.

1139 {
1140 for (int i = 0; i < script_table_size_used; ++i) {
1141 if (strcmp(script_name, script_table[i]) == 0)
1142 return i;
1143 }
1144 return 0; // 0 is always the null_script
1145}

◆ get_script_table_size()

int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 849 of file unicharset.h.

849 {
850 return script_table_size_used;
851 }

◆ get_top_bottom()

void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 568 of file unicharset.h.

570 {
571 if (INVALID_UNICHAR_ID == unichar_id) {
572 *min_bottom = *min_top = 0;
573 *max_bottom = *max_top = 256; // kBlnCellHeight
574 return;
575 }
576 ASSERT_HOST(contains_unichar_id(unichar_id));
577 *min_bottom = unichars[unichar_id].properties.min_bottom;
578 *max_bottom = unichars[unichar_id].properties.max_bottom;
579 *min_top = unichars[unichar_id].properties.min_top;
580 *max_top = unichars[unichar_id].properties.max_top;
581 }

◆ get_width_stats()

void UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 596 of file unicharset.h.

597 {
598 if (INVALID_UNICHAR_ID == unichar_id) {
599 *width = 0.0f;
600 *width_sd = 0.0f;;
601 return;
602 }
603 ASSERT_HOST(contains_unichar_id(unichar_id));
604 *width = unichars[unichar_id].properties.width;
605 *width_sd = unichars[unichar_id].properties.width_sd;
606 }

◆ greek_sid()

int UNICHARSET::greek_sid ( ) const
inline

Definition at line 888 of file unicharset.h.

888{ return greek_sid_; }

◆ han_sid()

int UNICHARSET::han_sid ( ) const
inline

Definition at line 889 of file unicharset.h.

889{ return han_sid_; }

◆ hangul_sid()

int UNICHARSET::hangul_sid ( ) const
inline

Definition at line 893 of file unicharset.h.

893{ return hangul_sid_; }

◆ has_special_codes()

bool UNICHARSET::has_special_codes ( ) const
inline

Definition at line 722 of file unicharset.h.

722 {
723 return get_fragment(UNICHAR_BROKEN) != nullptr &&
724 strcmp(id_to_unichar(UNICHAR_BROKEN),
725 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
726 }
@ UNICHAR_BROKEN
Definition: unicharset.h:36

◆ hiragana_sid()

int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 890 of file unicharset.h.

890{ return hiragana_sid_; }

◆ id_to_unichar()

const char * UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 291 of file unicharset.cpp.

291 {
292 if (id == INVALID_UNICHAR_ID) {
293 return INVALID_UNICHAR;
294 }
295 ASSERT_HOST(id < this->size());
296 return unichars[id].representation;
297}
int size() const
Definition: unicharset.h:341

◆ id_to_unichar_ext()

const char * UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 299 of file unicharset.cpp.

299 {
300 if (id == INVALID_UNICHAR_ID) {
301 return INVALID_UNICHAR;
302 }
303 ASSERT_HOST(id < this->size());
304 // Resolve from the kCustomLigatures table if this is a private encoding.
305 if (get_isprivate(id)) {
306 const char* ch = id_to_unichar(id);
307 for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
308 if (!strcmp(ch, kCustomLigatures[i][1])) {
309 return kCustomLigatures[i][0];
310 }
311 }
312 }
313 // Otherwise return the stored representation.
314 return unichars[id].representation;
315}
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:388
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150

◆ is_null_script()

bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 868 of file unicharset.h.

868 {
869 return script == null_script;
870 }

◆ IsSpaceDelimited()

bool UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 652 of file unicharset.h.

652 {
653 if (INVALID_UNICHAR_ID == unichar_id) return true;
654 int script_id = get_script(unichar_id);
655 return script_id != han_sid_ && script_id != thai_sid_ &&
656 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
657 script_id != katakana_sid_;
658 }

◆ katakana_sid()

int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 891 of file unicharset.h.

891{ return katakana_sid_; }

◆ latin_sid()

int UNICHARSET::latin_sid ( ) const
inline

Definition at line 886 of file unicharset.h.

886{ return latin_sid_; }

◆ load_from_file() [1/5]

bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 396 of file unicharset.h.

396 {
397 return load_from_file(filename, false);
398 }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388

◆ load_from_file() [2/5]

bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 388 of file unicharset.h.

388 {
389 FILE* file = fopen(filename, "rb");
390 if (file == nullptr) return false;
391 bool result = load_from_file(file, skip_fragments);
392 fclose(file);
393 return result;
394 }

◆ load_from_file() [3/5]

bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 403 of file unicharset.h.

403{ return load_from_file(file, false); }

◆ load_from_file() [4/5]

bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 781 of file unicharset.cpp.

781 {
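782 LocalFilePointer lfp(file);
783 TessResultCallback2<char *, char *, int> *fgets_cb =
784 NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
785 bool success = load_via_fgets(fgets_cb, skip_fragments);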
786 delete fgets_cb;
787 return success;
788}
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
char * fgets(char *dst, int size)
Definition: unicharset.cpp:774

◆ load_from_file() [5/5]

bool UNICHARSET::load_from_file ( tesseract::TFile file,
bool  skip_fragments 
)

Definition at line 790 of file unicharset.cpp.

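790 {
791 TessResultCallback2<char *, char *, int> *fgets_cb =
792 NewPermanentTessCallback(file, &tesseract::TFile::FGets);
793 bool success = load_via_fgets(fgets_cb, skip_fragments);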
794 delete fgets_cb;
795 return success;
796}
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:249

◆ load_from_inmemory_file() [1/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 381 of file unicharset.h.

381 {
382 return load_from_inmemory_file(memory, mem_size, false);
383 }
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:761

◆ load_from_inmemory_file() [2/2]

bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 761 of file unicharset.cpp.

762 {
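763 InMemoryFilePointer mem_fp(memory, mem_size);
764 TessResultCallback2<char *, char *, int> *fgets_cb =
765 NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
766 bool success = load_via_fgets(fgets_cb, skip_fragments);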
767 delete fgets_cb;
768 return success;
769}
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:739

◆ major_right_to_left()

bool UNICHARSET::major_right_to_left ( ) const

Definition at line 992 of file unicharset.cpp.

992 {
993 int ltr_count = 0;
994 int rtl_count = 0;
995 for (int id = 0; id < size_used; ++id) {
996 int dir = get_direction(id);
997 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
998 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
999 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
1000 dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
1001 }
1002 return rtl_count > ltr_count;
1003}
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690

◆ normed_ids()

const GenericVector< UNICHAR_ID > & UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 835 of file unicharset.h.

835 {
836 return unichars[unichar_id].properties.normed_ids;
837 }

◆ null_sid()

int UNICHARSET::null_sid ( ) const
inline

Definition at line 884 of file unicharset.h.

884{ return null_sid_; }

◆ PartialSetPropertiesFromOther()

void UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET src 
)

Definition at line 405 of file unicharset.cpp.

406 {
407 for (int ch = start_index; ch < size_used; ++ch) {
408 const char* utf8 = id_to_unichar(ch);
409 UNICHAR_PROPERTIES properties;
410 if (src.GetStrProperties(utf8, &properties)) {
411 // Setup the script_id, other_case, and mirror properly.
412 const char* script = src.get_script_from_script_id(properties.script_id);
413 properties.script_id = add_script(script);
414 const char* other_case = src.id_to_unichar(properties.other_case);
415 if (contains_unichar(other_case)) {
416 properties.other_case = unichar_to_id(other_case);
417 } else {
418 properties.other_case = ch;
419 }
420 const char* mirror_str = src.id_to_unichar(properties.mirror);
421 if (contains_unichar(mirror_str)) {
422 properties.mirror = unichar_to_id(mirror_str);
423 } else {
424 properties.mirror = ch;
425 }
426 unichars[ch].properties.CopyFrom(properties);
427 set_normed_ids(ch);
428 }
429 }
430}
int add_script(const char *script)
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:373

◆ post_load_setup()

void UNICHARSET::post_load_setup ( )

Definition at line 926 of file unicharset.cpp.

926 {
927 // Number of alpha chars with the case property minus those without,
928 // in order to determine that half the alpha chars have case.
929 int net_case_alphas = 0;
930 int x_height_alphas = 0;
931 int cap_height_alphas = 0;
932 top_bottom_set_ = false;
933 for (UNICHAR_ID id = 0; id < size_used; ++id) {
934 int min_bottom = 0;
935 int max_bottom = UINT8_MAX;
936 int min_top = 0;
937 int max_top = UINT8_MAX;
938 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
939 if (min_top > 0)
940 top_bottom_set_ = true;
941 if (get_isalpha(id)) {
942 if (get_islower(id) || get_isupper(id))
943 ++net_case_alphas;
944 else
945 --net_case_alphas;
946 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
947 ++x_height_alphas;
948 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
949 ++cap_height_alphas;
950 }
951 set_normed_ids(id);
952 }
953
954 script_has_upper_lower_ = net_case_alphas > 0;
955 script_has_xheight_ = script_has_upper_lower_ ||
956 (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
957 cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
958
959 null_sid_ = get_script_id_from_name(null_script);
960 ASSERT_HOST(null_sid_ == 0);
961 common_sid_ = get_script_id_from_name("Common");
962 latin_sid_ = get_script_id_from_name("Latin");
963 cyrillic_sid_ = get_script_id_from_name("Cyrillic");
964 greek_sid_ = get_script_id_from_name("Greek");
965 han_sid_ = get_script_id_from_name("Han");
966 hiragana_sid_ = get_script_id_from_name("Hiragana");
967 katakana_sid_ = get_script_id_from_name("Katakana");
968 thai_sid_ = get_script_id_from_name("Thai");
969 hangul_sid_ = get_script_id_from_name("Hangul");
970
971 // Compute default script. Use the highest-counting alpha script, that is
972 // not the common script, as that still contains some "alphas".
973 int* script_counts = new int[script_table_size_used];
974 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
975 for (int id = 0; id < size_used; ++id) {
976 if (get_isalpha(id)) {
977 ++script_counts[get_script(id)];
978 }
979 }
980 default_sid_ = 0;
981 for (int s = 1; s < script_table_size_used; ++s) {
982 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
983 default_sid_ = s;
984 }
985 delete [] script_counts;
986}
int UNICHAR_ID
Definition: unichar.h:34
const double kMinCapHeightFraction
Definition: unicharset.cpp:60
const double kMinXHeightFraction
Definition: unicharset.cpp:59
int get_script_id_from_name(const char *script_name) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568

◆ PropertiesIncomplete()

bool UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 646 of file unicharset.h.

646 {
647 return unichars[unichar_id].properties.AnyRangeEmpty();
648 }

◆ reserve()

void UNICHARSET::reserve ( int  unichars_number)

Definition at line 195 of file unicharset.cpp.

195 {
196 if (unichars_number > size_reserved) {
197 auto* unichars_new = new UNICHAR_SLOT[unichars_number];
198 for (int i = 0; i < size_used; ++i)
199 unichars_new[i] = unichars[i];
200 for (int j = size_used; j < unichars_number; ++j) {
201 unichars_new[j].properties.script_id = add_script(null_script);
202 }
203 delete[] unichars;
204 unichars = unichars_new;
205 size_reserved = unichars_number;
206 }
207}

◆ save_to_file() [1/3]

bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 350 of file unicharset.h.

350 {
351 FILE* file = fopen(filename, "w+b");
352 if (file == nullptr) return false;
353 bool result = save_to_file(file);
354 fclose(file);
355 return result;
356 }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350

◆ save_to_file() [2/3]

bool UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 360 of file unicharset.h.

360 {
361 STRING str;
362 return save_to_string(&str) &&
363 tesseract::Serialize(file, &str[0], str.length());
364 }
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
int32_t length() const
Definition: strngs.cpp:189
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:692

◆ save_to_file() [3/3]

bool UNICHARSET::save_to_file ( tesseract::TFile file) const
inline

Definition at line 366 of file unicharset.h.

366 {
367 STRING str;
368 return save_to_string(&str) && file->Serialize(&str[0], str.length());
369 }
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148

◆ save_to_string()

bool UNICHARSET::save_to_string ( STRING str) const

Definition at line 692 of file unicharset.cpp.

692 {
693 const int kFileBufSize = 1024;
694 char buffer[kFileBufSize + 1];
695 snprintf(buffer, kFileBufSize, "%d\n", this->size());
696 *str = buffer;
697 for (UNICHAR_ID id = 0; id < this->size(); ++id) {
698 int min_bottom, max_bottom, min_top, max_top;
699 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
700 float width, width_sd;
701 get_width_stats(id, &width, &width_sd);
702 float bearing, bearing_sd;
703 get_bearing_stats(id, &bearing, &bearing_sd);
704 float advance, advance_sd;
705 get_advance_stats(id, &advance, &advance_sd);
706 unsigned int properties = this->get_properties(id);
707 if (strcmp(this->id_to_unichar(id), " ") == 0) {
708 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
709 this->get_script_from_script_id(this->get_script(id)),
710 this->get_other_case(id));
711 *str += buffer;
712 } else {
713 std::ostringstream stream;
714 stream.imbue(std::locale::classic());
715 stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
716 min_bottom << ',' << max_bottom << ',' <<
717 min_top << ',' << max_top << ',' <<
718 width << ',' << width_sd << ',' <<
719 bearing << ',' << bearing_sd << ',' <<
720 advance << ',' << advance_sd << ' ' <<
721 this->get_script_from_script_id(this->get_script(id)) << ' ' <<
722 this->get_other_case(id) << ' ' <<
723 this->get_direction(id) << ' ' <<
724 this->get_mirror(id) << ' ' <<
725 this->get_normed_unichar(id) << "\t# " <<
726 this->debug_str(id).string() << '\n';
727 *str += stream.str().c_str();
728 }
729 }
730 return true;
731}
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:613
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:630
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:596

◆ script_has_upper_lower()

bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 897 of file unicharset.h.

897 {
898 return script_has_upper_lower_;
899 }

◆ script_has_xheight()

bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 904 of file unicharset.h.

904 {
905 return script_has_xheight_;
906 }

◆ set_advance_stats()

void UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 640 of file unicharset.h.

641 {
642 unichars[unichar_id].properties.advance = advance;
643 unichars[unichar_id].properties.advance_sd = advance_sd;
644 }

◆ set_bearing_stats()

void UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 623 of file unicharset.h.

624 {
625 unichars[unichar_id].properties.bearing = bearing;
626 unichars[unichar_id].properties.bearing_sd = bearing_sd;
627 }

◆ set_black_and_whitelist()

void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 1009 of file unicharset.cpp.

1011 {
1012 bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1013 // Set everything to default
1014 for (int ch = 0; ch < size_used; ++ch)
1015 unichars[ch].properties.enabled = def_enabled;
1016 if (!def_enabled) {
1017 // Enable the whitelist.
1018 GenericVector<UNICHAR_ID> encoding;
1019 encode_string(whitelist, false, &encoding, nullptr, nullptr);
1020 for (int i = 0; i < encoding.size(); ++i) {
1021 if (encoding[i] != INVALID_UNICHAR_ID)
1022 unichars[encoding[i]].properties.enabled = true;
1023 }
1024 }
1025 if (blacklist != nullptr && blacklist[0] != '\0') {
1026 // Disable the blacklist.
1027 GenericVector<UNICHAR_ID> encoding;
1028 encode_string(blacklist, false, &encoding, nullptr, nullptr);
1029 for (int i = 0; i < encoding.size(); ++i) {
1030 if (encoding[i] != INVALID_UNICHAR_ID)
1031 unichars[encoding[i]].properties.enabled = false;
1032 }
1033 }
1034 if (unblacklist != nullptr && unblacklist[0] != '\0') {
1035 // Re-enable the unblacklist.
1036 GenericVector<UNICHAR_ID> encoding;
1037 encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1038 for (int i = 0; i < encoding.size(); ++i) {
1039 if (encoding[i] != INVALID_UNICHAR_ID)
1040 unichars[encoding[i]].properties.enabled = true;
1041 }
1042 }
1043}
int size() const
Definition: genericvector.h:72

◆ set_direction()

void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 472 of file unicharset.h.

472 {
473 unichars[unichar_id].properties.direction = value;
474 }

◆ set_isalpha()

void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 431 of file unicharset.h.

431 {
432 unichars[unichar_id].properties.isalpha = value;
433 }

◆ set_isdigit()

void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 446 of file unicharset.h.

446 {
447 unichars[unichar_id].properties.isdigit = value;
448 }

◆ set_islower()

void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 436 of file unicharset.h.

436 {
437 unichars[unichar_id].properties.islower = value;
438 }

◆ set_isngram()

void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 456 of file unicharset.h.

456 {
457 unichars[unichar_id].properties.isngram = value;
458 }

◆ set_ispunctuation()

void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 451 of file unicharset.h.

451 {
452 unichars[unichar_id].properties.ispunctuation = value;
453 }

◆ set_isupper()

void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 441 of file unicharset.h.

441 {
442 unichars[unichar_id].properties.isupper = value;
443 }

◆ set_mirror()

void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 477 of file unicharset.h.

477 {
478 unichars[unichar_id].properties.mirror = mirror;
479 }

◆ set_normed()

void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 482 of file unicharset.h.

482 {
483 unichars[unichar_id].properties.normed = normed;
484 unichars[unichar_id].properties.normed_ids.truncate(0);
485 }

◆ set_normed_ids()

void UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 373 of file unicharset.cpp.

373 {
374 unichars[unichar_id].properties.normed_ids.truncate(0);
375 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
376 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
377 } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
378 true, &unichars[unichar_id].properties.normed_ids,
379 nullptr, nullptr)) {
380 unichars[unichar_id].properties.normed_ids.truncate(0);
381 unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
382 }
383}

◆ set_other_case()

void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 467 of file unicharset.h.

467 {
468 unichars[unichar_id].properties.other_case = other_case;
469 }

◆ set_ranges_empty()

void UNICHARSET::set_ranges_empty ( )

Definition at line 396 of file unicharset.cpp.

396 {
397 for (int id = 0; id < size_used; ++id) {
398 unichars[id].properties.SetRangesEmpty();
399 }
400}

◆ set_script()

void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 462 of file unicharset.h.

462 {
463 unichars[unichar_id].properties.script_id = add_script(value);
464 }

◆ set_top_bottom()

void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 582 of file unicharset.h.

584 {
585 unichars[unichar_id].properties.min_bottom =
586 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
587 unichars[unichar_id].properties.max_bottom =
588 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
589 unichars[unichar_id].properties.min_top =
590 ClipToRange<int>(min_top, 0, UINT8_MAX);
591 unichars[unichar_id].properties.max_top =
592 ClipToRange<int>(max_top, 0, UINT8_MAX);
593 }

◆ set_width_stats()

void UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 607 of file unicharset.h.

607 {
608 unichars[unichar_id].properties.width = width;
609 unichars[unichar_id].properties.width_sd = width_sd;
610 }

◆ SetPropertiesFromOther()

void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET src)
inline

Definition at line 545 of file unicharset.h.

545 {
546 PartialSetPropertiesFromOther(0, src);
547 }

◆ size()

int UNICHARSET::size ( ) const
inline

Definition at line 341 of file unicharset.h.

341 {
342 return size_used;
343 }

◆ SizesDistinct()

bool UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 486 of file unicharset.cpp.

486 {
487 int overlap = std::min(unichars[id1].properties.max_top,
488 unichars[id2].properties.max_top) -
489 std::max(unichars[id1].properties.min_top,
490 unichars[id2].properties.min_top);
491 return overlap <= 0;
492}

◆ step()

int UNICHARSET::step ( const char *  str) const

Definition at line 233 of file unicharset.cpp.

233 {
234 GenericVector<UNICHAR_ID> encoding;
235 GenericVector<char> lengths;
236 encode_string(str, true, &encoding, &lengths, nullptr);
237 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
238 return lengths[0];
239}
bool empty() const
Definition: genericvector.h:91

◆ thai_sid()

int UNICHARSET::thai_sid ( ) const
inline

Definition at line 892 of file unicharset.h.

892{ return thai_sid_; }

◆ to_lower()

UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 704 of file unicharset.h.

704 {
705 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
706 ASSERT_HOST(contains_unichar_id(unichar_id));
707 if (unichars[unichar_id].properties.islower) return unichar_id;
708 return unichars[unichar_id].properties.other_case;
709 }

◆ to_upper()

UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 712 of file unicharset.h.

712 {
713 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
714 ASSERT_HOST(contains_unichar_id(unichar_id));
715 if (unichars[unichar_id].properties.isupper) return unichar_id;
716 return unichars[unichar_id].properties.other_case;
717 }

◆ top_bottom_useful()

bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 537 of file unicharset.h.

537 {
538 return top_bottom_set_;
539 }

◆ unichar_insert() [1/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr)
inline

Definition at line 264 of file unicharset.h.

264 {
265 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
266 }

◆ unichar_insert() [2/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr,
OldUncleanUnichars  old_style 
)

Definition at line 626 of file unicharset.cpp.

627 {
628 if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
629 std::string cleaned =
630 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
631 if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
632 const char* str = cleaned.c_str();
633 GenericVector<int> encoding;
634 if (!old_style_included_ &&
635 encode_string(str, true, &encoding, nullptr, nullptr))
636 return;
637 if (size_used == size_reserved) {
638 if (size_used == 0)
639 reserve(8);
640 else
641 reserve(2 * size_used);
642 }
643 int index = 0;
644 do {
645 if (index >= UNICHAR_LEN) {
646 fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
647 unichar_repr);
648 return;
649 }
650 unichars[size_used].representation[index++] = *str++;
651 } while (*str != '\0');
652 unichars[size_used].representation[index] = '\0';
653 this->set_script(size_used, null_script);
654 // If the given unichar_repr represents a fragmented character, set
655 // fragment property to a pointer to CHAR_FRAGMENT class instance with
656 // information parsed from the unichar representation. Use the script
657 // of the base unichar for the fragmented character if possible.
658 CHAR_FRAGMENT* frag =
659 CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
660 this->unichars[size_used].properties.fragment = frag;
661 if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
662 this->unichars[size_used].properties.script_id =
663 this->get_script(frag->get_unichar());
664 }
665 this->unichars[size_used].properties.enabled = true;
666 ids.insert(unichars[size_used].representation, size_used);
667 ++size_used;
668 }
669}
#define UNICHAR_LEN
Definition: unichar.h:30
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56
const char * get_unichar() const
Definition: unicharset.h:70
static CHAR_FRAGMENT * parse_from_string(const char *str)
void reserve(int unichars_number)
Definition: unicharset.cpp:195
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:462

◆ unichar_insert_backwards_compatible()

void UNICHARSET::unichar_insert_backwards_compatible ( const char *const  unichar_repr)
inline

Definition at line 269 of file unicharset.h.

269 {
270 std::string cleaned = CleanupString(unichar_repr);
271 if (cleaned != unichar_repr) {
272 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
273 } else {
274 int old_size = size();
275 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
276 if (size() == old_size) {
277 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
278 }
279 }
280 }

◆ unichar_to_id() [1/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 210 of file unicharset.cpp.

210 {
211 std::string cleaned =
212 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
213 return ids.contains(cleaned.data(), cleaned.size())
214 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
215 : INVALID_UNICHAR_ID;
216}
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34

◆ unichar_to_id() [2/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 218 of file unicharset.cpp.

219 {
220 assert(length > 0 && length <= UNICHAR_LEN);
221 std::string cleaned(unichar_repr, length);
222 if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
223 return ids.contains(cleaned.data(), cleaned.size())
224 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
225 : INVALID_UNICHAR_ID;
226}

Member Data Documentation

◆ kCustomLigatures

const char * UNICHARSET::kCustomLigatures
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{nullptr, nullptr}
}

Definition at line 150 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * UNICHARSET::kSpecialUnicharCodes
static
Initial value:
= {
" ",
"Joined",
"|Broken|0|1"
}

Definition at line 153 of file unicharset.h.


The documentation for this class was generated from the following files: