tesseract 4.1.1
Loading...
Searching...
No Matches
unicharset.cpp
Go to the documentation of this file.
1
2// File: unicharset.cpp
3// Description: Unicode character/ligature set class.
4// Author: Thomas Kielbus
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include "unicharset.h"
20
21#include <algorithm>
22#include <cassert>
23#include <cstdio>
24#include <cstring>
25#include <iomanip> // for std::setw
26#include <locale> // for std::locale::classic
27#include <sstream> // for std::istringstream, std::ostringstream
28
29#include "params.h"
30#include "serialis.h"
31#include "tesscallback.h"
32#include "unichar.h"
33
34// TODO(rays) Move UNICHARSET to tesseract namespace.
37
// Separator character used in the string form of character fragments.
static constexpr char kSeparator = '|';
// Flag character that marks a 'natural' character fragment.
static constexpr char kNaturalFlag = 'n';

// Bit masks for the character-class flags packed into the integer
// "properties" field of a unicharset entry.
static constexpr int ISALPHA_MASK = 1 << 0;
static constexpr int ISLOWER_MASK = 1 << 1;
static constexpr int ISUPPER_MASK = 1 << 2;
static constexpr int ISDIGIT_MASK = 1 << 3;
static constexpr int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate threshold that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// Let C be the number of alpha chars whose tops all exceed
// kMeanlineThreshold, and X the number of alpha chars whose tops are all
// below kMeanlineThreshold. The unicharset "has x-height" if
// X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction, or if more
// than half of the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
61
62/*static */
63const char* UNICHARSET::kCustomLigatures[][2] = {
64 {"ct", "\uE003"}, // c + t -> U+E003
65 {"ſh", "\uE006"}, // long-s + h -> U+E006
66 {"ſi", "\uE007"}, // long-s + i -> U+E007
67 {"ſl", "\uE008"}, // long-s + l -> U+E008
68 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
69 {nullptr, nullptr}
70};
71
72// List of mappings to make when ingesting strings from the outside.
73// The substitutions clean up text that should exist for rendering of
74// synthetic data, but not in the recognition set.
75const char* UNICHARSET::kCleanupMaps[][2] = {
76 {"\u0640", ""}, // TATWEEL is deleted.
77 {"\ufb01", "fi"}, // fi ligature->fi pair.
78 {"\ufb02", "fl"}, // fl ligature->fl pair.
79 {nullptr, nullptr}};
80
81// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
83 " ",
84 "Joined",
85 "|Broken|0|1"
86};
87
88const char* UNICHARSET::null_script = "NULL";
89
90UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
91 Init();
92}
93
94// Initialize all properties to sensible default values.
95void UNICHARSET::UNICHAR_PROPERTIES::Init() {
96 isalpha = false;
97 islower = false;
98 isupper = false;
99 isdigit = false;
100 ispunctuation = false;
101 isngram = false;
102 enabled = false;
103 SetRangesOpen();
104 script_id = 0;
105 other_case = 0;
106 mirror = 0;
107 normed = "";
108 direction = UNICHARSET::U_LEFT_TO_RIGHT;
109 fragment = nullptr;
110}
111
112// Sets all ranges wide open. Initialization default in case there are
113// no useful values available.
114void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
115 min_bottom = 0;
116 max_bottom = UINT8_MAX;
117 min_top = 0;
118 max_top = UINT8_MAX;
119 width = 0.0f;
120 width_sd = 0.0f;
121 bearing = 0.0f;
122 bearing_sd = 0.0f;
123 advance = 0.0f;
124 advance_sd = 0.0f;
125}
126
127// Sets all ranges to empty. Used before expanding with font-based data.
128void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
129 min_bottom = UINT8_MAX;
130 max_bottom = 0;
131 min_top = UINT8_MAX;
132 max_top = 0;
133 width = 0.0f;
134 width_sd = 0.0f;
135 bearing = 0.0f;
136 bearing_sd = 0.0f;
137 advance = 0.0f;
138 advance_sd = 0.0f;
139}
140
141// Returns true if any of the top/bottom/width/bearing/advance ranges/stats
142// is empty.
143bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
144 return width == 0.0f || advance == 0.0f;
145}
146
147// Expands the ranges with the ranges from the src properties.
148void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
149 const UNICHAR_PROPERTIES& src) {
150 UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
151 UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
152 UpdateRange(src.min_top, &min_top, &max_top);
153 UpdateRange(src.max_top, &min_top, &max_top);
154 if (src.width_sd > width_sd) {
155 width = src.width;
156 width_sd = src.width_sd;
157 }
158 if (src.bearing_sd > bearing_sd) {
159 bearing = src.bearing;
160 bearing_sd = src.bearing_sd;
161 }
162 if (src.advance_sd > advance_sd) {
163 advance = src.advance;
164 advance_sd = src.advance_sd;
165 }
166}
167
168// Copies the properties from src into this.
169void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
170 // Apart from the fragment, everything else can be done with a default copy.
171 CHAR_FRAGMENT* saved_fragment = fragment;
172 *this = src; // Bitwise copy.
173 fragment = saved_fragment;
174}
175
177 unichars(nullptr),
178 ids(),
179 size_used(0),
180 size_reserved(0),
181 script_table(nullptr),
182 script_table_size_used(0) {
183 clear();
184 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
186 if (i == UNICHAR_JOINED)
187 set_isngram(i, true);
188 }
189}
190
192 clear();
193}
194
195void UNICHARSET::reserve(int unichars_number) {
196 if (unichars_number > size_reserved) {
197 auto* unichars_new = new UNICHAR_SLOT[unichars_number];
198 for (int i = 0; i < size_used; ++i)
199 unichars_new[i] = unichars[i];
200 for (int j = size_used; j < unichars_number; ++j) {
201 unichars_new[j].properties.script_id = add_script(null_script);
202 }
203 delete[] unichars;
204 unichars = unichars_new;
205 size_reserved = unichars_number;
206 }
207}
208
210UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
211 std::string cleaned =
212 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
213 return ids.contains(cleaned.data(), cleaned.size())
214 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
215 : INVALID_UNICHAR_ID;
216}
217
218UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
219 int length) const {
220 assert(length > 0 && length <= UNICHAR_LEN);
221 std::string cleaned(unichar_repr, length);
222 if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
223 return ids.contains(cleaned.data(), cleaned.size())
224 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
225 : INVALID_UNICHAR_ID;
226}
227
228// Return the minimum number of bytes that matches a legal UNICHAR_ID,
229// while leaving the rest of the string encodable. Returns 0 if the
230// beginning of the string is not encodable.
231// WARNING: this function now encodes the whole string for precision.
232// Use encode_string in preference to repeatedly calling step.
233int UNICHARSET::step(const char* str) const {
235 GenericVector<char> lengths;
236 encode_string(str, true, &encoding, &lengths, nullptr);
237 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
238 return lengths[0];
239}
240
241// Return whether the given UTF-8 string is encodable with this UNICHARSET.
242// If not encodable, write the first byte offset which cannot be converted
243// into the second (return) argument.
244bool UNICHARSET::encodable_string(const char *str,
245 int *first_bad_position) const {
247 return encode_string(str, true, &encoding, nullptr, first_bad_position);
248}
249
250// Encodes the given UTF-8 string with this UNICHARSET.
251// Returns true if the encoding succeeds completely, false if there is at
252// least one INVALID_UNICHAR_ID in the returned encoding, but in this case
253// the rest of the string is still encoded.
254// If lengths is not nullptr, then it is filled with the corresponding
255// byte length of each encoded UNICHAR_ID.
256// WARNING: Caller must guarantee that str has already been cleaned of codes
257// that do not belong in the unicharset, or encoding may fail.
258// Use CleanupString to perform the cleaning.
259bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
261 GenericVector<char>* lengths,
262 int* encoded_length) const {
263 GenericVector<UNICHAR_ID> working_encoding;
264 GenericVector<char> working_lengths;
265 GenericVector<char> best_lengths;
266 encoding->truncate(0); // Just in case str is empty.
267 int str_length = strlen(str);
268 int str_pos = 0;
269 bool perfect = true;
270 while (str_pos < str_length) {
271 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
272 &str_pos, encoding, &best_lengths);
273 if (str_pos < str_length) {
274 // This is a non-match. Skip one utf-8 character.
275 perfect = false;
276 if (give_up_on_failure) break;
277 int step = UNICHAR::utf8_step(str + str_pos);
278 if (step == 0) step = 1;
279 encoding->push_back(INVALID_UNICHAR_ID);
280 best_lengths.push_back(step);
281 str_pos += step;
282 working_encoding = *encoding;
283 working_lengths = best_lengths;
284 }
285 }
286 if (lengths != nullptr) *lengths = best_lengths;
287 if (encoded_length != nullptr) *encoded_length = str_pos;
288 return perfect;
289}
290
292 if (id == INVALID_UNICHAR_ID) {
293 return INVALID_UNICHAR;
294 }
295 ASSERT_HOST(id < this->size());
296 return unichars[id].representation;
297}
298
300 if (id == INVALID_UNICHAR_ID) {
301 return INVALID_UNICHAR;
302 }
303 ASSERT_HOST(id < this->size());
304 // Resolve from the kCustomLigatures table if this is a private encoding.
305 if (get_isprivate(id)) {
306 const char* ch = id_to_unichar(id);
307 for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
308 if (!strcmp(ch, kCustomLigatures[i][1])) {
309 return kCustomLigatures[i][0];
310 }
311 }
312 }
313 // Otherwise return the stored representation.
314 return unichars[id].representation;
315}
316
317// Return a STRING that reformats the utf8 str into the str followed
318// by its hex unicodes.
320 STRING result = str;
321 result += " [";
322 int step = 1;
323 // Chop into unicodes and code each as hex.
324 for (int i = 0; str[i] != '\0'; i += step) {
325 char hex[sizeof(int) * 2 + 1];
326 step = UNICHAR::utf8_step(str + i);
327 if (step == 0) {
328 step = 1;
329 sprintf(hex, "%x", str[i]);
330 } else {
331 UNICHAR ch(str + i, step);
332 sprintf(hex, "%x", ch.first_uni());
333 }
334 result += hex;
335 result += " ";
336 }
337 result += "]";
338 return result;
339}
340
341// Return a STRING containing debug information on the unichar, including
342// the id_to_unichar, its hex unicodes and the properties.
344 if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
345 const CHAR_FRAGMENT *fragment = this->get_fragment(id);
346 if (fragment) {
347 return fragment->to_string();
348 }
349 const char* str = id_to_unichar(id);
350 STRING result = debug_utf8_str(str);
351 // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
352 if (get_isalpha(id)) {
353 if (get_islower(id))
354 result += "a";
355 else if (get_isupper(id))
356 result += "A";
357 else
358 result += "x";
359 }
360 // Append 0 if a digit.
361 if (get_isdigit(id)) {
362 result += "0";
363 }
  // Append p if a punctuation symbol.
365 if (get_ispunctuation(id)) {
366 result += "p";
367 }
368 return result;
369}
370
371// Sets the normed_ids vector from the normed string. normed_ids is not
372// stored in the file, and needs to be set when the UNICHARSET is loaded.
374 unichars[unichar_id].properties.normed_ids.truncate(0);
375 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
376 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
377 } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
378 true, &unichars[unichar_id].properties.normed_ids,
379 nullptr, nullptr)) {
380 unichars[unichar_id].properties.normed_ids.truncate(0);
381 unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
382 }
383}
384
385// Returns whether the unichar id represents a unicode value in the private use
386// area. We use this range only internally to represent uncommon ligatures
387// (eg. 'ct') that do not have regular unicode values.
388bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
389 UNICHAR uc(id_to_unichar(unichar_id), -1);
390 int uni = uc.first_uni();
391 return (uni >= 0xE000 && uni <= 0xF8FF);
392}
393
394
395// Sets all ranges to empty, so they can be expanded to set the values.
397 for (int id = 0; id < size_used; ++id) {
398 unichars[id].properties.SetRangesEmpty();
399 }
400}
401
402// Sets all the properties for this unicharset given a src unicharset with
403// everything set. The unicharsets don't have to be the same, and graphemes
404// are correctly accounted for.
406 const UNICHARSET& src) {
407 for (int ch = start_index; ch < size_used; ++ch) {
408 const char* utf8 = id_to_unichar(ch);
409 UNICHAR_PROPERTIES properties;
410 if (src.GetStrProperties(utf8, &properties)) {
411 // Setup the script_id, other_case, and mirror properly.
412 const char* script = src.get_script_from_script_id(properties.script_id);
413 properties.script_id = add_script(script);
414 const char* other_case = src.id_to_unichar(properties.other_case);
415 if (contains_unichar(other_case)) {
416 properties.other_case = unichar_to_id(other_case);
417 } else {
418 properties.other_case = ch;
419 }
420 const char* mirror_str = src.id_to_unichar(properties.mirror);
421 if (contains_unichar(mirror_str)) {
422 properties.mirror = unichar_to_id(mirror_str);
423 } else {
424 properties.mirror = ch;
425 }
426 unichars[ch].properties.CopyFrom(properties);
427 set_normed_ids(ch);
428 }
429 }
430}
431
432// Expands the tops and bottoms and widths for this unicharset given a
433// src unicharset with ranges in it. The unicharsets don't have to be the
434// same, and graphemes are correctly accounted for.
436 for (int ch = 0; ch < size_used; ++ch) {
437 const char* utf8 = id_to_unichar(ch);
438 UNICHAR_PROPERTIES properties;
439 if (src.GetStrProperties(utf8, &properties)) {
440 // Expand just the ranges from properties.
441 unichars[ch].properties.ExpandRangesFrom(properties);
442 }
443 }
444}
445
446// Makes this a copy of src. Clears this completely first, so the automatic
447// ids will not be present in this if not in src. Does NOT reorder the set!
449 clear();
450 for (int ch = 0; ch < src.size_used; ++ch) {
451 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
452 const char* utf8 = src.id_to_unichar(ch);
454 unichars[ch].properties.ExpandRangesFrom(src_props);
455 }
456 // Set properties, including mirror and other_case, WITHOUT reordering
457 // the unicharset.
459}
460
461// For each id in src, if it does not occur in this, add it, as in
462// SetPropertiesFromOther, otherwise expand the ranges, as in
463// ExpandRangesFromOther.
465 int initial_used = size_used;
466 for (int ch = 0; ch < src.size_used; ++ch) {
467 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
468 const char* utf8 = src.id_to_unichar(ch);
469 int id = size_used;
470 if (contains_unichar(utf8)) {
471 id = unichar_to_id(utf8);
472 // Just expand current ranges.
473 unichars[id].properties.ExpandRangesFrom(src_props);
474 } else {
476 unichars[id].properties.SetRangesEmpty();
477 }
478 }
479 // Set properties, including mirror and other_case, WITHOUT reordering
480 // the unicharset.
481 PartialSetPropertiesFromOther(initial_used, src);
482}
483
484// Returns true if the acceptable ranges of the tops of the characters do
485// not overlap, making their x-height calculations distinct.
487 int overlap = std::min(unichars[id1].properties.max_top,
488 unichars[id2].properties.max_top) -
489 std::max(unichars[id1].properties.min_top,
490 unichars[id2].properties.min_top);
491 return overlap <= 0;
492}
493
494// Internal recursive version of encode_string above.
495// Seeks to encode the given string as a sequence of UNICHAR_IDs such that
496// each UNICHAR_ID uses the least possible part of the utf8 str.
497// It does this by depth-first tail recursion on increasing length matches
498// to the UNICHARSET, saving the first encountered result that encodes the
499// maximum total length of str. It stops on a failure to encode to make
500// the overall process of encoding a partially failed string more efficient.
501// See unicharset.h for definition of the args.
502void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
504 GenericVector<char>* lengths,
505 int* best_total_length,
506 GenericVector<UNICHAR_ID>* best_encoding,
507 GenericVector<char>* best_lengths) const {
508 if (str_index > *best_total_length) {
509 // This is the best result so far.
510 *best_total_length = str_index;
511 *best_encoding = *encoding;
512 if (best_lengths != nullptr)
513 *best_lengths = *lengths;
514 }
515 if (str_index == str_length) return;
516 int encoding_index = encoding->size();
517 // Find the length of the first matching unicharset member.
518 int length = ids.minmatch(str + str_index);
519 if (length == 0 || str_index + length > str_length) return;
520 do {
521 if (ids.contains(str + str_index, length)) {
522 // Successful encoding so far.
523 UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
524 encoding->push_back(id);
525 lengths->push_back(length);
526 encode_string(str, str_index + length, str_length, encoding, lengths,
527 best_total_length, best_encoding, best_lengths);
528 if (*best_total_length == str_length)
529 return; // Tail recursion success!
530 // Failed with that length, truncate back and try again.
531 encoding->truncate(encoding_index);
532 lengths->truncate(encoding_index);
533 }
534 int step = UNICHAR::utf8_step(str + str_index + length);
535 if (step == 0) step = 1;
536 length += step;
537 } while (length <= UNICHAR_LEN && str_index + length <= str_length);
538}
539
540// Gets the properties for a grapheme string, combining properties for
541// multiple characters in a meaningful way where possible.
542// Returns false if no valid match was found in the unicharset.
543// NOTE that script_id, mirror, and other_case refer to this unicharset on
544// return and will need translation if the target unicharset is different.
545bool UNICHARSET::GetStrProperties(const char* utf8_str,
546 UNICHAR_PROPERTIES* props) const {
547 props->Init();
548 props->SetRangesEmpty();
549 int total_unicodes = 0;
551 if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
552 return false; // Some part was invalid.
553 for (int i = 0; i < encoding.size(); ++i) {
554 int id = encoding[i];
555 const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
556 // Logical OR all the bools.
557 if (src_props.isalpha) props->isalpha = true;
558 if (src_props.islower) props->islower = true;
559 if (src_props.isupper) props->isupper = true;
560 if (src_props.isdigit) props->isdigit = true;
561 if (src_props.ispunctuation) props->ispunctuation = true;
562 if (src_props.isngram) props->isngram = true;
563 if (src_props.enabled) props->enabled = true;
564 // Min/max the tops/bottoms.
565 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
566 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
567 UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
568 UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
569 float bearing = props->advance + src_props.bearing;
570 if (total_unicodes == 0 || bearing < props->bearing) {
571 props->bearing = bearing;
572 props->bearing_sd = props->advance_sd + src_props.bearing_sd;
573 }
574 props->advance += src_props.advance;
575 props->advance_sd += src_props.advance_sd;
576 // With a single width, just use the widths stored in the unicharset.
577 props->width = src_props.width;
578 props->width_sd = src_props.width_sd;
579 // Use the first script id, other_case, mirror, direction.
580 // Note that these will need translation, except direction.
581 if (total_unicodes == 0) {
582 props->script_id = src_props.script_id;
583 props->other_case = src_props.other_case;
584 props->mirror = src_props.mirror;
585 props->direction = src_props.direction;
586 }
587 // The normed string for the compound character is the concatenation of
588 // the normed versions of the individual characters.
589 props->normed += src_props.normed;
590 ++total_unicodes;
591 }
592 if (total_unicodes > 1) {
593 // Estimate the total widths from the advance - bearing.
594 props->width = props->advance - props->bearing;
595 props->width_sd = props->advance_sd + props->bearing_sd;
596 }
597 return total_unicodes > 0;
598}
599
600// TODO(rays) clean-up the order of functions to match unicharset.h.
601
603 unsigned int properties = 0;
604 if (this->get_isalpha(id))
605 properties |= ISALPHA_MASK;
606 if (this->get_islower(id))
607 properties |= ISLOWER_MASK;
608 if (this->get_isupper(id))
609 properties |= ISUPPER_MASK;
610 if (this->get_isdigit(id))
611 properties |= ISDIGIT_MASK;
612 if (this->get_ispunctuation(id))
613 properties |= ISPUNCTUATION_MASK;
614 return properties;
615}
616
618 if (this->get_isupper(id)) return 'A';
619 if (this->get_islower(id)) return 'a';
620 if (this->get_isalpha(id)) return 'x';
621 if (this->get_isdigit(id)) return '0';
622 if (this->get_ispunctuation(id)) return 'p';
623 return 0;
624}
625
626void UNICHARSET::unichar_insert(const char* const unichar_repr,
627 OldUncleanUnichars old_style) {
628 if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
629 std::string cleaned =
630 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
631 if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
632 const char* str = cleaned.c_str();
633 GenericVector<int> encoding;
634 if (!old_style_included_ &&
635 encode_string(str, true, &encoding, nullptr, nullptr))
636 return;
637 if (size_used == size_reserved) {
638 if (size_used == 0)
639 reserve(8);
640 else
641 reserve(2 * size_used);
642 }
643 int index = 0;
644 do {
645 if (index >= UNICHAR_LEN) {
646 fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
647 unichar_repr);
648 return;
649 }
650 unichars[size_used].representation[index++] = *str++;
651 } while (*str != '\0');
652 unichars[size_used].representation[index] = '\0';
653 this->set_script(size_used, null_script);
654 // If the given unichar_repr represents a fragmented character, set
655 // fragment property to a pointer to CHAR_FRAGMENT class instance with
656 // information parsed from the unichar representation. Use the script
657 // of the base unichar for the fragmented character if possible.
658 CHAR_FRAGMENT* frag =
659 CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
660 this->unichars[size_used].properties.fragment = frag;
661 if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
662 this->unichars[size_used].properties.script_id =
663 this->get_script(frag->get_unichar());
664 }
665 this->unichars[size_used].properties.enabled = true;
666 ids.insert(unichars[size_used].representation, size_used);
667 ++size_used;
668 }
669}
670
671bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
672 std::string cleaned =
673 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
674 return ids.contains(cleaned.data(), cleaned.size());
675}
676
677bool UNICHARSET::contains_unichar(const char* const unichar_repr,
678 int length) const {
679 if (length == 0) {
680 return false;
681 }
682 std::string cleaned(unichar_repr, length);
683 if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
684 return ids.contains(cleaned.data(), cleaned.size());
685}
686
688 const char* const unichar_repr) const {
689 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
690}
691
693 const int kFileBufSize = 1024;
694 char buffer[kFileBufSize + 1];
695 snprintf(buffer, kFileBufSize, "%d\n", this->size());
696 *str = buffer;
697 for (UNICHAR_ID id = 0; id < this->size(); ++id) {
698 int min_bottom, max_bottom, min_top, max_top;
699 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
700 float width, width_sd;
701 get_width_stats(id, &width, &width_sd);
702 float bearing, bearing_sd;
703 get_bearing_stats(id, &bearing, &bearing_sd);
704 float advance, advance_sd;
705 get_advance_stats(id, &advance, &advance_sd);
706 unsigned int properties = this->get_properties(id);
707 if (strcmp(this->id_to_unichar(id), " ") == 0) {
708 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
709 this->get_script_from_script_id(this->get_script(id)),
710 this->get_other_case(id));
711 *str += buffer;
712 } else {
713 std::ostringstream stream;
714 stream.imbue(std::locale::classic());
715 stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
716 min_bottom << ',' << max_bottom << ',' <<
717 min_top << ',' << max_top << ',' <<
718 width << ',' << width_sd << ',' <<
719 bearing << ',' << bearing_sd << ',' <<
720 advance << ',' << advance_sd << ' ' <<
721 this->get_script_from_script_id(this->get_script(id)) << ' ' <<
722 this->get_other_case(id) << ' ' <<
723 this->get_direction(id) << ' ' <<
724 this->get_mirror(id) << ' ' <<
725 this->get_normed_unichar(id) << "\t# " <<
726 this->debug_str(id).string() << '\n';
727 *str += stream.str().c_str();
728 }
729 }
730 return true;
731}
732
733// TODO(rays) Replace with TFile everywhere.
735 public:
736 InMemoryFilePointer(const char *memory, int mem_size)
737 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
738
739 char *fgets(char *orig_dst, int size) {
740 const char *src_end = memory_ + mem_size_;
741 char *dst_end = orig_dst + size - 1;
742 if (size < 1) {
743 return fgets_ptr_ < src_end ? orig_dst : nullptr;
744 }
745
746 char *dst = orig_dst;
747 char ch = '^';
748 while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
749 ch = *dst++ = *fgets_ptr_++;
750 }
751 *dst = 0;
752 return (dst == orig_dst) ? nullptr : orig_dst;
753 }
754
755 private:
756 const char *memory_;
757 const char *fgets_ptr_;
758 const int mem_size_;
759};
760
762 const char *memory, int mem_size, bool skip_fragments) {
763 InMemoryFilePointer mem_fp(memory, mem_size);
766 bool success = load_via_fgets(fgets_cb, skip_fragments);
767 delete fgets_cb;
768 return success;
769}
770
772 public:
773 LocalFilePointer(FILE *stream) : fp_(stream) {}
774 char *fgets(char *dst, int size) {
775 return ::fgets(dst, size, fp_);
776 }
777 private:
778 FILE *fp_;
779};
780
781bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
782 LocalFilePointer lfp(file);
785 bool success = load_via_fgets(fgets_cb, skip_fragments);
786 delete fgets_cb;
787 return success;
788}
789
790bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
793 bool success = load_via_fgets(fgets_cb, skip_fragments);
794 delete fgets_cb;
795 return success;
796}
797
798bool UNICHARSET::load_via_fgets(
800 bool skip_fragments) {
801 int unicharset_size;
802 char buffer[256];
803
804 this->clear();
805 if (fgets_cb->Run(buffer, sizeof(buffer)) == nullptr ||
806 sscanf(buffer, "%d", &unicharset_size) != 1) {
807 return false;
808 }
809 this->reserve(unicharset_size);
810 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
811 char unichar[256];
812 unsigned int properties;
813 char script[64];
814
815 strncpy(script, null_script, sizeof(script) - 1);
816 int min_bottom = 0;
817 int max_bottom = UINT8_MAX;
818 int min_top = 0;
819 int max_top = UINT8_MAX;
820 float width = 0.0f;
821 float width_sd = 0.0f;
822 float bearing = 0.0f;
823 float bearing_sd = 0.0f;
824 float advance = 0.0f;
825 float advance_sd = 0.0f;
    // TODO(eger): check that this default is ok
    // after enabling BiDi iterator for Arabic.
828 int direction = UNICHARSET::U_LEFT_TO_RIGHT;
829 UNICHAR_ID other_case = unicharset_size;
830 UNICHAR_ID mirror = unicharset_size;
831 if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr) {
832 return false;
833 }
834 char normed[64];
835 normed[0] = '\0';
836 std::istringstream stream(buffer);
837 stream.imbue(std::locale::classic());
838 // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
839 //stream.flags(std::ios::hex);
840 stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
841 //stream.flags(std::ios::dec);
842 if (stream.fail()) {
843 fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
844 return false;
845 }
846 auto position = stream.tellg();
847 stream.seekg(position);
848 char c1, c2, c3, c4, c5, c6, c7, c8, c9;
849 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
850 width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
851 advance >> c9 >> advance_sd >> std::setw(63) >> script >>
852 other_case >> direction >> mirror >> std::setw(63) >> normed;
853 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
854 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
855 stream.clear();
856 stream.seekg(position);
857 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
858 width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
859 advance >> c9 >> advance_sd >> std::setw(63) >> script >>
860 other_case >> direction >> mirror;
861 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
862 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
863 stream.clear();
864 stream.seekg(position);
865 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
866 std::setw(63) >> script >> other_case >> direction >> mirror;
867 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
868 stream.clear();
869 stream.seekg(position);
870 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
871 std::setw(63) >> script >> other_case;
872 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
873 stream.clear();
874 stream.seekg(position);
875 stream >> std::setw(63) >> script >> other_case;
876 if (stream.fail()) {
877 stream.clear();
878 stream.seekg(position);
879 stream >> std::setw(63) >> script;
880 }
881 }
882 }
883 }
884 }
885
886 // Skip fragments if needed.
887 CHAR_FRAGMENT *frag = nullptr;
888 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
889 int num_pieces = frag->get_total();
890 delete frag;
891 // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
892 if (num_pieces > 1)
893 continue;
894 }
895 // Insert unichar into unicharset and set its properties.
896 if (strcmp(unichar, "NULL") == 0)
897 this->unichar_insert(" ");
898 else
900
901 this->set_isalpha(id, properties & ISALPHA_MASK);
902 this->set_islower(id, properties & ISLOWER_MASK);
903 this->set_isupper(id, properties & ISUPPER_MASK);
904 this->set_isdigit(id, properties & ISDIGIT_MASK);
905 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
906 this->set_isngram(id, false);
907 this->set_script(id, script);
908 this->unichars[id].properties.enabled = true;
909 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
910 this->set_width_stats(id, width, width_sd);
911 this->set_bearing_stats(id, bearing, bearing_sd);
912 this->set_advance_stats(id, advance, advance_sd);
913 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
914 this->set_other_case(
915 id, (other_case < unicharset_size) ? other_case : id);
916 this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
917 this->set_normed(id, normed[0] != '\0' ? normed : unichar);
918 }
920 return true;
921}
922
923// Sets up internal data after loading the file, based on the char
924// properties. Called from load_from_file, but also needs to be run
925// during set_unicharset_properties.
927 // Number of alpha chars with the case property minus those without,
928 // in order to determine that half the alpha chars have case.
929 int net_case_alphas = 0;
930 int x_height_alphas = 0;
931 int cap_height_alphas = 0;
932 top_bottom_set_ = false;
933 for (UNICHAR_ID id = 0; id < size_used; ++id) {
934 int min_bottom = 0;
935 int max_bottom = UINT8_MAX;
936 int min_top = 0;
937 int max_top = UINT8_MAX;
938 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
939 if (min_top > 0)
940 top_bottom_set_ = true;
941 if (get_isalpha(id)) {
942 if (get_islower(id) || get_isupper(id))
943 ++net_case_alphas;
944 else
945 --net_case_alphas;
946 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
947 ++x_height_alphas;
948 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
949 ++cap_height_alphas;
950 }
951 set_normed_ids(id);
952 }
953
954 script_has_upper_lower_ = net_case_alphas > 0;
955 script_has_xheight_ = script_has_upper_lower_ ||
956 (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
957 cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
958
959 null_sid_ = get_script_id_from_name(null_script);
960 ASSERT_HOST(null_sid_ == 0);
961 common_sid_ = get_script_id_from_name("Common");
962 latin_sid_ = get_script_id_from_name("Latin");
963 cyrillic_sid_ = get_script_id_from_name("Cyrillic");
964 greek_sid_ = get_script_id_from_name("Greek");
965 han_sid_ = get_script_id_from_name("Han");
966 hiragana_sid_ = get_script_id_from_name("Hiragana");
967 katakana_sid_ = get_script_id_from_name("Katakana");
968 thai_sid_ = get_script_id_from_name("Thai");
969 hangul_sid_ = get_script_id_from_name("Hangul");
970
971 // Compute default script. Use the highest-counting alpha script, that is
972 // not the common script, as that still contains some "alphas".
973 int* script_counts = new int[script_table_size_used];
974 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
975 for (int id = 0; id < size_used; ++id) {
976 if (get_isalpha(id)) {
977 ++script_counts[get_script(id)];
978 }
979 }
980 default_sid_ = 0;
981 for (int s = 1; s < script_table_size_used; ++s) {
982 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
983 default_sid_ = s;
984 }
985 delete [] script_counts;
986}
987
988// Returns true if right_to_left scripts are significant in the unicharset,
989// but without being so sensitive that "universal" unicharsets containing
990// characters from many scripts, like orientation and script detection,
991// look like they are right_to_left.
993 int ltr_count = 0;
994 int rtl_count = 0;
995 for (int id = 0; id < size_used; ++id) {
996 int dir = get_direction(id);
997 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
998 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
1000 dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
1001 }
1002 return rtl_count > ltr_count;
1003}
1004
1005// Set a whitelist and/or blacklist of characters to recognize.
1006// An empty or nullptr whitelist enables everything (minus any blacklist).
1007// An empty or nullptr blacklist disables nothing.
1008// An empty or nullptr unblacklist has no effect.
1009void UNICHARSET::set_black_and_whitelist(const char* blacklist,
1010 const char* whitelist,
1011 const char* unblacklist) {
1012 bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1013 // Set everything to default
1014 for (int ch = 0; ch < size_used; ++ch)
1015 unichars[ch].properties.enabled = def_enabled;
1016 if (!def_enabled) {
1017 // Enable the whitelist.
1019 encode_string(whitelist, false, &encoding, nullptr, nullptr);
1020 for (int i = 0; i < encoding.size(); ++i) {
1021 if (encoding[i] != INVALID_UNICHAR_ID)
1022 unichars[encoding[i]].properties.enabled = true;
1023 }
1024 }
1025 if (blacklist != nullptr && blacklist[0] != '\0') {
1026 // Disable the blacklist.
1028 encode_string(blacklist, false, &encoding, nullptr, nullptr);
1029 for (int i = 0; i < encoding.size(); ++i) {
1030 if (encoding[i] != INVALID_UNICHAR_ID)
1031 unichars[encoding[i]].properties.enabled = false;
1032 }
1033 }
1034 if (unblacklist != nullptr && unblacklist[0] != '\0') {
1035 // Re-enable the unblacklist.
1037 encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1038 for (int i = 0; i < encoding.size(); ++i) {
1039 if (encoding[i] != INVALID_UNICHAR_ID)
1040 unichars[encoding[i]].properties.enabled = true;
1041 }
1042 }
1043}
1044
1045// Returns true if there are any repeated unicodes in the normalized
1046// text of any unichar-id in the unicharset.
1048 int start_id = 0;
1050 for (int id = start_id; id < size_used; ++id) {
1051 // Convert to unicodes.
1052 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1053 for (size_t u = 1; u < unicodes.size(); ++u) {
1054 if (unicodes[u - 1] == unicodes[u]) return true;
1055 }
1056 }
1057 return false;
1058}
1059
1060int UNICHARSET::add_script(const char* script) {
1061 for (int i = 0; i < script_table_size_used; ++i) {
1062 if (strcmp(script, script_table[i]) == 0)
1063 return i;
1064 }
1065 if (script_table_size_reserved == 0) {
1066 script_table_size_reserved = 8;
1067 script_table = new char*[script_table_size_reserved];
1068 } else if (script_table_size_used >= script_table_size_reserved) {
1069 assert(script_table_size_used == script_table_size_reserved);
1070 script_table_size_reserved += script_table_size_reserved;
1071 char** new_script_table = new char*[script_table_size_reserved];
1072 memcpy(new_script_table, script_table,
1073 script_table_size_used * sizeof(char*));
1074 delete[] script_table;
1075 script_table = new_script_table;
1076 }
1077 script_table[script_table_size_used] = new char[strlen(script) + 1];
1078 strcpy(script_table[script_table_size_used], script);
1079 return script_table_size_used++;
1080}
1081
1082// Returns the string that represents a fragment
1083// with the given unichar, pos and total.
1084STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1085 bool natural) {
1086 if (total == 1) return STRING(unichar);
1087 STRING result = "";
1088 result += kSeparator;
1089 result += unichar;
1090 char buffer[kMaxLen];
1091 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1092 natural ? kNaturalFlag : kSeparator, total);
1093 result += buffer;
1094 return result;
1095}
1096
1098 const char *ptr = string;
1099 int len = strlen(string);
1100 if (len < kMinLen || *ptr != kSeparator) {
1101 return nullptr; // this string can not represent a fragment
1102 }
1103 ptr++; // move to the next character
1104 int step = 0;
1105 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1106 step += UNICHAR::utf8_step(ptr + step);
1107 }
1108 if (step == 0 || step > UNICHAR_LEN) {
1109 return nullptr; // no character for unichar or the character is too long
1110 }
1111 char unichar[UNICHAR_LEN + 1];
1112 strncpy(unichar, ptr, step);
1113 unichar[step] = '\0'; // null terminate unichar
1114 ptr += step; // move to the next fragment separator
1115 int pos = 0;
1116 int total = 0;
1117 bool natural = false;
1118 char *end_ptr = nullptr;
1119 for (int i = 0; i < 2; i++) {
1120 if (ptr > string + len || *ptr != kSeparator) {
1121 if (i == 1 && *ptr == kNaturalFlag)
1122 natural = true;
1123 else
1124 return nullptr; // Failed to parse fragment representation.
1125 }
1126 ptr++; // move to the next character
1127 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1128 : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1129 ptr = end_ptr;
1130 }
1131 if (ptr != string + len) {
1132 return nullptr; // malformed fragment representation
1133 }
1134 auto *fragment = new CHAR_FRAGMENT();
1135 fragment->set_all(unichar, pos, total, natural);
1136 return fragment;
1137}
1138
1139int UNICHARSET::get_script_id_from_name(const char* script_name) const {
1140 for (int i = 0; i < script_table_size_used; ++i) {
1141 if (strcmp(script_name, script_table[i]) == 0)
1142 return i;
1143 }
1144 return 0; // 0 is always the null_script
1145}
1146
1147// Removes/replaces content that belongs in rendered text, but not in the
1148// unicharset.
1149/* static */
1150std::string UNICHARSET::CleanupString(const char* utf8_str, size_t length) {
1151 std::string result;
1152 result.reserve(length);
1153 char ch;
1154 while ((ch = *utf8_str) != '\0' && length-- > 0) {
1155 int key_index = 0;
1156 const char* key;
1157 while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1158 int match = 0;
1159 while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1160 if (key[match] == '\0') {
1161 utf8_str += match;
1162 break;
1163 }
1164 ++key_index;
1165 }
1166 if (key == nullptr) {
1167 result.push_back(ch);
1168 ++utf8_str;
1169 } else {
1170 result.append(kCleanupMaps[key_index][1]);
1171 }
1172 }
1173 return result;
1174}
#define ASSERT_HOST(x)
Definition: errcode.h:88
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:120
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
#define UNICHAR_LEN
Definition: unichar.h:30
int UNICHAR_ID
Definition: unichar.h:34
const double kMinCapHeightFraction
Definition: unicharset.cpp:60
const double kMinXHeightFraction
Definition: unicharset.cpp:59
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
@ UNICHAR_JOINED
Definition: unicharset.h:35
@ UNICHAR_SPACE
Definition: unicharset.h:34
OldUncleanUnichars
Definition: unicharset.h:43
signed int char32
Definition: unichar.h:51
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void truncate(int size)
virtual R Run(A1, A2)=0
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:249
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194
int first_uni() const
Definition: unichar.cpp:98
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:739
InMemoryFilePointer(const char *memory, int mem_size)
Definition: unicharset.cpp:736
char * fgets(char *dst, int size)
Definition: unicharset.cpp:774
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:773
static const int kMaxLen
Definition: unicharset.h:53
int get_total() const
Definition: unicharset.h:72
STRING to_string() const
Definition: unicharset.h:79
static const int kMinLen
Definition: unicharset.h:51
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:70
static CHAR_FRAGMENT * parse_from_string(const char *str)
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:828
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:456
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:435
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:477
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:607
@ U_ARABIC_NUMBER
Definition: unicharset.h:162
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:157
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:388
void reserve(int unichars_number)
Definition: unicharset.cpp:195
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:761
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:486
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
void set_ranges_empty()
Definition: unicharset.cpp:396
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:405
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:640
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:582
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:692
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:467
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:623
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:482
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:613
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:472
int get_script_id_from_name(const char *script_name) const
int add_script(const char *script)
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:462
bool AnyRepeatedUnicodes() const
int step(const char *str) const
Definition: unicharset.cpp:233
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:617
bool major_right_to_left() const
Definition: unicharset.cpp:992
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:464
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:373
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
void post_load_setup()
Definition: unicharset.cpp:926
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:630
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:596
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
void clear()
Definition: unicharset.h:306
int size() const
Definition: unicharset.h:341
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:602
bool has_special_codes() const
Definition: unicharset.h:722
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:319
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:244
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498