tesseract 4.1.1
Loading...
Searching...
No Matches
unicharset.h
Go to the documentation of this file.
1
2// File: unicharset.h
3// Description: Unicode character/ligature set class.
4// Author: Thomas Kielbus
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20#define TESSERACT_CCUTIL_UNICHARSET_H_
21
22#include "errcode.h"
23#include "genericvector.h"
24#include "helpers.h"
25#include "serialis.h"
26#include "strngs.h"
27#include "tesscallback.h"
28#include "unichar.h"
29#include "unicharmap.h"
30
31// Enum holding special values of unichar_id. Every unicharset has these.
32// Warning! Keep in sync with kSpecialUnicharCodes.
37
39};
40
41// Boolean flag for unichar_insert. It's a bit of a double negative to allow
42// the default value to be false.
44 kFalse,
45 kTrue,
46};
47
49 public:
50 // Minimum number of characters used for fragment representation.
51 static const int kMinLen = 6;
52 // Maximum number of characters used for fragment representation.
53 static const int kMaxLen = 3 + UNICHAR_LEN + 2;
54 // Maximum number of fragments per character.
55 static const int kMaxChunks = 5;
56
57 // Setters and Getters.
58 inline void set_all(const char *unichar, int pos, int total, bool natural) {
59 set_unichar(unichar);
60 set_pos(pos);
61 set_total(total);
62 set_natural(natural);
63 }
64 inline void set_unichar(const char *uch) {
65 strncpy(this->unichar, uch, sizeof(this->unichar));
66 this->unichar[UNICHAR_LEN] = '\0';
67 }
68 inline void set_pos(int p) { this->pos = p; }
69 inline void set_total(int t) { this->total = t; }
70 inline const char* get_unichar() const { return this->unichar; }
71 inline int get_pos() const { return this->pos; }
72 inline int get_total() const { return this->total; }
73
74 // Returns the string that represents a fragment
75 // with the given unichar, pos and total.
76 static STRING to_string(const char *unichar, int pos, int total,
77 bool natural);
78 // Returns the string that represents this fragment.
79 STRING to_string() const {
80 return to_string(unichar, pos, total, natural);
81 }
82
83 // Checks whether a fragment has the same unichar,
84 // position and total as the given inputs.
85 inline bool equals(const char *other_unichar,
86 int other_pos, int other_total) const {
87 return (strcmp(this->unichar, other_unichar) == 0 &&
88 this->pos == other_pos && this->total == other_total);
89 }
90 inline bool equals(const CHAR_FRAGMENT *other) const {
91 return this->equals(other->get_unichar(),
92 other->get_pos(),
93 other->get_total());
94 }
95
96 // Checks whether a given fragment is a continuation of this fragment.
97 // Assumes that the given fragment pointer is not nullptr.
98 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
99 return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
100 this->total == fragment->get_total() &&
101 this->pos == fragment->get_pos() + 1);
102 }
103
104 // Returns true if this fragment is a beginning fragment.
105 inline bool is_beginning() const { return this->pos == 0; }
106
107 // Returns true if this fragment is an ending fragment.
108 inline bool is_ending() const { return this->pos == this->total-1; }
109
110 // Returns true if the fragment was a separate component to begin with,
111 // ie did not need chopping to be isolated, but may have been separated
112 // out from a multi-outline blob.
113 inline bool is_natural() const { return natural; }
114 void set_natural(bool value) { natural = value; }
115
116 // Parses the string to see whether it represents a character fragment
117 // (rather than a regular character). If so, allocates memory for a new
118 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
119 // information. Fragments are of the form:
120 // |m|1|2, meaning chunk 1 of 2 of character m, or
121 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
122 // to divide the parts, as they were already separate connected components.
123 //
124 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
125 // instance, otherwise (if the string does not represent a fragment or it
126 // looks like it does, but parsing it as a fragment fails) returns nullptr.
127 //
128 // Note: The caller is responsible for deallocating memory
129 // associated with the returned pointer.
130 static CHAR_FRAGMENT *parse_from_string(const char *str);
131
132 private:
133 char unichar[UNICHAR_LEN + 1];
134 // True if the fragment was a separate component to begin with,
135 // ie did not need chopping to be isolated, but may have been separated
136 // out from a multi-outline blob.
137 bool natural;
138 int16_t pos; // fragment position in the character
139 int16_t total; // total number of fragments in the character
140};
141
142// The UNICHARSET class is a utility class for Tesseract that holds the
143// set of characters that are used by the engine. Each character is identified
144// by a unique number, from 0 to (size - 1).
146 public:
147 // Custom list of characters and their ligature forms (UTF8)
148 // These map to unicode values in the private use area (PUC) and are supported
149 // by only few font families (eg. Wyld, Adobe Caslon Pro).
150 static TESS_API const char* kCustomLigatures[][2];
151
152 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
154
155 // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
180#ifndef U_HIDE_DEPRECATED_API
182#endif // U_HIDE_DEPRECATED_API
183 };
184
185 // Create an empty UNICHARSET
186 UNICHARSET();
187
188 ~UNICHARSET();
189
190 // Return the UNICHAR_ID of a given unichar representation within the
191 // UNICHARSET.
192 UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
193
194 // Return the UNICHAR_ID of a given unichar representation within the
195 // UNICHARSET. Only the first length characters from unichar_repr are used.
196 UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
197
198 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
199 // while leaving the rest of the string encodable. Returns 0 if the
200 // beginning of the string is not encodable.
201 // WARNING: this function now encodes the whole string for precision.
202 // Use encode_string in preference to repeatedly calling step.
203 int step(const char* str) const;
204
205 // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
206 // If not encodable, write the first byte offset which cannot be converted
207 // into the second (return) argument.
208 bool encodable_string(const char *str, int *first_bad_position) const;
209
210 // Encodes the given UTF-8 string with this UNICHARSET.
211 // Any part of the string that cannot be encoded (because the utf8 can't
212 // be broken up into pieces that are in the unicharset) then:
213 // if give_up_on_failure, stops and returns a partial encoding,
214 // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
215 // Returns true if the encoding succeeds completely, false if there is at
216 // least one failure.
217 // If lengths is not nullptr, then it is filled with the corresponding
218 // byte length of each encoded UNICHAR_ID.
219 // If encoded_length is not nullptr then on return it contains the length of
220 // str that was encoded. (if give_up_on_failure the location of the first
221 // failure, otherwise strlen(str).)
222 // WARNING: Caller must guarantee that str has already been cleaned of codes
223 // that do not belong in the unicharset, or encoding may fail.
224 // Use CleanupString to perform the cleaning.
225 bool encode_string(const char* str, bool give_up_on_failure,
227 GenericVector<char>* lengths,
228 int* encoded_length) const;
229
230 // Return the unichar representation corresponding to the given UNICHAR_ID
231 // within the UNICHARSET.
232 const char* id_to_unichar(UNICHAR_ID id) const;
233
234 // Return the UTF8 representation corresponding to the given UNICHAR_ID after
235 // resolving any private encodings internal to Tesseract. This method is
236 // preferable to id_to_unichar for outputting text that will be visible to
237 // external applications.
238 const char* id_to_unichar_ext(UNICHAR_ID id) const;
239
240 // Return a STRING that reformats the utf8 str into the str followed
241 // by its hex unicodes.
242 static STRING debug_utf8_str(const char* str);
243
244 // Removes/replaces content that belongs in rendered text, but not in the
245 // unicharset.
246 static std::string CleanupString(const char* utf8_str) {
247 return CleanupString(utf8_str, strlen(utf8_str));
248 }
249 static std::string CleanupString(const char* utf8_str, size_t length);
250
251 // Return a STRING containing debug information on the unichar, including
252 // the id_to_unichar, its hex unicodes and the properties.
253 STRING debug_str(UNICHAR_ID id) const;
254 STRING debug_str(const char * unichar_repr) const {
255 return debug_str(unichar_to_id(unichar_repr));
256 }
257
258 // Adds a unichar representation to the set. If old_style is true, then
259 // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
260 // characters are ignored/skipped as if they don't exist and n-grams that
261 // can already be encoded are not added.
262 void unichar_insert(const char* const unichar_repr,
263 OldUncleanUnichars old_style);
264 void unichar_insert(const char* const unichar_repr) {
265 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
266 }
267 // Adds a unichar representation to the set. Avoids setting old_style to true,
268 // unless it is necessary to make the new unichar get added.
269 void unichar_insert_backwards_compatible(const char* const unichar_repr) {
270 std::string cleaned = CleanupString(unichar_repr);
271 if (cleaned != unichar_repr) {
272 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
273 } else {
274 int old_size = size();
275 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
276 if (size() == old_size) {
277 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
278 }
279 }
280 }
281
282 // Return true if the given unichar id exists within the set.
283 // Relies on the fact that unichar ids are contiguous in the unicharset.
284 bool contains_unichar_id(UNICHAR_ID unichar_id) const {
285 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
286 unichar_id >= 0;
287 }
288
289 // Return true if the given unichar representation exists within the set.
290 bool contains_unichar(const char* const unichar_repr) const;
291 bool contains_unichar(const char* const unichar_repr, int length) const;
292
293 // Return true if the given unichar representation corresponds to the given
294 // UNICHAR_ID within the set.
295 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
296
297 // Delete CHAR_FRAGMENTs stored in properties of unichars array.
299 for (int i = 0; i < size_used; ++i) {
300 delete unichars[i].properties.fragment;
301 unichars[i].properties.fragment = nullptr;
302 }
303 }
304
305 // Clear the UNICHARSET (all the previous data is lost).
306 void clear() {
307 if (script_table != nullptr) {
308 for (int i = 0; i < script_table_size_used; ++i)
309 delete[] script_table[i];
310 delete[] script_table;
311 script_table = nullptr;
312 script_table_size_used = 0;
313 }
314 if (unichars != nullptr) {
316 delete[] unichars;
317 unichars = nullptr;
318 }
319 script_table_size_reserved = 0;
320 size_reserved = 0;
321 size_used = 0;
322 ids.clear();
323 top_bottom_set_ = false;
324 script_has_upper_lower_ = false;
325 script_has_xheight_ = false;
326 old_style_included_ = false;
327 null_sid_ = 0;
328 common_sid_ = 0;
329 latin_sid_ = 0;
330 cyrillic_sid_ = 0;
331 greek_sid_ = 0;
332 han_sid_ = 0;
333 hiragana_sid_ = 0;
334 katakana_sid_ = 0;
335 thai_sid_ = 0;
336 hangul_sid_ = 0;
337 default_sid_ = 0;
338 }
339
340 // Return the size of the set (the number of different UNICHAR it holds).
341 int size() const {
342 return size_used;
343 }
344
345 // Reserve enough memory space for the given number of UNICHARS
346 void reserve(int unichars_number);
347
348 // Opens the file indicated by filename and saves unicharset to that file.
349 // Returns true if the operation is successful.
350 bool save_to_file(const char * const filename) const {
351 FILE* file = fopen(filename, "w+b");
352 if (file == nullptr) return false;
353 bool result = save_to_file(file);
354 fclose(file);
355 return result;
356 }
357
358 // Saves the content of the UNICHARSET to the given file.
359 // Returns true if the operation is successful.
360 bool save_to_file(FILE *file) const {
361 STRING str;
362 return save_to_string(&str) &&
363 tesseract::Serialize(file, &str[0], str.length());
364 }
365
366 bool save_to_file(tesseract::TFile *file) const {
367 STRING str;
368 return save_to_string(&str) && file->Serialize(&str[0], str.length());
369 }
370
371 // Saves the content of the UNICHARSET to the given STRING.
372 // Returns true if the operation is successful.
373 bool save_to_string(STRING *str) const;
374
375 // Load a unicharset from a unicharset file that has been loaded into
376 // the given memory buffer.
377 // Returns true if the operation is successful.
378 bool load_from_inmemory_file(const char* const memory, int mem_size,
379 bool skip_fragments);
380 // Returns true if the operation is successful.
381 bool load_from_inmemory_file(const char* const memory, int mem_size) {
382 return load_from_inmemory_file(memory, mem_size, false);
383 }
384
385 // Opens the file indicated by filename and loads the UNICHARSET
386 // from the given file. The previous data is lost.
387 // Returns true if the operation is successful.
388 bool load_from_file(const char* const filename, bool skip_fragments) {
389 FILE* file = fopen(filename, "rb");
390 if (file == nullptr) return false;
391 bool result = load_from_file(file, skip_fragments);
392 fclose(file);
393 return result;
394 }
395 // returns true if the operation is successful.
396 bool load_from_file(const char* const filename) {
397 return load_from_file(filename, false);
398 }
399
400 // Loads the UNICHARSET from the given file. The previous data is lost.
401 // Returns true if the operation is successful.
402 bool load_from_file(FILE *file, bool skip_fragments);
403 bool load_from_file(FILE *file) { return load_from_file(file, false); }
404 bool load_from_file(tesseract::TFile *file, bool skip_fragments);
405
406
407 // Sets up internal data after loading the file, based on the char
408 // properties. Called from load_from_file, but also needs to be run
409 // during set_unicharset_properties.
410 void post_load_setup();
411
412 // Returns true if right_to_left scripts are significant in the unicharset,
413 // but without being so sensitive that "universal" unicharsets containing
414 // characters from many scripts, like orientation and script detection,
415 // look like they are right_to_left.
416 bool major_right_to_left() const;
417
418 // Set a whitelist and/or blacklist of characters to recognize.
419 // An empty or nullptr whitelist enables everything (minus any blacklist).
420 // An empty or nullptr blacklist disables nothing.
421 // An empty or nullptr unblacklist has no effect.
422 // The blacklist overrides the whitelist.
423 // The unblacklist overrides the blacklist.
424 // Each list is a string of utf8 character strings. Boundaries between
425 // unicharset units are worked out automatically, and characters not in
426 // the unicharset are silently ignored.
427 void set_black_and_whitelist(const char* blacklist, const char* whitelist,
428 const char* unblacklist);
429
430 // Set the isalpha property of the given unichar to the given value.
431 void set_isalpha(UNICHAR_ID unichar_id, bool value) {
432 unichars[unichar_id].properties.isalpha = value;
433 }
434
435 // Set the islower property of the given unichar to the given value.
436 void set_islower(UNICHAR_ID unichar_id, bool value) {
437 unichars[unichar_id].properties.islower = value;
438 }
439
440 // Set the isupper property of the given unichar to the given value.
441 void set_isupper(UNICHAR_ID unichar_id, bool value) {
442 unichars[unichar_id].properties.isupper = value;
443 }
444
445 // Set the isdigit property of the given unichar to the given value.
446 void set_isdigit(UNICHAR_ID unichar_id, bool value) {
447 unichars[unichar_id].properties.isdigit = value;
448 }
449
450 // Set the ispunctuation property of the given unichar to the given value.
451 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
452 unichars[unichar_id].properties.ispunctuation = value;
453 }
454
455 // Set the isngram property of the given unichar to the given value.
456 void set_isngram(UNICHAR_ID unichar_id, bool value) {
457 unichars[unichar_id].properties.isngram = value;
458 }
459
460 // Set the script name of the given unichar to the given value.
461 // Value is copied and thus can be a temporary.
462 void set_script(UNICHAR_ID unichar_id, const char* value) {
463 unichars[unichar_id].properties.script_id = add_script(value);
464 }
465
466 // Set other_case unichar id in the properties for the given unichar id.
467 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
468 unichars[unichar_id].properties.other_case = other_case;
469 }
470
471 // Set the direction property of the given unichar to the given value.
473 unichars[unichar_id].properties.direction = value;
474 }
475
476 // Set mirror unichar id in the properties for the given unichar id.
477 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
478 unichars[unichar_id].properties.mirror = mirror;
479 }
480
481 // Record normalized version of unichar with the given unichar_id.
482 void set_normed(UNICHAR_ID unichar_id, const char* normed) {
483 unichars[unichar_id].properties.normed = normed;
484 unichars[unichar_id].properties.normed_ids.truncate(0);
485 }
486 // Sets the normed_ids vector from the normed string. normed_ids is not
487 // stored in the file, and needs to be set when the UNICHARSET is loaded.
488 void set_normed_ids(UNICHAR_ID unichar_id);
489
490 // Return the isalpha property of the given unichar.
491 bool get_isalpha(UNICHAR_ID unichar_id) const {
492 if (INVALID_UNICHAR_ID == unichar_id) return false;
493 ASSERT_HOST(contains_unichar_id(unichar_id));
494 return unichars[unichar_id].properties.isalpha;
495 }
496
497 // Return the islower property of the given unichar.
498 bool get_islower(UNICHAR_ID unichar_id) const {
499 if (INVALID_UNICHAR_ID == unichar_id) return false;
500 ASSERT_HOST(contains_unichar_id(unichar_id));
501 return unichars[unichar_id].properties.islower;
502 }
503
504 // Return the isupper property of the given unichar.
505 bool get_isupper(UNICHAR_ID unichar_id) const {
506 if (INVALID_UNICHAR_ID == unichar_id) return false;
507 ASSERT_HOST(contains_unichar_id(unichar_id));
508 return unichars[unichar_id].properties.isupper;
509 }
510
511 // Return the isdigit property of the given unichar.
512 bool get_isdigit(UNICHAR_ID unichar_id) const {
513 if (INVALID_UNICHAR_ID == unichar_id) return false;
514 ASSERT_HOST(contains_unichar_id(unichar_id));
515 return unichars[unichar_id].properties.isdigit;
516 }
517
518 // Return the ispunctuation property of the given unichar.
519 bool get_ispunctuation(UNICHAR_ID unichar_id) const {
520 if (INVALID_UNICHAR_ID == unichar_id) return false;
521 ASSERT_HOST(contains_unichar_id(unichar_id));
522 return unichars[unichar_id].properties.ispunctuation;
523 }
524
525 // Return the isngram property of the given unichar.
526 bool get_isngram(UNICHAR_ID unichar_id) const {
527 if (INVALID_UNICHAR_ID == unichar_id) return false;
528 ASSERT_HOST(contains_unichar_id(unichar_id));
529 return unichars[unichar_id].properties.isngram;
530 }
531
532 // Returns whether the unichar id represents a unicode value in the private
533 // use area.
534 bool get_isprivate(UNICHAR_ID unichar_id) const;
535
536 // Returns true if the ids have useful min/max top/bottom values.
537 bool top_bottom_useful() const {
538 return top_bottom_set_;
539 }
540 // Sets all ranges to empty, so they can be expanded to set the values.
541 void set_ranges_empty();
542 // Sets all the properties for this unicharset given a src_unicharset with
543 // everything set. The unicharsets don't have to be the same, and graphemes
544 // are correctly accounted for.
547 }
548 // Sets properties from Other, starting only at the given index.
549 void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
550 // Expands the tops and bottoms and widths for this unicharset given a
551 // src_unicharset with ranges in it. The unicharsets don't have to be the
552 // same, and graphemes are correctly accounted for.
553 void ExpandRangesFromOther(const UNICHARSET& src);
554 // Makes this a copy of src. Clears this completely first, so the automatic
555 // ids will not be present in this if not in src.
556 void CopyFrom(const UNICHARSET& src);
557 // For each id in src, if it does not occur in this, add it, as in
558 // SetPropertiesFromOther, otherwise expand the ranges, as in
559 // ExpandRangesFromOther.
560 void AppendOtherUnicharset(const UNICHARSET& src);
561 // Returns true if the acceptable ranges of the tops of the characters do
562 // not overlap, making their x-height calculations distinct.
563 bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
564 // Returns the min and max bottom and top of the given unichar in
565 // baseline-normalized coordinates, ie, where the baseline is
566 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
567 // (See normalis.h for the definitions).
568 void get_top_bottom(UNICHAR_ID unichar_id,
569 int* min_bottom, int* max_bottom,
570 int* min_top, int* max_top) const {
571 if (INVALID_UNICHAR_ID == unichar_id) {
572 *min_bottom = *min_top = 0;
573 *max_bottom = *max_top = 256; // kBlnCellHeight
574 return;
575 }
576 ASSERT_HOST(contains_unichar_id(unichar_id));
577 *min_bottom = unichars[unichar_id].properties.min_bottom;
578 *max_bottom = unichars[unichar_id].properties.max_bottom;
579 *min_top = unichars[unichar_id].properties.min_top;
580 *max_top = unichars[unichar_id].properties.max_top;
581 }
582 void set_top_bottom(UNICHAR_ID unichar_id,
583 int min_bottom, int max_bottom,
584 int min_top, int max_top) {
585 unichars[unichar_id].properties.min_bottom =
586 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
587 unichars[unichar_id].properties.max_bottom =
588 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
589 unichars[unichar_id].properties.min_top =
590 ClipToRange<int>(min_top, 0, UINT8_MAX);
591 unichars[unichar_id].properties.max_top =
592 ClipToRange<int>(max_top, 0, UINT8_MAX);
593 }
594 // Returns the width stats (as mean, sd) of the given unichar relative to the
595 // median advance of all characters in the character set.
597 float* width, float* width_sd) const {
598 if (INVALID_UNICHAR_ID == unichar_id) {
599 *width = 0.0f;
600 *width_sd = 0.0f;;
601 return;
602 }
603 ASSERT_HOST(contains_unichar_id(unichar_id));
604 *width = unichars[unichar_id].properties.width;
605 *width_sd = unichars[unichar_id].properties.width_sd;
606 }
607 void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
608 unichars[unichar_id].properties.width = width;
609 unichars[unichar_id].properties.width_sd = width_sd;
610 }
611 // Returns the stats of the x-bearing (as mean, sd) of the given unichar
612 // relative to the median advance of all characters in the character set.
614 float* bearing, float* bearing_sd) const {
615 if (INVALID_UNICHAR_ID == unichar_id) {
616 *bearing = *bearing_sd = 0.0f;
617 return;
618 }
619 ASSERT_HOST(contains_unichar_id(unichar_id));
620 *bearing = unichars[unichar_id].properties.bearing;
621 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
622 }
624 float bearing, float bearing_sd) {
625 unichars[unichar_id].properties.bearing = bearing;
626 unichars[unichar_id].properties.bearing_sd = bearing_sd;
627 }
628 // Returns the stats of the x-advance of the given unichar (as mean, sd)
629 // relative to the median advance of all characters in the character set.
631 float* advance, float* advance_sd) const {
632 if (INVALID_UNICHAR_ID == unichar_id) {
633 *advance = *advance_sd = 0;
634 return;
635 }
636 ASSERT_HOST(contains_unichar_id(unichar_id));
637 *advance = unichars[unichar_id].properties.advance;
638 *advance_sd = unichars[unichar_id].properties.advance_sd;
639 }
641 float advance, float advance_sd) {
642 unichars[unichar_id].properties.advance = advance;
643 unichars[unichar_id].properties.advance_sd = advance_sd;
644 }
645 // Returns true if the font metrics properties are empty.
646 bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
647 return unichars[unichar_id].properties.AnyRangeEmpty();
648 }
649
650 // Returns true if the script of the given id is space delimited.
651 // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.
652 bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
653 if (INVALID_UNICHAR_ID == unichar_id) return true;
654 int script_id = get_script(unichar_id);
655 return script_id != han_sid_ && script_id != thai_sid_ &&
656 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
657 script_id != katakana_sid_;
658 }
659
660 // Return the script id of the given unichar.
661 // The result is an integer id, not a string; pass it to
662 // get_script_from_script_id to obtain the script name.
663 int get_script(UNICHAR_ID unichar_id) const {
664 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
665 ASSERT_HOST(contains_unichar_id(unichar_id));
666 return unichars[unichar_id].properties.script_id;
667 }
668
669 // Return the character properties, eg. alpha/upper/lower/digit/punct,
670 // as a bit field of unsigned int.
671 unsigned int get_properties(UNICHAR_ID unichar_id) const;
672
673 // Return the character property as a single char. If a character has
674 // multiple attributes, the main property is defined by the following order:
675 // upper_case : 'A'
676 // lower_case : 'a'
677 // alpha : 'x'
678 // digit : '0'
679 // punctuation: 'p'
680 char get_chartype(UNICHAR_ID unichar_id) const;
681
682 // Get other_case unichar id in the properties for the given unichar id.
684 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
685 ASSERT_HOST(contains_unichar_id(unichar_id));
686 return unichars[unichar_id].properties.other_case;
687 }
688
689 // Returns the direction property of the given unichar.
691 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
692 ASSERT_HOST(contains_unichar_id(unichar_id));
693 return unichars[unichar_id].properties.direction;
694 }
695
696 // Get mirror unichar id in the properties for the given unichar id.
697 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
698 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
699 ASSERT_HOST(contains_unichar_id(unichar_id));
700 return unichars[unichar_id].properties.mirror;
701 }
702
703 // Returns UNICHAR_ID of the corresponding lower-case unichar.
704 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
705 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
706 ASSERT_HOST(contains_unichar_id(unichar_id));
707 if (unichars[unichar_id].properties.islower) return unichar_id;
708 return unichars[unichar_id].properties.other_case;
709 }
710
711 // Returns UNICHAR_ID of the corresponding upper-case unichar.
712 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
713 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
714 ASSERT_HOST(contains_unichar_id(unichar_id));
715 if (unichars[unichar_id].properties.isupper) return unichar_id;
716 return unichars[unichar_id].properties.other_case;
717 }
718
719 // Returns true if this UNICHARSET has the special codes in
720 // SpecialUnicharCodes available. If false then there are normal unichars
721 // at these codes and they should not be used.
722 bool has_special_codes() const {
723 return get_fragment(UNICHAR_BROKEN) != nullptr &&
726 }
727
728 // Returns true if there are any repeated unicodes in the normalized
729 // text of any unichar-id in the unicharset.
730 bool AnyRepeatedUnicodes() const;
731
732 // Return a pointer to the CHAR_FRAGMENT class if the given
733 // unichar id represents a character fragment.
734 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
735 if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
736 ASSERT_HOST(contains_unichar_id(unichar_id));
737 return unichars[unichar_id].properties.fragment;
738 }
739
740 // Return the isalpha property of the given unichar representation.
741 bool get_isalpha(const char* const unichar_repr) const {
742 return get_isalpha(unichar_to_id(unichar_repr));
743 }
744
745 // Return the islower property of the given unichar representation.
746 bool get_islower(const char* const unichar_repr) const {
747 return get_islower(unichar_to_id(unichar_repr));
748 }
749
750 // Return the isupper property of the given unichar representation.
751 bool get_isupper(const char* const unichar_repr) const {
752 return get_isupper(unichar_to_id(unichar_repr));
753 }
754
755 // Return the isdigit property of the given unichar representation.
756 bool get_isdigit(const char* const unichar_repr) const {
757 return get_isdigit(unichar_to_id(unichar_repr));
758 }
759
760 // Return the ispunctuation property of the given unichar representation.
761 bool get_ispunctuation(const char* const unichar_repr) const {
762 return get_ispunctuation(unichar_to_id(unichar_repr));
763 }
764
765 // Return the character properties, eg. alpha/upper/lower/digit/punct,
766 // of the given unichar representation
767 unsigned int get_properties(const char* const unichar_repr) const {
768 return get_properties(unichar_to_id(unichar_repr));
769 }
770
771 char get_chartype(const char* const unichar_repr) const {
772 return get_chartype(unichar_to_id(unichar_repr));
773 }
774
775 // Return the script id of the given unichar representation.
776 // The result is an integer id, not a string; pass it to
777 // get_script_from_script_id to obtain the script name.
778 int get_script(const char* const unichar_repr) const {
779 return get_script(unichar_to_id(unichar_repr));
780 }
781
782 // Return a pointer to the CHAR_FRAGMENT class struct if the given
783 // unichar representation represents a character fragment.
784 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
785 if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
786 !ids.contains(unichar_repr, false)) {
787 return nullptr;
788 }
789 return get_fragment(unichar_to_id(unichar_repr));
790 }
791
792 // Return the isalpha property of the given unichar representation.
793 // Only the first length characters from unichar_repr are used.
794 bool get_isalpha(const char* const unichar_repr,
795 int length) const {
796 return get_isalpha(unichar_to_id(unichar_repr, length));
797 }
798
799 // Return the islower property of the given unichar representation.
800 // Only the first length characters from unichar_repr are used.
801 bool get_islower(const char* const unichar_repr,
802 int length) const {
803 return get_islower(unichar_to_id(unichar_repr, length));
804 }
805
806 // Return the isupper property of the given unichar representation.
807 // Only the first length characters from unichar_repr are used.
808 bool get_isupper(const char* const unichar_repr,
809 int length) const {
810 return get_isupper(unichar_to_id(unichar_repr, length));
811 }
812
813 // Return the isdigit property of the given unichar representation.
814 // Only the first length characters from unichar_repr are used.
815 bool get_isdigit(const char* const unichar_repr,
816 int length) const {
817 return get_isdigit(unichar_to_id(unichar_repr, length));
818 }
819
820 // Return the ispunctuation property of the given unichar representation.
821 // Only the first length characters from unichar_repr are used.
822 bool get_ispunctuation(const char* const unichar_repr,
823 int length) const {
824 return get_ispunctuation(unichar_to_id(unichar_repr, length));
825 }
826
827 // Returns normalized version of unichar with the given unichar_id.
828 const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
829 if (unichar_id == UNICHAR_SPACE) return " ";
830 return unichars[unichar_id].properties.normed.string();
831 }
832 // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
833 // version of the given id. There may be more than one UNICHAR_ID in the
834 // vector if unichar_id represents a ligature.
836 return unichars[unichar_id].properties.normed_ids;
837 }
838
839 // Return the script name of the given unichar representation.
840 // Only the first length characters from unichar_repr are used.
841 // The returned pointer will always be the same for the same script, it's
842 // managed by unicharset and thus MUST NOT be deleted
843 int get_script(const char* const unichar_repr,
844 int length) const {
845 return get_script(unichar_to_id(unichar_repr, length));
846 }
847
848 // Return the (current) number of scripts in the script table
850 return script_table_size_used;
851 }
852
853 // Return the script string from its id
854 const char* get_script_from_script_id(int id) const {
855 if (id >= script_table_size_used || id < 0)
856 return null_script;
857 return script_table[id];
858 }
859
860 // Returns the id from the name of the script, or 0 if script is not found.
861 // Note that this is an expensive operation since it involves iteratively
862 // comparing strings in the script table. To avoid dependency on STL, we
863 // won't use a hash. Instead, the calling function can use this to lookup
864 // and save the ID for relevant scripts for fast comparisons later.
865 int get_script_id_from_name(const char* script_name) const;
866
867 // Return true if the given script is the null script
868 bool is_null_script(const char* script) const {
869 return script == null_script;
870 }
871
872 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
873 // then the returned value (an int, not a pointer) will be the same.
874 // The script parameter is copied and thus can be a temporary.
875 int add_script(const char* script);
876
877 // Return the enabled property of the given unichar.
878 bool get_enabled(UNICHAR_ID unichar_id) const {
879 ASSERT_HOST(contains_unichar_id(unichar_id));
880 return unichars[unichar_id].properties.enabled;
881 }
882
883
884 int null_sid() const { return null_sid_; }
885 int common_sid() const { return common_sid_; }
886 int latin_sid() const { return latin_sid_; }
887 int cyrillic_sid() const { return cyrillic_sid_; }
888 int greek_sid() const { return greek_sid_; }
889 int han_sid() const { return han_sid_; }
890 int hiragana_sid() const { return hiragana_sid_; }
891 int katakana_sid() const { return katakana_sid_; }
892 int thai_sid() const { return thai_sid_; }
893 int hangul_sid() const { return hangul_sid_; }
894 int default_sid() const { return default_sid_; }
895
896 // Returns true if the unicharset has the concept of upper/lower case.
898 return script_has_upper_lower_;
899 }
900 // Returns true if the unicharset has the concept of x-height.
901 // script_has_xheight can be true even if script_has_upper_lower is not,
902 // when the script has a sufficiently predominant top line with ascenders,
903 // such as Devanagari and Thai.
904 bool script_has_xheight() const {
905 return script_has_xheight_;
906 }
907
908 private:
909
910 struct UNICHAR_PROPERTIES {
911 UNICHAR_PROPERTIES();
912 // Initializes all properties to sensible default values.
913 void Init();
914 // Sets all ranges wide open. Initialization default in case there are
915 // no useful values available.
916 void SetRangesOpen();
917 // Sets all ranges to empty. Used before expanding with font-based data.
918 void SetRangesEmpty();
919 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
920 // is empty.
921 bool AnyRangeEmpty() const;
922 // Expands the ranges with the ranges from the src properties.
923 void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
924 // Copies the properties from src into this.
925 void CopyFrom(const UNICHAR_PROPERTIES& src);
926
927 bool isalpha;
928 bool islower;
929 bool isupper;
930 bool isdigit;
931 bool ispunctuation;
932 bool isngram;
933 bool enabled;
934 // Possible limits of the top and bottom of the bounding box in
935 // baseline-normalized coordinates, ie, where the baseline is
936 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
937 // (See normalis.h for the definitions).
938 uint8_t min_bottom;
939 uint8_t max_bottom;
940 uint8_t min_top;
941 uint8_t max_top;
942 // Statstics of the widths of bounding box, relative to the median advance.
943 float width;
944 float width_sd;
945 // Stats of the x-bearing and advance, also relative to the median advance.
946 float bearing;
947 float bearing_sd;
948 float advance;
949 float advance_sd;
950 int script_id;
951 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
952 Direction direction; // direction of this unichar
953 // Mirror property is useful for reverse DAWG lookup for words in
954 // right-to-left languages (e.g. "(word)" would be in
955 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
956 // However, what we want in our DAWG is
957 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
958 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
959 UNICHAR_ID mirror;
960 // A string of unichar_ids that represent the corresponding normed string.
961 // For awkward characters like em-dash, this gives hyphen.
962 // For ligatures, this gives the string of normal unichars.
963 GenericVector<UNICHAR_ID> normed_ids;
964 STRING normed; // normalized version of this unichar
965 // Contains meta information about the fragment if a unichar represents
966 // a fragment of a character, otherwise should be set to nullptr.
967 // It is assumed that character fragments are added to the unicharset
968 // after the corresponding 'base' characters.
969 CHAR_FRAGMENT *fragment;
970 };
971
972 struct UNICHAR_SLOT {  // Element type of the `unichars` array.
973 char representation[UNICHAR_LEN + 1];  // Buffer of UNICHAR_LEN + 1 chars; presumably the unichar's string plus a NUL - confirm.
974 UNICHAR_PROPERTIES properties;  // Properties read by accessors such as get_enabled.
975 };
976
977 // Internal recursive version of encode_string above.
978 // str is the start of the whole string.
979 // str_index is the current position in str.
980 // str_length is the length of str.
981 // encoding is a working encoding of str.
982 // lengths is a working set of lengths of each element of encoding.
983 // best_total_length is the longest length of str that has been successfully
984 // encoded so far.
985 // On return:
986 // best_encoding contains the encoding that used the longest part of str.
987 // best_lengths (may be null) contains the lengths of best_encoding.
988 void encode_string(const char* str, int str_index, int str_length,
990 GenericVector<char>* lengths,
991 int* best_total_length,
992 GenericVector<UNICHAR_ID>* best_encoding,
993 GenericVector<char>* best_lengths) const;
994
995 // Gets the properties for a grapheme string, combining properties for
996 // multiple characters in a meaningful way where possible.
997 // Returns false if no valid match was found in the unicharset.
998 // NOTE that script_id, mirror, and other_case refer to this unicharset on
999 // return and will need redirecting if the target unicharset is different.
1000 bool GetStrProperties(const char* utf8_str,
1001 UNICHAR_PROPERTIES* props) const;
1002
1003 // Load ourselves from a "file" where our only interface to the file is
1004 // an implementation of fgets(). This is the parsing primitive accessed by
1005 // the public routines load_from_file() and load_from_inmemory_file().
1006 bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
1007 bool skip_fragments);
1008
1009 // List of mappings to make when ingesting strings from the outside.
1010 // The substitutions clean up text that should exist for rendering of
1011 // synthetic data, but not in the recognition set.
1012 static const char* kCleanupMaps[][2];
1013 static TESS_API const char* null_script;  // Result of get_script_from_script_id for out-of-range ids; is_null_script compares against it by pointer.
1014
1015 UNICHAR_SLOT* unichars;  // Slot array indexed by unichar id (e.g. unichars[unichar_id].properties).
1016 UNICHARMAP ids;  // Lookup over unichar representations; presumably maps representation -> id - confirm.
1017 int size_used;  // Presumably the number of slots of unichars in use - confirm.
1018 int size_reserved;  // Presumably the allocated capacity of unichars - confirm.
1019 char** script_table;  // script_table[id] is the script string for script id `id`.
1020 int script_table_size_used;  // Current number of scripts in script_table.
1021 int script_table_size_reserved;  // Presumably the allocated capacity of script_table - confirm.
1022 // True if the unichars have their tops/bottoms set.
1023 bool top_bottom_set_;
1024 // True if the unicharset has significant upper/lower case chars.
1025 bool script_has_upper_lower_;
1026 // True if the unicharset has a significant mean-line with significant
1027 // ascenders above that.
1028 bool script_has_xheight_;
1029 // True if the set contains chars that would be changed by the cleanup.
1030 bool old_style_included_;
1031
1032 // A few convenient script name-to-id mappings without using a hash.
1033 // These are initialized when the unicharset file is loaded. Anything
1034 // missing from this list can be looked up using get_script_id_from_name.
1035 int null_sid_;
1036 int common_sid_;
1037 int latin_sid_;
1038 int cyrillic_sid_;
1039 int greek_sid_;
1040 int han_sid_;
1041 int hiragana_sid_;
1042 int katakana_sid_;
1043 int thai_sid_;
1044 int hangul_sid_;
1045 // The most frequently occurring script in the charset.
1046 int default_sid_;
1047};
1048
1049#endif // TESSERACT_CCUTIL_UNICHARSET_H_
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define TESS_API
Definition: platform.h:54
#define UNICHAR_LEN
Definition: unichar.h:30
int UNICHAR_ID
Definition: unichar.h:34
SpecialUnicharCodes
Definition: unicharset.h:33
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
@ UNICHAR_BROKEN
Definition: unicharset.h:36
@ UNICHAR_JOINED
Definition: unicharset.h:35
@ UNICHAR_SPACE
Definition: unicharset.h:34
OldUncleanUnichars
Definition: unicharset.h:43
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
void clear()
Definition: unicharmap.cpp:115
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
void set_unichar(const char *uch)
Definition: unicharset.h:64
bool is_beginning() const
Definition: unicharset.h:105
static const int kMaxLen
Definition: unicharset.h:53
static const int kMaxChunks
Definition: unicharset.h:55
void set_natural(bool value)
Definition: unicharset.h:114
int get_total() const
Definition: unicharset.h:72
STRING to_string() const
Definition: unicharset.h:79
static const int kMinLen
Definition: unicharset.h:51
void set_pos(int p)
Definition: unicharset.h:68
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:90
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
const char * get_unichar() const
Definition: unicharset.h:70
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:58
int get_pos() const
Definition: unicharset.h:71
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:85
void set_total(int t)
Definition: unicharset.h:69
bool is_natural() const
Definition: unicharset.h:113
bool is_ending() const
Definition: unicharset.h:108
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:828
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:778
bool load_from_inmemory_file(const char *const memory, int mem_size)
Definition: unicharset.h:381
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
int hiragana_sid() const
Definition: unicharset.h:890
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:456
bool is_null_script(const char *script) const
Definition: unicharset.h:868
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:435
int katakana_sid() const
Definition: unicharset.h:891
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:741
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:477
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:607
@ U_BOUNDARY_NEUTRAL
Definition: unicharset.h:175
@ U_POP_DIRECTIONAL_ISOLATE
Definition: unicharset.h:179
@ U_SEGMENT_SEPARATOR
Definition: unicharset.h:165
@ U_ARABIC_NUMBER
Definition: unicharset.h:162
@ U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
@ U_POP_DIRECTIONAL_FORMAT
Definition: unicharset.h:173
@ U_WHITE_SPACE_NEUTRAL
Definition: unicharset.h:166
@ U_OTHER_NEUTRAL
Definition: unicharset.h:167
@ U_FIRST_STRONG_ISOLATE
Definition: unicharset.h:176
@ U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
@ U_RIGHT_TO_LEFT_EMBEDDING
Definition: unicharset.h:171
@ U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
@ U_CHAR_DIRECTION_COUNT
Definition: unicharset.h:181
@ U_LEFT_TO_RIGHT_ISOLATE
Definition: unicharset.h:177
@ U_BLOCK_SEPARATOR
Definition: unicharset.h:164
@ U_EUROPEAN_NUMBER
Definition: unicharset.h:159
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
@ U_RIGHT_TO_LEFT_ISOLATE
Definition: unicharset.h:178
@ U_LEFT_TO_RIGHT_EMBEDDING
Definition: unicharset.h:168
@ U_LEFT_TO_RIGHT_OVERRIDE
Definition: unicharset.h:169
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:157
@ U_DIR_NON_SPACING_MARK
Definition: unicharset.h:174
@ U_RIGHT_TO_LEFT_OVERRIDE
Definition: unicharset.h:172
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:388
void reserve(int unichars_number)
Definition: unicharset.cpp:195
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:761
int greek_sid() const
Definition: unicharset.h:888
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:486
int default_sid() const
Definition: unicharset.h:894
bool script_has_upper_lower() const
Definition: unicharset.h:897
bool save_to_file(FILE *file) const
Definition: unicharset.h:360
int han_sid() const
Definition: unicharset.h:889
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:784
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
void set_ranges_empty()
Definition: unicharset.cpp:396
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:366
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:545
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool top_bottom_useful() const
Definition: unicharset.h:537
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:771
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:756
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:405
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:652
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:526
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:712
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:815
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:640
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:582
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:692
int null_sid() const
Definition: unicharset.h:884
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:467
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:623
void delete_pointers_in_unichars()
Definition: unicharset.h:298
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:822
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:482
int cyrillic_sid() const
Definition: unicharset.h:887
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:843
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:613
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:472
int get_script_id_from_name(const char *script_name) const
int add_script(const char *script)
bool load_from_file(FILE *file)
Definition: unicharset.h:403
int latin_sid() const
Definition: unicharset.h:886
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:462
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:751
bool AnyRepeatedUnicodes() const
int thai_sid() const
Definition: unicharset.h:892
int step(const char *str) const
Definition: unicharset.cpp:233
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:617
bool major_right_to_left() const
Definition: unicharset.cpp:992
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:794
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:254
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:835
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:464
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:646
bool load_from_file(const char *const filename)
Definition: unicharset.h:396
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:264
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:373
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
int get_script_table_size() const
Definition: unicharset.h:849
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:808
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
void post_load_setup()
Definition: unicharset.cpp:926
int hangul_sid() const
Definition: unicharset.h:893
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:630
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:761
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:596
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
void clear()
Definition: unicharset.h:306
int size() const
Definition: unicharset.h:341
int common_sid() const
Definition: unicharset.h:885
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:801
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:602
bool has_special_codes() const
Definition: unicharset.h:722
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool script_has_xheight() const
Definition: unicharset.h:904
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:319
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:746
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:244
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:767