tesseract 4.1.1
Loading...
Searching...
No Matches
unicharset_training_utils.cpp
Go to the documentation of this file.
1
2// File: unicharset_training_utils.cpp
3// Description: Training utilities for UNICHARSET.
4// Author: Ray Smith
5// Created: Fri Oct 17 17:09:01 PDT 2014
6//
7// (C) Copyright 2014, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
21
22#include <cstdlib>
23#include <cstring>
24#include <string>
25#include <vector>
26
27#include "fileio.h"
28#include "icuerrorcode.h"
29#include "normstrngs.h"
30#include "statistc.h"
31#include "unichar.h"
32#include "unicharset.h"
33#include "unicode/uchar.h" // from libicu
34#include "unicode/uscript.h" // from libicu
35
36namespace tesseract {
37
38// Helper sets the character attribute properties and sets up the script table.
39// Does not set tops and bottoms.
//
// For every unichar id in [0, unicharset->size()) this fills in, in place:
// the alpha/lower/upper/digit/punctuation flags, the script name, the
// other-case id, the text direction, the mirror id and the normalized string.
// After the loop it calls unicharset->post_load_setup().
//
// Parameters:
//   report_errors - when true, a message is printed if the computed other-case
//                   or mirror string is not present in the unicharset.
//   decompose     - not referenced in the lines visible in this listing.
//                   NOTE(review): presumably selects the normalization mode in
//                   the NormalizeUTF8String call whose lines are missing
//                   below - confirm against the real file.
//   unicharset    - the set to update; modified in place.
40void SetupBasicProperties(bool report_errors, bool decompose,
41 UNICHARSET* unicharset) {
42 for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
43 // Convert any custom ligatures.
// kCustomLigatures is scanned until an entry whose [0] element is nullptr.
// When the unichar string equals an entry's [1] element, the entry's [0]
// element is used as the string for the rest of this iteration.
44 const char* unichar_str = unicharset->id_to_unichar(unichar_id);
45 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
46 if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
47 unichar_str = UNICHARSET::kCustomLigatures[i][0];
48 break;
49 }
50 }
51
52 // Convert the unichar to UTF32 representation
53 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);
54
55 // Assume that if the property is true for any character in the string,
56 // then it holds for the whole "character".
57 bool unichar_isalpha = false;
58 bool unichar_islower = false;
59 bool unichar_isupper = false;
60 bool unichar_isdigit = false;
61 bool unichar_ispunct = false;
62
// Each flag becomes true as soon as one code point satisfies the matching
// ICU predicate; a multi-code-point string can therefore end up flagged as
// both lower and upper.
63 for (char32 u_ch : uni_vector) {
64 if (u_isalpha(u_ch)) unichar_isalpha = true;
65 if (u_islower(u_ch)) unichar_islower = true;
66 if (u_isupper(u_ch)) unichar_isupper = true;
67 if (u_isdigit(u_ch)) unichar_isdigit = true;
68 if (u_ispunct(u_ch)) unichar_ispunct = true;
69 }
70
71 unicharset->set_isalpha(unichar_id, unichar_isalpha);
72 unicharset->set_islower(unichar_id, unichar_islower);
73 unicharset->set_isupper(unichar_id, unichar_isupper);
74 unicharset->set_isdigit(unichar_id, unichar_isdigit);
75 unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
76
// The script is taken from the first code point only.
// NOTE(review): `err` is not declared anywhere in the visible lines (listing
// line 77 is absent from this capture) and is never inspected afterwards in
// the visible code - confirm its declaration and whether failures matter.
// NOTE(review): uni_vector[0] is read unconditionally; if UTF8ToUTF32 can
// return an empty vector (e.g. for an empty or invalid string) this is an
// out-of-bounds access - confirm and guard if so.
78 unicharset->set_script(unichar_id, uscript_getName(
79 uscript_getScript(uni_vector[0], err)));
80
81 const int num_code_points = uni_vector.size();
82 // Obtain the lower/upper case if needed and record it in the properties.
// The other case defaults to the unichar itself and is only overridden when
// the converted string is found in the unicharset.
83 unicharset->set_other_case(unichar_id, unichar_id);
84 if (unichar_islower || unichar_isupper) {
85 std::vector<char32> other_case(num_code_points, 0);
86 for (int i = 0; i < num_code_points; ++i) {
87 // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
88 // However since they deal with UChars (so need a conversion function
89 // from char32 or UTF8string) and require a meaningful locale string,
90 // for now u_tolower()/u_toupper() are used.
// unichar_islower takes precedence: if any code point was lower case,
// every code point is upper-cased; otherwise every one is lower-cased.
91 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
92 u_tolower(uni_vector[i]);
93 }
94 std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
95 UNICHAR_ID other_case_id =
96 unicharset->unichar_to_id(other_case_uch.c_str());
97 if (other_case_id != INVALID_UNICHAR_ID) {
98 unicharset->set_other_case(unichar_id, other_case_id);
99 } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
// Ids below SPECIAL_UNICHAR_CODES_COUNT are never reported here.
100 tprintf("Other case %s of %s is not in unicharset\n",
101 other_case_uch.c_str(), unichar_str);
102 }
103 }
104
105 // Set RTL property and obtain mirror unichar ID from ICU.
// Every code point is mirrored individually with u_charMirror; the mirror id
// is only recorded when the resulting string exists in the unicharset.
106 std::vector<char32> mirrors(num_code_points, 0);
107 for (int i = 0; i < num_code_points; ++i) {
108 mirrors[i] = u_charMirror(uni_vector[i]);
109 if (i == 0) { // set directionality to that of the 1st code point
110 unicharset->set_direction(unichar_id,
111 static_cast<UNICHARSET::Direction>(
112 u_charDirection(uni_vector[i])));
113 }
114 }
115 std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
116 UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
117 if (mirror_uch_id != INVALID_UNICHAR_ID) {
118 unicharset->set_mirror(unichar_id, mirror_uch_id);
119 } else if (report_errors) {
// Unlike the other-case report above, this one is not restricted to ids at
// or above SPECIAL_UNICHAR_CODES_COUNT.
120 tprintf("Mirror %s of %s is not in unicharset\n",
121 mirror_uch.c_str(), unichar_str);
122 }
123
124 // Record normalized version of this unichar.
// NOTE(review): listing lines 127-130 are absent from this capture, so the
// condition below is incomplete as shown. What is visible: id 0 is excluded,
// a call receives unichar_str and &normed_str, and the result must be
// non-empty. Presumably the missing call is NormalizeUTF8String - confirm.
// When the condition fails, the (possibly ligature-converted) unichar string
// itself is stored as the normalized form.
125 std::string normed_str;
126 if (unichar_id != 0 &&
131 unichar_str, &normed_str) &&
132 !normed_str.empty()) {
133 unicharset->set_normed(unichar_id, normed_str.c_str());
134 } else {
135 unicharset->set_normed(unichar_id, unichar_str);
136 }
// Sanity check: the recorded other-case id must be below the set's size.
137 ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
138 }
139 unicharset->post_load_setup();
140}
141
142// Helper sets the properties from universal script unicharsets, if found.
143void SetScriptProperties(const std::string& script_dir, UNICHARSET* unicharset) {
144 for (int s = 0; s < unicharset->get_script_table_size(); ++s) {
145 // Load the unicharset for the script if available.
146 std::string filename = script_dir + "/" +
147 unicharset->get_script_from_script_id(s) + ".unicharset";
148 UNICHARSET script_set;
149 if (script_set.load_from_file(filename.c_str())) {
150 unicharset->SetPropertiesFromOther(script_set);
151 } else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {
152 tprintf("Failed to load script unicharset from:%s\n", filename.c_str());
153 }
154 }
155 for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {
156 if (unicharset->PropertiesIncomplete(c)) {
157 tprintf("Warning: properties incomplete for index %d = %s\n", c,
158 unicharset->id_to_unichar(c));
159 }
160 }
161}
162
163// Helper gets the combined x-heights string.
164std::string GetXheightString(const std::string& script_dir,
165 const UNICHARSET& unicharset) {
166 std::string xheights_str;
167 for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
168 // Load the xheights for the script if available.
169 std::string filename = script_dir + "/" +
170 unicharset.get_script_from_script_id(s) + ".xheights";
171 std::string script_heights;
172 if (File::ReadFileToString(filename, &script_heights))
173 xheights_str += script_heights;
174 }
175 return xheights_str;
176}
177
178// Helper to set the properties for an input unicharset file, writes to the
179// output file. If an appropriate script unicharset can be found in the
180// script_dir directory, then the tops and bottoms are expanded using the
181// script unicharset.
182// If non-empty, xheight data for the fonts are written to the xheights_file.
183void SetPropertiesForInputFile(const std::string& script_dir,
184 const std::string& input_unicharset_file,
185 const std::string& output_unicharset_file,
186 const std::string& output_xheights_file) {
187 UNICHARSET unicharset;
188
189 // Load the input unicharset
190 unicharset.load_from_file(input_unicharset_file.c_str());
191 tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
192 input_unicharset_file.c_str());
193
194 // Set unichar properties
195 tprintf("Setting unichar properties\n");
196 SetupBasicProperties(true, false, &unicharset);
197 tprintf("Setting script properties\n");
198 SetScriptProperties(script_dir, &unicharset);
199 if (!output_xheights_file.empty()) {
200 std::string xheights_str = GetXheightString(script_dir, unicharset);
201 File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
202 }
203
204 // Write the output unicharset
205 tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
206 unicharset.save_to_file(output_unicharset_file.c_str());
207}
208
209} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:147
std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset)
signed int char32
Definition: unichar.h:51
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetPropertiesForInputFile(const std::string &script_dir, const std::string &input_unicharset_file, const std::string &output_unicharset_file, const std::string &output_xheights_file)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:477
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:545
int null_sid() const
Definition: unicharset.h:884
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:467
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:482
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:472
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:462
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:646
int get_script_table_size() const
Definition: unicharset.h:849
void post_load_setup()
Definition: unicharset.cpp:926
int size() const
Definition: unicharset.h:341
int common_sid() const
Definition: unicharset.h:885
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
static bool ReadFileToString(const std::string &filename, std::string *out)
Definition: fileio.cpp:77
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:56