tesseract 4.1.1
Loading...
Searching...
No Matches
unichar.cpp
Go to the documentation of this file.
1
2// File: unichar.cpp
3// Description: Unicode character/ligature class.
4// Author: Ray Smith
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include "unichar.h"
20#include "errcode.h"
21#include "genericvector.h"
22#include "tprintf.h"
23
24#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
25
26namespace tesseract {
27
28// Construct from a utf8 string. If len<0 then the string is null terminated.
29// If the string is too long to fit in the UNICHAR then it takes only what
30// will fit. Checks for illegal input and stops at an illegal sequence.
31// The resulting UNICHAR may be empty.
32UNICHAR::UNICHAR(const char* utf8_str, int len) {
33 int total_len = 0;
34 int step = 0;
35 if (len < 0) {
36 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
37 }
38 for (total_len = 0; total_len < len; total_len += step) {
39 step = utf8_step(utf8_str + total_len);
40 if (total_len + step > UNICHAR_LEN)
41 break; // Too long.
42 if (step == 0)
43 break; // Illegal first byte.
44 int i;
45 for (i = 1; i < step; ++i)
46 if ((utf8_str[total_len + i] & 0xc0) != 0x80)
47 break;
48 if (i < step)
49 break; // Illegal surrogate
50 }
51 memcpy(chars, utf8_str, total_len);
52 if (total_len < UNICHAR_LEN) {
53 chars[UNICHAR_LEN - 1] = total_len;
54 while (total_len < UNICHAR_LEN - 1)
55 chars[total_len++] = 0;
56 }
57}
58
59// Construct from a single UCS4 character. Illegal values are ignored,
60// resulting in an empty UNICHAR.
61UNICHAR::UNICHAR(int unicode) {
62 const int bytemask = 0xBF;
63 const int bytemark = 0x80;
64
65 if (unicode < 0x80) {
66 chars[UNICHAR_LEN - 1] = 1;
67 chars[2] = 0;
68 chars[1] = 0;
69 chars[0] = static_cast<char>(unicode);
70 } else if (unicode < 0x800) {
71 chars[UNICHAR_LEN - 1] = 2;
72 chars[2] = 0;
73 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
74 unicode >>= 6;
75 chars[0] = static_cast<char>(unicode | 0xc0);
76 } else if (unicode < 0x10000) {
77 chars[UNICHAR_LEN - 1] = 3;
78 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
79 unicode >>= 6;
80 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81 unicode >>= 6;
82 chars[0] = static_cast<char>(unicode | 0xe0);
83 } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
84 chars[UNICHAR_LEN - 1] = 4;
85 chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
86 unicode >>= 6;
87 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
88 unicode >>= 6;
89 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
90 unicode >>= 6;
91 chars[0] = static_cast<char>(unicode | 0xf0);
92 } else {
93 memset(chars, 0, UNICHAR_LEN);
94 }
95}
96
97// Get the first character as UCS-4.
98int UNICHAR::first_uni() const {
99 static const int utf8_offsets[5] = {
100 0, 0, 0x3080, 0xE2080, 0x3C82080
101 };
102 int uni = 0;
103 int len = utf8_step(chars);
104 const char* src = chars;
105
106 switch (len) {
107 default:
108 break;
109 case 4:
110 uni += static_cast<unsigned char>(*src++);
111 uni <<= 6;
112 // Fall through.
113 case 3:
114 uni += static_cast<unsigned char>(*src++);
115 uni <<= 6;
116 // Fall through.
117 case 2:
118 uni += static_cast<unsigned char>(*src++);
119 uni <<= 6;
120 // Fall through.
121 case 1:
122 uni += static_cast<unsigned char>(*src++);
123 }
124 uni -= utf8_offsets[len];
125 return uni;
126}
127
128// Get a terminated UTF8 string: Must delete[] it after use.
129char* UNICHAR::utf8_str() const {
130 int len = utf8_len();
131 char* str = new char[len + 1];
132 memcpy(str, chars, len);
133 str[len] = 0;
134 return str;
135}
136
137// Get the number of bytes in the first character of the given utf8 string.
138int UNICHAR::utf8_step(const char* utf8_str) {
139 static const char utf8_bytes[256] = {
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
147 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
148 };
149
150 return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
151}
152
154 ASSERT_HOST(it_ != nullptr);
155 int step = utf8_step(it_);
156 if (step == 0) {
157 tprintf("ERROR: Illegal UTF8 encountered.\n");
158 for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
159 tprintf("Index %d char = 0x%x\n", i, it_[i]);
160 }
161 step = 1;
162 }
163 it_ += step;
164 return *this;
165}
166
168 ASSERT_HOST(it_ != nullptr);
169 const int len = utf8_step(it_);
170 if (len == 0) {
171 tprintf("WARNING: Illegal UTF8 encountered\n");
172 return ' ';
173 }
174 UNICHAR uch(it_, len);
175 return uch.first_uni();
176}
177
178int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
179 ASSERT_HOST(it_ != nullptr);
180 const int len = utf8_step(it_);
181 if (len == 0) {
182 tprintf("WARNING: Illegal UTF8 encountered\n");
183 utf8_output[0] = ' ';
184 return 1;
185 }
186 strncpy(utf8_output, it_, len);
187 return len;
188}
189
191 ASSERT_HOST(it_ != nullptr);
192 const int len = utf8_step(it_);
193 if (len == 0) {
194 tprintf("WARNING: Illegal UTF8 encountered\n");
195 return 1;
196 }
197 return len;
198}
199
201 return utf8_step(it_) > 0;
202}
203
206}
207
209 return UNICHAR::const_iterator(utf8_str + len);
210}
211
212// Converts a utf-8 string to a vector of unicodes.
213// Returns an empty vector if the input contains invalid UTF-8.
214/* static */
215std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
216 const int utf8_length = strlen(utf8_str);
217 std::vector<char32> unicodes;
218 unicodes.reserve(utf8_length);
219 const_iterator end_it(end(utf8_str, utf8_length));
220 for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
221 if (it.is_legal()) {
222 unicodes.push_back(*it);
223 } else {
224 unicodes.clear();
225 return unicodes;
226 }
227 }
228 return unicodes;
229}
230
231// Returns an empty string if the input contains an invalid unicode.
232std::string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
233 std::string utf8_str;
234 for (char32 ch : str32) {
235 UNICHAR uni_ch(ch);
236 int step;
237 if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
238 utf8_str.append(uni_ch.utf8(), step);
239 } else {
240 return "";
241 }
242 }
243 return utf8_str;
244}
245
246} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:24
#define UNICHAR_LEN
Definition: unichar.h:30
signed int char32
Definition: unichar.h:51
char * utf8_str() const
Definition: unichar.cpp:129
int first_uni() const
Definition: unichar.cpp:98
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
int utf8_len() const
Definition: unichar.h:77
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
const char * utf8() const
Definition: unichar.h:83
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
const_iterator & operator++()
Definition: unichar.cpp:153
int get_utf8(char *buf) const
Definition: unichar.cpp:178