tesseract 4.1.1
Loading...
Searching...
No Matches
normstrngs.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: normstrngs.cpp
3 * Description: Utilities to normalize and manipulate UTF-32 and
4 * UTF-8 strings.
5 * Author: Ranjith Unnikrishnan
6 * Created: Thu July 4 2013
7 *
8 * (C) Copyright 2013, Google Inc.
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 **********************************************************************/
20
#include "normstrngs.h"

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "errcode.h"
#include "icuerrorcode.h"
#include "unichar.h"
#include "unicode/normalizer2.h"  // From libicu
#include "unicode/translit.h"     // From libicu
#include "unicode/uchar.h"        // From libicu
#include "unicode/unorm2.h"       // From libicu
#include "unicode/uscript.h"      // From libicu
35
36namespace tesseract {
37
// Returns true if ch is one of the hyphen, dash or minus code points listed
// below (ASCII '-' included), false for every other value.
static bool is_hyphen_punc(const char32 ch) {
  // The array size is deduced from the initializer, so entries can be added
  // or removed without keeping a separate element count in sync.
  static const char32 kHyphenPuncUnicodes[] = {
      '-',    0x2010, 0x2011, 0x2012,
      0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
      0x207b,                  // superscript minus
      0x208b,                  // subscript minus
      0x2212,                  // minus sign
      0xfe58,                  // small em dash
      0xfe63,                  // small hyphen-minus
      0xff0d,                  // fullwidth hyphen-minus
  };
  for (const char32 hyphen : kHyphenPuncUnicodes) {
    if (hyphen == ch) return true;
  }
  return false;
}
55
// Returns true if ch is one of the single-quote-like code points listed below
// (ASCII apostrophe and backtick included), false for every other value.
static bool is_single_quote(const char32 ch) {
  // The array size is deduced from the initializer, so entries can be added
  // or removed without keeping a separate element count in sync.
  static const char32 kSingleQuoteUnicodes[] = {
      '\'', '`',
      0x2018,  // left single quotation mark (English, others)
      0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
               // We may have to introduce a comma set with 0x201a
      0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
      0x2032,  // prime
      0x300C,  // left corner bracket (East Asian languages)
      0xFF07,  // fullwidth apostrophe
  };
  for (const char32 quote : kSingleQuoteUnicodes) {
    if (quote == ch) return true;
  }
  return false;
}
73
// Returns true if ch is one of the double-quote-like code points listed below
// (ASCII '"' included), false for every other value.
static bool is_double_quote(const char32 ch) {
  // The array size is deduced from the initializer, so entries can be added
  // or removed without keeping a separate element count in sync.
  static const char32 kDoubleQuoteUnicodes[] = {
      '"',
      0x201C,  // left double quotation mark (English, others)
      0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
      0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
      0x2033,  // double prime
      0x301D,  // reversed double prime quotation mark (East Asian langs,
               // horiz.)
      0x301E,  // close double prime (East Asian languages written horizontally)
      0xFF02,  // fullwidth quotation mark
  };
  for (const char32 quote : kDoubleQuoteUnicodes) {
    if (quote == ch) return true;
  }
  return false;
}
92
93// Helper runs a standard unicode normalization, optional OCR normalization,
94// and leaves the result as char32 for subsequent processing.
95static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
96 const char* str8,
97 std::vector<char32>* normed32) {
98 // Convert to ICU string for unicode normalization.
99 icu::UnicodeString uch_str(str8, "UTF-8");
100 IcuErrorCode error_code;
101 // Convert the enum to the new weird icu representation.
102 const char* norm_type =
104 ? "nfkc"
105 : "nfc";
106 UNormalization2Mode compose =
107 u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
108 ? UNORM2_COMPOSE
109 : UNORM2_DECOMPOSE;
110 // Pointer to singleton does not require deletion.
111 const icu::Normalizer2* normalizer =
112 icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
113 error_code.assertSuccess();
114 error_code.reset();
115 icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
116 error_code.assertSuccess();
117 // Convert to char32 for output. OCR normalization if required.
118 normed32->reserve(norm_str.length()); // An approximation.
119 for (int offset = 0; offset < norm_str.length();
120 offset = norm_str.moveIndex32(offset, 1)) {
121 char32 ch = norm_str.char32At(offset);
122 // Skip all ZWS, RTL and LTR marks.
123 if (Validator::IsZeroWidthMark(ch)) continue;
124 if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
125 normed32->push_back(ch);
126 }
127}
128
129// Helper removes joiners from strings that contain no letters.
130static void StripJoiners(std::vector<char32>* str32) {
131 for (char32 ch : *str32) {
132 if (u_isalpha(ch)) return;
133 }
134 int len = 0;
135 for (char32 ch : *str32) {
136 if (ch != Validator::kZeroWidthJoiner &&
138 (*str32)[len++] = ch;
139 }
140 }
141 str32->resize(len);
142}
143
144// Normalizes a UTF8 string according to the given modes. Returns true on
145// success. If false is returned, some failure or invalidity was present, and
146// the result string is produced on a "best effort" basis.
147bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
148 GraphemeNorm grapheme_normalize, const char* str8,
149 std::string* normalized) {
150 std::vector<char32> normed32;
151 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
152 if (grapheme_normalize == GraphemeNorm::kNormalize) {
153 StripJoiners(&normed32);
154 std::vector<std::vector<char32>> graphemes;
156 GraphemeNormMode::kSingleString, false, normed32, &graphemes);
157 if (graphemes.empty() || graphemes[0].empty()) {
158 success = false;
159 } else if (normalized != nullptr) {
160 *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
161 }
162 return success;
163 }
164 if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
165 return true;
166}
167
168// Normalizes a UTF8 string according to the given modes and splits into
169// graphemes according to g_mode. Returns true on success. If false is returned,
170// some failure or invalidity was present, and the result string is produced on
171// a "best effort" basis.
173 GraphemeNormMode g_mode, bool report_errors,
174 const char* str8,
175 std::vector<std::string>* graphemes) {
176 std::vector<char32> normed32;
177 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
178 StripJoiners(&normed32);
179 std::vector<std::vector<char32>> graphemes32;
180 bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
181 normed32, &graphemes32);
182 if (g_mode != GraphemeNormMode::kSingleString && success) {
183 // If we modified the string to clean it up, the segmentation may not be
184 // correct, so check for changes and do it again.
185 std::vector<char32> cleaned32;
186 for (const auto& g : graphemes32) {
187 cleaned32.insert(cleaned32.end(), g.begin(), g.end());
188 }
189 if (cleaned32 != normed32) {
190 graphemes32.clear();
191 success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
192 cleaned32, &graphemes32);
193 }
194 }
195 graphemes->clear();
196 graphemes->reserve(graphemes32.size());
197 for (const auto& grapheme : graphemes32) {
198 graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
199 }
200 return success;
201}
202
203// Apply just the OCR-specific normalizations and return the normalized char.
205 if (is_hyphen_punc(ch))
206 return '-';
207 else if (is_single_quote(ch))
208 return '\'';
209 else if (is_double_quote(ch))
210 return '"';
211 return ch;
212}
213
215 return OCRNormalize(ch1) == OCRNormalize(ch2);
216}
217
// Returns true if ch lies in [0, 0xD800) or [0xE000, 0x10FFFF], i.e. it is
// within the Unicode code space and is not a surrogate.
bool IsValidCodepoint(const char32 ch) {
  // char32 is signed; comparing the unsigned value makes negative inputs fall
  // above 0x10FFFF and be rejected by the same range checks.
  const uint32_t code = static_cast<uint32_t>(ch);
  return code < 0xD800 || (code >= 0xE000 && code <= 0x10FFFF);
}
222
223bool IsWhitespace(const char32 ch) {
224 ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n",
225 ch);
226 return u_isUWhiteSpace(static_cast<UChar32>(ch));
227}
228
229bool IsUTF8Whitespace(const char* text) {
230 return SpanUTF8Whitespace(text) == strlen(text);
231}
232
233unsigned int SpanUTF8Whitespace(const char* text) {
234 int n_white = 0;
235 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
236 it != UNICHAR::end(text, strlen(text)); ++it) {
237 if (!IsWhitespace(*it)) break;
238 n_white += it.utf8_len();
239 }
240 return n_white;
241}
242
243unsigned int SpanUTF8NotWhitespace(const char* text) {
244 int n_notwhite = 0;
245 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
246 it != UNICHAR::end(text, strlen(text)); ++it) {
247 if (IsWhitespace(*it)) break;
248 n_notwhite += it.utf8_len();
249 }
250 return n_notwhite;
251}
252
254 return IsValidCodepoint(ch) &&
255 !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
256 !(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
257 !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
258 !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
259 !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
260 !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
261 !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
262 !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
263 !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
264 !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
265 !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
266 !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
267 !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
268 !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
269 !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
270 !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
271 !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
272 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
273 ch == '\f' || ch == '\t' || ch == '\r');
274}
275
277 return IsValidCodepoint(ch) && ch <= 128 &&
278 (!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
279 ch == '\f' || ch == '\t' || ch == '\r');
280}
281
283 // Return unchanged if not in the fullwidth-halfwidth Unicode block.
284 if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
285 if (ch != 0x3000) return ch;
286 }
287 // Special case for fullwidth left and right "white parentheses".
288 if (ch == 0xFF5F) return 0x2985;
289 if (ch == 0xFF60) return 0x2986;
290 // Construct a full-to-half width transliterator.
291 IcuErrorCode error_code;
292 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
293 const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
294 "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
295 error_code.assertSuccess();
296 error_code.reset();
297
298 fulltohalf->transliterate(uch_str);
299 delete fulltohalf;
300 ASSERT_HOST(uch_str.length() != 0);
301 return uch_str[0];
302}
303
304} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:92
signed int char32
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:214
GraphemeNormMode
Definition: validator.h:33
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:147
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:204
signed int char32
Definition: unichar.h:51
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:172
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:253
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:243
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:223
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:218
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:87
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:40
static const char32 kZeroWidthJoiner
Definition: validator.h:96