tesseract 4.1.1
Loading...
Searching...
No Matches
normstrngs.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: normstrngs.h
3 * Description: Utilities to normalize and manipulate UTF-32 and
4 * UTF-8 strings.
5 * Author: Ranjith Unnikrishnan
6 * Created: Thu July 4 2013
7 *
8 * (C) Copyright 2013, Google Inc.
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 **********************************************************************/
20
21#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22#define TESSERACT_CCUTIL_NORMSTRNGS_H_
23
24#include <string>
25#include <vector>
26
27#include "validator.h"
28
29namespace tesseract {
30
// The standard Unicode normalization forms.
enum class UnicodeNormMode {
  kNFD,   // Canonical decomposition.
  kNFC,   // Canonical decomposition, followed by canonical composition.
  kNFKD,  // Compatibility decomposition.
  kNFKC,  // Compatibility decomposition, followed by canonical composition.
};
38
// To normalize away differences in punctuation that are ambiguous, like
// curly quotes and different widths of dash.
enum class OCRNorm {
  kNone,
  kNormalize,
};
45
46// To validate and normalize away some subtle differences that can occur in
47// Indic scripts, eg ensuring that an explicit virama is always followed by
48// a zero-width non-joiner.
enum class GraphemeNorm {
  kNone,
  kNormalize,
};
53
54// Normalizes a UTF8 string according to the given modes. Returns true on
55// success. If false is returned, some failure or invalidity was present, and
56// the result string is produced on a "best effort" basis.
57bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
58 GraphemeNorm grapheme_normalize, const char* str8,
59 std::string* normalized);
60// Normalizes a UTF8 string according to the given modes and splits into
61// graphemes according to g_mode. Returns true on success. If false is returned,
62// some failure or invalidity was present, and the result string is produced on
63// a "best effort" basis.
65 GraphemeNormMode g_mode, bool report_errors,
66 const char* str8,
67 std::vector<std::string>* graphemes);
68
69// Applies just the OCR-specific normalizations and return the normalized char.
71
72// Returns true if the OCRNormalized ch1 and ch2 are the same.
73bool IsOCREquivalent(char32 ch1, char32 ch2);
74
75// Returns true if the value lies in the range of valid unicodes.
76bool IsValidCodepoint(const char32 ch);
77
78// Returns true a code point has the White_Space Unicode property.
79bool IsWhitespace(const char32 ch);
// Returns true if every char in the given (null-terminated) UTF-8 string has
// the White_Space Unicode property.
bool IsUTF8Whitespace(const char* text);
83
// Returns the length in bytes of the prefix of 'text' whose characters have
// the White_Space Unicode property.
unsigned int SpanUTF8Whitespace(const char* text);
87
// Returns the length in bytes of the prefix of 'text' whose characters DO NOT
// have the White_Space Unicode property.
unsigned int SpanUTF8NotWhitespace(const char* text);
91
92// Returns true if the char is interchange valid i.e. no C0 or C1 control codes
93// (other than CR LF HT FF) and no non-characters.
94bool IsInterchangeValid(const char32 ch);
95// Same as above but restricted to 7-bit ASCII.
97
98// Convert a full-width UTF-8 string to half-width.
100
101} // namespace tesseract
102
103#endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
signed int char32
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:214
GraphemeNormMode
Definition: validator.h:33
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:147
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:204
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:172
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:253
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:223
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:218