tesseract 4.1.1
Loading...
Searching...
No Matches
validate_grapheme.cpp
Go to the documentation of this file.
1#include "validate_grapheme.h"
2#include "tprintf.h"
3#include "unicode/uchar.h" // From libicu
4
5namespace tesseract {
6
8 const unsigned num_codes = codes_.size();
9 char32 prev_prev_ch = ' ';
10 char32 prev_ch = ' ';
12 int num_codes_in_grapheme = 0;
13 while (codes_used_ < num_codes) {
14 CharClass cc = codes_[codes_used_].first;
15 char32 ch = codes_[codes_used_].second;
16 const bool is_combiner =
18 // TODO: Make this code work well with RTL text.
19 // See https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20 #if 0
21 // Reject easily detected badly formed sequences.
22 if (prev_cc == CharClass::kWhitespace && is_combiner) {
23 if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24 return false;
25 }
26 #endif
27 if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
29 tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30 return false;
31 }
32 if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
33 IsBadlyFormed(prev_ch, ch)) {
34 return false;
35 }
36 bool prev_is_fwd_combiner =
37 prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38 (prev_ch == kZeroWidthNonJoiner &&
39 (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40 if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
41 break;
43 ++num_codes_in_grapheme;
44 prev_prev_ch = prev_ch;
45 prev_ch = ch;
46 prev_cc = cc;
47 }
48 if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme);
49 return true;
50}
51
54 // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
55 // always combine with the previous character.
56 if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama;
57 if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace;
58 // Workaround for Javanese Aksara's Taling, do not label it as a combiner
59 if (ch == 0xa9ba) return CharClass::kConsonant;
60 int char_type = u_charType(ch);
61 if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
62 char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
63 ch == kZeroWidthJoiner)
65 return CharClass::kOther;
66}
67
68// Helper returns true if the sequence prev_ch,ch is invalid.
69bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
70 // Reject badly formed Indic vowels.
71 if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
73 tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
74 return true;
75 }
76 if (IsBadlyFormedThai(prev_ch, ch)) {
77 if (report_errors_) tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
78 return true;
79 }
80 return false;
81}
82
83// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
84// Some vowels in Indic scripts may be analytically decomposed into atomic pairs
85// of components that are themselves valid unicode symbols. (See Table 12-1 in
86// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
87// for examples in Devanagari). The Unicode standard discourages specifying
88// vowels this way, but they are sometimes encountered in text, probably because
89// some editors still permit it. Renderers however dislike such pairs, and so
90// this function may be used to detect their occurrence for removal.
91// TODO(rays) This function only covers a subset of Indic languages and doesn't
92// include all rules. Add rules as appropriate to support other languages or
93// find a way to generalize these existing rules that makes use of the
94// regularity of the mapping from ISCII to Unicode.
95/* static */
96bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
97 return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) ||
98 (prev_ch == 0x909 && ch == 0x941) ||
99 (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
100 (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
101 (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
102 // Illegal combinations of two dependent Devanagari vowels.
103 (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
104 // Dependent Devanagari vowels following a virama.
105 (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
106 // Bengali vowels (Table 9-5, pg 313)
107 (prev_ch == 0x985 && ch == 0x9BE) ||
108 // Telugu vowels (Table 9-19, pg 331)
109 (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
110 // Kannada vowels (Table 9-20, pg 332)
111 (prev_ch == 0xC92 && ch == 0xCCC));
112}
113
114// Helper returns true if ch is a Thai consonant.
115static bool IsThaiConsonant(char32 ch) { return 0xe01 <= ch && ch <= 0xe2e; }
116
117// Helper returns true is ch is a before-consonant vowel.
118static bool IsThaiBeforeConsonantVowel(char32 ch) {
119 return 0xe40 <= ch && ch <= 0xe44;
120}
121
122// Helper returns true if ch is a Thai tone mark.
123static bool IsThaiToneMark(char32 ch) { return 0xe48 <= ch && ch <= 0xe4b; }
124
125// Helper returns true if ch is a Thai vowel that may be followed by a tone
126// mark.
127static bool IsThaiTonableVowel(char32 ch) {
128 return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
129}
130
131// Helper returns true if the sequence prev_ch,ch is invalid Thai.
132// These rules come from a native Thai speaker, and are not covered by the
133// Thai section in the unicode book:
134// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
135// Comments below added by Ray interpreting the code ranges.
136/* static */
137bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
138 // Tone marks must follow consonants or specific vowels.
139 if (IsThaiToneMark(ch) &&
140 !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
141 return true;
142 }
143 // Tonable vowels must follow consonants.
144 if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
145 return true;
146 }
147 // Thanthakhat must follow consonant or specific vowels.
148 if (ch == 0xe4c &&
149 !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
150 return true;
151 }
152 // Nikkhahit must follow a consonant ?or certain markers?.
153 // TODO(rays) confirm this, but there were so many in the ground truth of the
154 // validation set that it seems reasonable to assume it is valid.
155 if (ch == 0xe4d &&
156 !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
157 return true;
158 }
159 // The vowels e30, e32, e33 can be used more liberally.
160 if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
161 !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
162 !(prev_ch == 0xe32 && ch == 0xe30) &&
163 !(prev_ch == 0xe4d && ch == 0xe32)) {
164 return true;
165 }
166 // Some vowels come before consonants, and therefore cannot follow things
167 // that cannot end a syllable.
168 if (IsThaiBeforeConsonantVowel(ch) &&
169 (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 ||
170 prev_ch == 0xe37)) {
171 return true;
172 }
173 // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
174 if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
175 return true;
176 }
177 return false;
178}
179
180} // namespace tesseract
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
signed int char32
signed int char32
Definition: unichar.h:51
CharClass UnicodeToCharClass(char32 ch) const override
bool ConsumeGraphemeIfValid() override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
unsigned codes_used_
Definition: validator.h:237
void MultiCodePart(unsigned length)
Definition: validator.h:181
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kZeroWidthJoiner
Definition: validator.h:96