tesseract 4.1.1
Loading...
Searching...
No Matches
validate_indic.cpp
Go to the documentation of this file.
1#include "validate_indic.h"
2#include "errcode.h"
3#include "tprintf.h"
4
5namespace tesseract {
6
7// Returns whether codes matches the pattern for an Indic Grapheme.
8// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
9// has a BNF for valid syllables (Graphemes) which is modified slightly
10// for Unicode. Notably U+200C (ZWNJ) and U+200D (ZWJ) are placed before or
11// after the virama to express explicit or soft viramas.
12// Also the unicode v.9 Malayalam entry states that CZHC can be used in several
13// Indic languages to request traditional ligatures, and CzHC is Malayalam-
14// specific for requesting open conjuncts.
15//
16// + vowel Grapheme: V[D](v)*
17// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
19 switch (codes_[codes_used_].first) {
21 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
24 return ConsumeVowelIfValid();
27 // Apart from within an aksara, joiners are silently dropped.
29 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
31 return true;
33 UseMultiCode(1);
34 return true;
35 default:
36 if (report_errors_) {
37 tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
38 codes_[codes_used_].first, codes_[codes_used_].second);
39 }
40 return false;
41 }
42}
43
48 // Offset from the start of the relevant unicode code block aka code page.
49 int base = static_cast<char32>(script_);
50 int off = ch - base;
51 // Anything in another code block is other.
52 if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
53 // Exception for Tamil. The aytham character is considered a letter.
54 if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
55 if (off < 0x4) return CharClass::kVowelModifier;
57 // Sinhala is an exception.
58 if (off <= 0x19) return CharClass::kVowel;
59 if (off <= 0x49) return CharClass::kConsonant;
60 if (off == 0x4a) return CharClass::kVirama;
61 if (off <= 0x5f) return CharClass::kMatra;
62 } else {
63 if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
64 if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
66 // Sinhala doesn't have Nukta or Avagraha.
67 if (off == 0x3c) return CharClass::kNukta;
68 if (off == 0x3d) return CharClass::kVowel; // avagraha
69 if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
70 if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
71 if (off == 0x4d) return CharClass::kVirama;
72 }
73 if (off == 0x60 || off == 0x61) return CharClass::kVowel;
74 if (off == 0x62 || off == 0x63) return CharClass::kMatra;
75 // Danda and digits up to 6f are OK as other.
76 // 70-7f are script-specific.
77 // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
78 if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72))
79 return CharClass::kOther;
80 // 0BF3-0BFA are other Tamil symbols.
81 if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A))
82 return CharClass::kOther;
83 if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
85 if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
87 if (script_ == ViramaScript::kSinhala && off == 0x70)
89 if (script_ == ViramaScript::kDevanagari && off == 0x70)
90 return CharClass::kOther;
91 if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
92 // Non Indic, Digits, Measures, danda, etc.
93 return CharClass::kOther;
94}
95
96// Helper consumes/copies a virama and any associated post-virama joiners.
97// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
98// no joiner at all) must be followed by a consonant.
99// A non-linking (explicit) virama is indicated by a ZWNJ after it, or by a
100// following non-consonant, space, or character from a different script. We
101// clean up the representation to make it consistent by adding a ZWNJ if it is
102// missing from a non-linking virama. Returns false for an invalid sequence.
103bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
// NOTE(review): this listing omits source lines 106, 108, 120, 127, 140, 143,
// 148 and 161 (the embedded numbering skips them), so several conditions and
// statements below are incomplete as shown. Consult the original file before
// changing any logic here.
//
// joiner:     the pre-virama joiner seen by the caller; a first member of
//             CharClass::kOther means that no joiner preceded the virama.
// post_matra: true when the virama is being consumed from the consonant tail.
// Returns false on an invalid sequence (after an optional tprintf when
// report_errors_ is set), true otherwise.
104 const unsigned num_codes = codes_.size();
105 if (joiner.first == CharClass::kOther) {
// Branch taken when no pre-virama joiner was supplied.
107 if (codes_used_ < num_codes &&
109 // Post-matra viramas must be explicit, so no joiners allowed here.
110 if (post_matra) {
111 if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
112 return false;
113 }
// NOTE(review): codes_used_ is unsigned, so codes_used_ - 2 below relies on at
// least two codes having been consumed already - confirm the callers
// guarantee that.
114 if (codes_used_ + 1 < num_codes &&
115 codes_[codes_used_ - 2].second != kRayana &&
116 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
117 codes_[codes_used_ + 1].second == kYayana ||
118 codes_[codes_used_ + 1].second == kRayana)) {
119 // This combination will be picked up later.
121 } else {
122 // Half-form with optional Nukta.
// len covers every output entry not yet used, plus one more code.
123 unsigned len = output_.size() + 1 - output_used_;
124 if (UseMultiCode(len)) return true;
125 }
126 if (codes_used_ < num_codes &&
// Reject unless the first not-yet-used output entry is kRayana.
128 if (output_used_ == output_.size() ||
129 output_[output_used_] != kRayana) {
130 if (report_errors_) {
131 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
132 static_cast<int>(script_));
133 }
134 return false;
135 }
136 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
137 if (UseMultiCode(4)) return true;
138 }
139 } else if (codes_used_ == num_codes ||
141 post_matra) {
142 if (codes_used_ == num_codes ||
144 // It is valid to have an unterminated virama at the end of a word, but
145 // for consistency, we will always add ZWNJ if not present.
146 output_.push_back(kZeroWidthNonJoiner);
147 } else {
149 }
150 // Explicit virama [H z]
151 MultiCodePart(2);
152 }
153 } else {
154 // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true result from UseMultiCode(2) is treated as an error here: per the
// message below, no second consonant follows the joiner + virama.
155 if (UseMultiCode(2)) {
156 if (report_errors_)
157 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
158 return false;
159 }
160 if (codes_[codes_used_].second == kZeroWidthJoiner ||
162 if (report_errors_) {
163 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
164 codes_[codes_used_].second);
165 }
166 return false;
167 }
168 }
169 // It is good so far as it goes.
170 return true;
171}
172
173// Helper consumes/copies a series of consonants separated by viramas while
174// valid, but not any vowel or other modifiers.
175bool ValidateIndic::ConsumeConsonantHeadIfValid() {
// NOTE(review): this listing omits source lines 179, 189, 191, 203, 204, 224
// and 230 (the embedded numbering skips them), so the loop body, several
// conditions and the do/while termination test are incomplete as shown.
// Consult the original file before changing any logic here.
//
// Returns false only when ConsumeViramaIfValid rejects the sequence; every
// other visible exit returns true.
176 const unsigned num_codes = codes_.size();
177 // Consonant aksara
178 do {
180 // Special Sinhala case of [H Z Yayana/Rayana].
// index can be out of range when output_ holds fewer than 3 entries; the size
// test that leads the condition below guards every access through it.
181 int index = output_.size() - 3;
182 if (output_used_ + 3 <= output_.size() &&
183 (output_.back() == kYayana || output_.back() == kRayana) &&
184 IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
185 MultiCodePart(3);
186 }
187 bool have_nukta = false;
188 if (codes_used_ < num_codes &&
190 have_nukta = true;
192 }
193 // Test for subscript conjunct.
// have_nukta converts to 0 or 1 in the arithmetic below, widening the window
// by one entry when a nukta was seen.
194 index = output_.size() - 2 - have_nukta;
195 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
196 IsVirama(output_[index])) {
197 // Output previous virama, consonant + optional nukta.
198 MultiCodePart(2 + have_nukta);
199 }
// A class of kOther marks "no joiner seen".
200 IndicPair joiner(CharClass::kOther, 0);
201 if (codes_used_ < num_codes &&
202 (codes_[codes_used_].second == kZeroWidthJoiner ||
205 joiner = codes_[codes_used_];
// A joiner that is the last code is skipped without being copied to output_.
206 if (++codes_used_ == num_codes) {
207 if (report_errors_) {
208 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
209 joiner.second);
210 }
211 return true;
212 }
// The joiner is copied to output_ only when a virama follows it; otherwise it
// is dropped and joiner is reset to the "no joiner" value.
213 if (codes_[codes_used_].first == CharClass::kVirama) {
214 output_.push_back(joiner.second);
215 } else {
216 if (report_errors_) {
217 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
218 output_.back(), joiner.second, codes_[codes_used_].second);
219 }
220 joiner = std::make_pair(CharClass::kOther, 0);
221 }
222 }
223 if (codes_used_ < num_codes &&
// Viramas met in the head are validated with post_matra == false.
225 if (!ConsumeViramaIfValid(joiner, false)) return false;
226 } else {
227 break; // No virama, so the run of consonants is over.
228 }
229 } while (codes_used_ < num_codes &&
// Calls MultiCodePart(1) if some output has not yet been marked used.
231 if (output_used_ < output_.size()) MultiCodePart(1);
232 return true;
233}
234
235// Helper consumes/copies a tail part of a consonant, comprising optional
236// matra/piece, vowel modifier, vedic mark, terminating virama.
237bool ValidateIndic::ConsumeConsonantTailIfValid() {
// NOTE(review): this listing omits source lines 243 and 247 (the embedded
// numbering skips them). The unmatched closing braces on 245 and 251 and the
// `break` on 250 imply that 243 opened a nested conditional and 247 opened a
// loop - presumably over matra pieces and vowel modifiers respectively;
// confirm against the original file.
//
// Returns false only when ConsumeViramaIfValid rejects a trailing virama;
// every other visible exit returns true.
// All codes are already consumed: nothing follows the consonant head.
238 if (codes_used_ == codes_.size()) return true;
239 // No virama: Finish the grapheme.
240 // Are multiple matras allowed?
241 if (codes_[codes_used_].first == CharClass::kMatra) {
// Every `if (UseMultiCode(1)) return true;` in this function consumes one code
// and finishes successfully when UseMultiCode reports true.
242 if (UseMultiCode(1)) return true;
244 if (UseMultiCode(1)) return true;
245 }
246 }
248 if (UseMultiCode(1)) return true;
249 // Repeats are allowed only in Malayalam, and only after 0xd02.
250 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
251 }
// Consume every consecutive vedic mark.
252 while (codes_[codes_used_].first == CharClass::kVedicMark) {
253 if (UseMultiCode(1)) return true;
254 }
// A virama at this point is validated with no pre-virama joiner and with
// post_matra == true.
255 if (codes_[codes_used_].first == CharClass::kVirama) {
256 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
257 return false;
258 }
259 }
260 // What we have consumed so far is a valid consonant cluster.
261 if (output_used_ < output_.size()) MultiCodePart(1);
262
263 return true;
264}
265
266// Helper consumes/copies a vowel and optional modifiers.
267bool ValidateIndic::ConsumeVowelIfValid() {
// NOTE(review): this listing omits source line 269 (the embedded numbering
// skips it). The closing brace on 273 and the `break` on 272 imply that it
// opened a loop - presumably over vowel modifiers; confirm against the
// original file.
//
// Every visible exit returns true.
// Consume the leading code; finish at once if UseMultiCode reports true.
268 if (UseMultiCode(1)) return true;
270 if (UseMultiCode(1)) return true;
271 // Only Malayalam allows repeated modifiers?
272 if (script_ != ViramaScript::kMalayalam) break;
273 }
// Consume every consecutive vedic mark.
274 while (codes_[codes_used_].first == CharClass::kVedicMark) {
275 if (UseMultiCode(1)) return true;
276 }
277 // What we have consumed so far is a valid vowel cluster.
278 return true;
279}
280
281} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
ViramaScript script_
Definition: validator.h:229
std::vector< char32 > output_
Definition: validator.h:235
unsigned output_used_
Definition: validator.h:239
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
unsigned codes_used_
Definition: validator.h:237
bool UseMultiCode(unsigned length)
Definition: validator.h:195
void MultiCodePart(unsigned length)
Definition: validator.h:181
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
static const int kIndicCodePageSize
Definition: validator.h:213
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:133
bool IsSubscriptScript() const
Definition: validator.cpp:198
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kZeroWidthJoiner
Definition: validator.h:96