tesseract 4.1.1
Loading...
Searching...
No Matches
validate_javanese.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: validate_javanese.cpp
3 * Description: Text validator for Javanese Script - aksara jawa.
4 * Author: Shree Devi Kumar
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 **********************************************************************/
17
18#include "validate_javanese.h"
19#include "errcode.h"
20#include "tprintf.h"
21
22namespace tesseract {
23
24// Returns whether codes matches the pattern for a Javanese Grapheme.
25// Taken from unicode standard:
26// http://www.unicode.org/charts/PDF/UA980.pdf
27// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
28// The Consonant class here includes independent vowels.
29// The order of components in an orthographic syllable as expressed in BNF is:
30// {C F} C {{R}Y} {V{A}} {Z}
31// Translated to the codes used by the CharClass enum:
32// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
33// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
34// Validation rules copied from validate_indic.cpp and modified for Javanese.
35// Indic - for reference
36// + vowel Grapheme: V[D](v)*
37// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
38
40 switch (codes_[codes_used_].first) {
42 return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
45 return ConsumeVowelIfValid();
48 // Apart from within an aksara, joiners are silently dropped.
50 tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
52 return true;
54 UseMultiCode(1);
55 return true;
56 default:
57 if (report_errors_) {
58 tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
59 codes_[codes_used_].first, codes_[codes_used_].second);
60 }
61 return false;
62 }
63}
64
65// Helper consumes/copies a virama and any associated post-virama joiners.
66// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
67// no joiner at all) must be followed by a consonant.
68// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
69// consonant, space, or character from a different script. We clean up the
70// representation to make it consistent by adding a ZWNJ if missing from a
71// non-linking virama. Returns false with an invalid sequence.
// Parameters:
//   joiner     - the joiner the caller saw immediately before the virama. A
//                pair whose .first is CharClass::kOther means "no joiner";
//                both callers in this file build IndicPair(CharClass::kOther, 0)
//                for that case.
//   post_matra - true when called from ConsumeConsonantTailIfValid, false when
//                called from ConsumeConsonantHeadIfValid.
// Returns false as soon as the sequence is found invalid, true otherwise.
// In the no-joiner branch a true result from UseMultiCode ends the call with
// true (presumably it signals that the input is exhausted - confirm in
// validator.h); in the pre-virama-joiner branch a true result is an error.
72bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
73 const unsigned num_codes = codes_.size();
// Branch 1: the caller saw no pre-virama joiner.
74 if (joiner.first == CharClass::kOther) {
76 if (codes_used_ < num_codes &&
78 // Post-matra viramas must be explicit, so no joiners allowed here.
79 if (post_matra) {
80 if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
81 return false;
82 }
// Look one code ahead (ZWNJ, kPengkal or kCakra) and two codes behind (must
// not be kCakra).
// NOTE(review): codes_used_ - 2 is unsigned arithmetic and no lower bound on
// codes_used_ is checked in the visible code - confirm that at least two codes
// have always been consumed before this point.
83 if (codes_used_ + 1 < num_codes &&
84 codes_[codes_used_ - 2].second != kCakra &&
85 (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
86 codes_[codes_used_ + 1].second == kPengkal ||
87 codes_[codes_used_ + 1].second == kCakra)) {
88 // This combination will be picked up later.
90 } else {
91 // Half-form with optional Nukta.
// len = number of output_ entries at or after output_used_, plus one.
92 unsigned len = output_.size() + 1 - output_used_;
93 if (UseMultiCode(len)) return true;
94 }
// NOTE(review): the "Sinhala" wording in the message and comment below appears
// to be inherited from validate_indic.cpp (this file's header says the rules
// were copied from there); the code here tests the pending output for kCakra.
95 if (codes_used_ < num_codes &&
97 if (output_used_ == output_.size() ||
98 output_[output_used_] != kCakra) {
99 if (report_errors_) {
100 tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
101 static_cast<int>(script_));
102 }
103 return false;
104 }
105 // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
106 if (UseMultiCode(4)) return true;
107 }
108 } else if (codes_used_ == num_codes ||
110 post_matra) {
// Taken, among other cases, at end of input or for a post-matra virama.
111 if (codes_used_ == num_codes ||
113 // It is valid to have an unterminated virama at the end of a word, but
114 // for consistency, we will always add ZWNJ if not present.
116 } else {
118 }
119 // Explicit virama [H z]
120 MultiCodePart(2);
121 }
// Branch 2: the caller passed a real pre-virama joiner.
122 } else {
123 // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// Unlike the no-joiner branch, a true result from UseMultiCode is an error
// here (see the message printed below).
124 if (UseMultiCode(2)) {
125 if (report_errors_)
126 tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
127 return false;
128 }
// A further joiner right after joiner + virama is rejected; the message prints
// the incoming joiner, the last output code and the offending code.
129 if (codes_[codes_used_].second == kZeroWidthJoiner ||
131 if (report_errors_) {
132 tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
133 codes_[codes_used_].second);
134 }
135 return false;
136 }
137 }
138 // It is good so far as it goes.
139 return true;
140}
141
142// Helper consumes/copies a series of consonants separated by viramas while
143// valid, but not any vowel or other modifiers.
// The only false return visible here is when ConsumeViramaIfValid rejects a
// virama. The function returns true when the consonant run ends normally and
// also when a joiner turns out to be the last input code.
144bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
145 const unsigned num_codes = codes_.size();
146 // Consonant aksara
147 do {
// NOTE(review): the "Sinhala" / "Yayana/Rayana" wording below appears to be
// inherited from validate_indic.cpp; the code checks for kPengkal or kCakra
// preceded by a virama and a ZWJ in output_.
149 // Special Sinhala case of [H Z Yayana/Rayana].
// index is only used as a subscript after the size guard on the next line
// passes (&& short-circuits), so it is never used when output_ has fewer than
// 3 entries.
150 int index = output_.size() - 3;
151 if (output_used_ + 3 <= output_.size() &&
152 (output_.back() == kPengkal || output_.back() == kCakra) &&
153 IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
154 MultiCodePart(3);
155 }
// have_nukta doubles as a 0/1 count in the size and index arithmetic below.
156 bool have_nukta = false;
157 if (codes_used_ < num_codes &&
159 have_nukta = true;
161 }
162 // Test for subscript conjunct.
// As above, index is only dereferenced once the size guard has passed.
163 index = output_.size() - 2 - have_nukta;
164 if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
165 IsVirama(output_[index])) {
166 // Output previous virama, consonant + optional nukta.
167 MultiCodePart(2 + have_nukta);
168 }
// joiner.first == CharClass::kOther means "no joiner seen".
169 IndicPair joiner(CharClass::kOther, 0);
170 if (codes_used_ < num_codes &&
171 (codes_[codes_used_].second == kZeroWidthJoiner ||
// The joiner is consumed from the input here but not yet copied to output_.
174 joiner = codes_[codes_used_];
// A joiner that is the last input code is dropped and the head is accepted.
175 if (++codes_used_ == num_codes) {
176 if (report_errors_) {
177 tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
178 joiner.second);
179 }
180 return true;
181 }
// Only a joiner directly followed by a virama is kept in output_; any other
// joiner is discarded and joiner is reset to the "no joiner" value.
182 if (codes_[codes_used_].first == CharClass::kVirama) {
183 output_.push_back(joiner.second);
184 } else {
185 if (report_errors_) {
186 tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
187 output_.back(), joiner.second, codes_[codes_used_].second);
188 }
189 joiner = std::make_pair(CharClass::kOther, 0);
190 }
191 }
192 if (codes_used_ < num_codes &&
// Head context, so post_matra is false.
194 if (!ConsumeViramaIfValid(joiner, false)) return false;
195 } else {
196 break; // No virama, so the run of consonants is over.
197 }
198 } while (codes_used_ < num_codes &&
// MultiCodePart(1) is called only if output_ holds entries beyond output_used_.
200 if (output_used_ < output_.size()) MultiCodePart(1);
201 return true;
202}
203
204// Helper consumes/copies a tail part of a consonant, comprising optional
205// matra/piece, vowel modifier, vedic mark, terminating virama.
// The only false return visible here is when ConsumeViramaIfValid rejects a
// trailing virama. Each "if (UseMultiCode(1)) return true;" ends the grapheme
// successfully (presumably because the input is exhausted - confirm in
// validator.h).
206bool ValidateJavanese::ConsumeConsonantTailIfValid() {
// Nothing left after the consonant head: the grapheme is already complete.
207 if (codes_used_ == codes_.size()) return true;
208 // No virama: Finish the grapheme.
209 // Are multiple matras allowed?
210 if (codes_[codes_used_].first == CharClass::kMatra) {
211 if (UseMultiCode(1)) return true;
213 if (UseMultiCode(1)) return true;
214 }
215 }
216 // Tarung also used for long versions of u and o vowels and vocalic r
217 // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
// Any number of consecutive kMatraPiece codes is accepted.
218 while (codes_[codes_used_].first == CharClass::kMatraPiece) {
219 if (UseMultiCode(1)) return true;
220 }
222 if (UseMultiCode(1)) return true;
// NOTE(review): presumably script_ is never kMalayalam for this validator,
// which would make the break below unconditional - confirm.
223 // Only Malayalam allows only repeated 0xd02.
224 if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
225 }
// Any number of consecutive kVedicMark codes is accepted.
226 while (codes_[codes_used_].first == CharClass::kVedicMark) {
227 if (UseMultiCode(1)) return true;
228 }
// A virama in tail position is validated with no joiner and post_matra = true.
229 if (codes_[codes_used_].first == CharClass::kVirama) {
230 if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
231 return false;
232 }
233 }
234 // What we have consumed so far is a valid consonant cluster.
// MultiCodePart(1) is called only if output_ holds entries beyond output_used_.
235 if (output_used_ < output_.size()) MultiCodePart(1);
236
237 return true;
238}
239
240// Helper consumes/copies a vowel and optional modifiers.
// Every return visible here is true: either early, when UseMultiCode reports
// true (presumably because the input is exhausted - confirm in validator.h),
// or at the end once the optional modifiers and vedic marks are consumed.
241bool ValidateJavanese::ConsumeVowelIfValid() {
// Consume the vowel itself.
242 if (UseMultiCode(1)) return true;
244 if (UseMultiCode(1)) return true;
// NOTE(review): presumably script_ is never kMalayalam for this validator,
// which would make the break below unconditional - confirm.
245 // Only Malayalam allows repeated modifiers?
246 if (script_ != ViramaScript::kMalayalam) break;
247 }
// Any number of consecutive kVedicMark codes is accepted.
248 while (codes_[codes_used_].first == CharClass::kVedicMark) {
249 if (UseMultiCode(1)) return true;
250 }
251 // What we have consumed so far is a valid vowel cluster.
252 return true;
253}
254
255
259 // Offset from the start of the relevant unicode code block aka code page.
260 int off = ch - static_cast<char32>(script_);
261 // Anything in another code block is other.
262 if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
263 if (off < 0x4) return CharClass::kVowelModifier;
264 if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
265 if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
266 if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
267 if (off <= 0x39) return CharClass::kMatra;
268 if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel
269 if (off <= 0x3d) return CharClass::kMatra;
270 if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
271 if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
272 return CharClass::kOther;
273}
274
275} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
ViramaScript script_
Definition: validator.h:229
std::vector< char32 > output_
Definition: validator.h:235
unsigned output_used_
Definition: validator.h:239
unsigned codes_used_
Definition: validator.h:237
bool UseMultiCode(unsigned length)
Definition: validator.h:195
void MultiCodePart(unsigned length)
Definition: validator.h:181
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
static const int kIndicCodePageSize
Definition: validator.h:213
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:133
bool IsSubscriptScript() const
Definition: validator.cpp:198
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kZeroWidthJoiner
Definition: validator.h:96