tesseract 4.1.1
Loading...
Searching...
No Matches
validate_myanmar.cpp
Go to the documentation of this file.
1#include "validate_myanmar.h"
2#include "errcode.h"
3#include "icuerrorcode.h"
4#include "tprintf.h"
5#include "unicode/uchar.h" // From libicu
6#include "unicode/uscript.h" // From libicu
7
8namespace tesseract {
9
10// Returns whether the codes starting at codes_[codes_used_] match the pattern
// for a Myanmar Grapheme, consuming them via UseMultiCode as it goes.
// Returns true when every code has already been used or when a grapheme is
// accepted. Returns false only when the code at the current position is
// accepted by neither IsMyanmarOther nor IsMyanmarLetter.
11// Taken directly from the unicode table 16-3.
12// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// NOTE(review): the listing numbers jump from 12 to 14, so the line holding
// the function signature is not visible here; presumably it is
// ValidateMyanmar::ConsumeGraphemeIfValid() - confirm against the real file.
14 const unsigned num_codes = codes_.size();
15 if (codes_used_ == num_codes) return true;
16 // Other: a code accepted by IsMyanmarOther is passed to UseMultiCode on its
// own. The result of UseMultiCode is not checked because true is returned
// either way.
17 if (IsMyanmarOther(codes_[codes_used_].second)) {
18 UseMultiCode(1);
19 return true;
20 }
21 // Kinzi: the three-code sequence 0x1004, kMyanmarAsat, kMyanmarVirama.
// The index check guarantees all three codes exist before they are read.
22 if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
23 codes_[codes_used_ + 1].second == kMyanmarAsat &&
24 codes_[codes_used_ + 2].second == kMyanmarVirama) {
// NOTE(review): listing lines 25-26 are missing from this extract.
27 if (UseMultiCode(3)) return true;
28 }
29 // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
30 // optional, except the base, this is the only place where invalid input can
31 // be detected and false returned.
32 if (IsMyanmarLetter(codes_[codes_used_].second)) {
33 if (UseMultiCode(1)) return true;
34 } else {
// The message is printed only when report_errors_ is set; the failure itself
// is always reported through the return value.
35 if (report_errors_) {
36 tprintf("Invalid start of Myanmar syllable:0x%x\n",
37 codes_[codes_used_].second);
38 }
39 return false; // One of these is required.
40 }
// Both helpers are documented as returning true at end of input. The result
// of the second one is not needed because true is returned regardless.
41 if (ConsumeSubscriptIfPresent()) return true;
42 ConsumeOptionalSignsIfPresent();
43 // What we have consumed so far is a valid syllable.
44 return true;
45}
46
47// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
48// is little correspondence between the content of table 16-3 and the char
49// classes of the Indic languages. (Experts may disagree and improve!)
50// In unicode table 16-3 there is basically a long list of optional characters,
51// which can be coded quite easily.
52// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
53// The table also allows sequences that still result in dotted circles!!
54// So with a lot of guesswork the rest have been added in a reasonable place.
// NOTE(review): the listing numbers jump from 54 to 56, so the line holding
// the function signature is not visible here; presumably it is
// ValidateMyanmar::UnicodeToCharClass(char32 ch) const - confirm against the
// real file.
// The visible body uses only two classes: CharClass::kConsonant for anything
// IsMyanmarLetter accepts, and CharClass::kOther for every other code.
56 if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
57 return CharClass::kOther;
58}
59
60// Helper consumes/copies a virama and any subscript consonant.
61// Returns true if the end of input is reached.
// Returns false in every other case, whether or not a subscript was consumed.
62bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
63 // Subscript consonant. It appears there can be only one.
64 const unsigned num_codes = codes_.size();
// At least two unused codes must remain, because the code at
// codes_used_ + 1 is read below.
// NOTE(review): listing line 66 (the rest of this condition) is missing from
// this extract; presumably it tests codes_[codes_used_] for the virama -
// confirm against the real file.
65 if (codes_used_ + 1 < num_codes &&
// NOTE(review): listing line 68 is also missing from this extract.
67 if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
// Two codes are passed to UseMultiCode together; a true result is propagated
// to the caller as "end of input".
69 if (UseMultiCode(2)) return true;
70 }
71 }
72 return false;
73}
74
75// Helper consumes/copies a series of optional signs.
76// Returns true if the end of input is reached.
77bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
78 // The following characters are allowed, all optional, and in sequence.
79 // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
80 const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
81 0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
82 0x1081, 0x1031});
83 for (char32 ch : kMedials) {
84 if (codes_[codes_used_].second == ch) {
85 if (UseMultiCode(1)) return true;
86 if (ch == kMyanmarMedialYa &&
87 codes_[codes_used_].second == kMyanmarAsat) {
88 if (UseMultiCode(1)) return true;
89 }
90 }
91 }
92 // Vowel sign i, ii, ai.
93 char32 ch = codes_[codes_used_].second;
94 if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
95 if (UseMultiCode(1)) return true;
96 }
97 // Vowel sign u, uu, and extensions.
98 ch = codes_[codes_used_].second;
99 if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
100 ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
101 (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
102 ch == 0x109c || ch == 0x109d) {
103 if (UseMultiCode(1)) return true;
104 }
105 // Tall aa, aa with optional asat.
106 if (codes_[codes_used_].second == 0x102b ||
107 codes_[codes_used_].second == 0x102c) {
108 if (UseMultiCode(1)) return true;
109 if (codes_[codes_used_].second == kMyanmarAsat) {
110 if (UseMultiCode(1)) return true;
111 }
112 }
113 // The following characters are allowed, all optional, and in sequence.
114 // Anusvar, Dot below, Visarga
115 const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038});
116 for (char32 ch : kSigns) {
117 if (codes_[codes_used_].second == ch) {
118 if (UseMultiCode(1)) return true;
119 }
120 }
121 // Tone mark extensions.
122 ch = codes_[codes_used_].second;
123 if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
124 (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
125 ch == 0x108f || ch == 0x109a || ch == 0x109b ||
126 (0xaa7b <= ch && ch <= 0xaa7d)) {
127 if (UseMultiCode(1)) return true;
128 }
129 return false;
130}
131
132// Returns true if the unicode is a Myanmar "letter" including consonants
133// and independent vowels. Although table 16-3 distinguishes between some
134// base consonants and vowels, the extensions make no such distinction, so we
135// put them all into a single bucket.
136// Update MYANMAR LETTER based on following:
137// https://unicode.org/charts/PDF/U1000.pdf - Myanmar
138// http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A
139// http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B
140/* static */
141bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
142 return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
143 (0x104c <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
144 ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
145 (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) ||
146 ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9e4) ||
147 (0xa9e7 <= ch && ch <= 0xa9ef) || (0xa9fa <= ch && ch <= 0xa9fe) ||
148 (0xaa60 <= ch && ch <= 0xaa6f) || (0xaa71 <= ch && ch <= 0xaa73) ||
149 ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
150}
151
152// Returns true if ch is a Myanmar digit or other symbol that does not take
153// part in being a syllable, e.g. punctuation marks.
154// MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM
155// REDUPLICATION MARKS
// Also returns true for codes whose ICU script is not Myanmar, subject to the
// exceptions tested in the first condition of the body.
// NOTE(review): 0x104c-0x104f are accepted both here and by IsMyanmarLetter.
// ConsumeGraphemeIfValid tests IsMyanmarOther first, so there they are
// treated as "other".
156/* static */
157bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
// err receives the ICU status from the script lookup below.
158 IcuErrorCode err;
159 UScriptCode script_code = uscript_getScript(ch, err);
// NOTE(review): listing line 161 (the end of this condition) is missing from
// this extract; presumably it also excludes Validator::kZeroWidthNonJoiner -
// confirm against the real file.
160 if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
162 return true;
// Explicit inclusive code ranges (plus the two single codes 0xa9e6 and
// 0xaa70) that are treated as "other".
163 return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
164 (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
165 (ch == 0xa9e6 || ch == 0xaa70) || (0xaa74 <= ch && ch <= 0xaa79);
166}
167
168} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
signed int char32
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
unsigned codes_used_
Definition: validator.h:237
bool UseMultiCode(unsigned length)
Definition: validator.h:195
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static const char32 kMyanmarVirama
Definition: validator.h:222