tesseract 4.1.1
Loading...
Searching...
No Matches
validate_khmer.cpp
Go to the documentation of this file.
1#include "validate_khmer.h"
2#include "errcode.h"
3#include "tprintf.h"
4
5namespace tesseract {
6
7// Returns whether the codes match the pattern for a Khmer grapheme.
8// Taken from the Unicode standard:
9// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
10// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
11// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
12// Translated to the codes used by the CharClass enum:
13// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
14// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
15// Also the Consonant class here includes independent vowels, as they are
16// treated the same anyway.
17// In the split grapheme mode, the only characters that get grouped are the
18// HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners in
19// the BNF syntax, so their intended effect is unclear.
21 const unsigned num_codes = codes_.size();
22 if (codes_used_ == num_codes) return false;
23 if (codes_[codes_used_].first == CharClass::kOther) {
24 UseMultiCode(1);
25 return true;
26 }
28 if (report_errors_) {
29 tprintf("Invalid start of Khmer syllable:0x%x\n",
30 codes_[codes_used_].second);
31 }
32 return false;
33 }
34 if (UseMultiCode(1)) return true;
35 if (codes_[codes_used_].first == CharClass::kRobat ||
37 if (UseMultiCode(1)) return true;
38 }
39 while (codes_used_ + 1 < num_codes &&
43 if (UseMultiCode(2)) return true;
44 if (codes_[codes_used_].first == CharClass::kRobat) {
45 if (UseMultiCode(1)) return true;
46 }
47 }
48 unsigned num_matra_parts = 0;
49 if (codes_[codes_used_].second == kZeroWidthJoiner ||
51 if (CodeOnlyToOutput()) {
52 if (report_errors_) {
53 tprintf("Unterminated joiner: 0x%x\n", output_.back());
54 }
55 return false;
56 }
57 ++num_matra_parts;
58 }
59 // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
60 // own or as an addition to other matras.
61 if (codes_[codes_used_].first == CharClass::kMatra ||
63 ++num_matra_parts;
64 if (UseMultiCode(num_matra_parts)) return true;
65 } else if (num_matra_parts) {
66 if (report_errors_) {
67 tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
68 output_.back(), codes_[codes_used_].second);
69 }
70 return false;
71 }
74 if (UseMultiCode(1)) return true;
75 }
77 if (UseMultiCode(1)) return true;
78 }
79 if (codes_used_ + 1 < num_codes &&
83 if (UseMultiCode(2)) return true;
84 }
85 return true;
86}
87
92 // Offset from the start of the relevant unicode code block aka code page.
93 int off = ch - static_cast<char32>(script_);
94 // Anything in another code block is other.
95 if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
96 if (off <= 0x33) return CharClass::kConsonant;
97 if (off <= 0x45) return CharClass::kMatra;
98 if (off == 0x46) return CharClass::kMatraPiece;
99 if (off == 0x4c) return CharClass::kRobat;
100 if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
101 if (off <= 0x51) return CharClass::kVowelModifier;
102 if (off == 0x52) return CharClass::kVirama;
103 return CharClass::kOther;
104}
105
106} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
ViramaScript script_
Definition: validator.h:229
std::vector< char32 > output_
Definition: validator.h:235
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
unsigned codes_used_
Definition: validator.h:237
bool UseMultiCode(unsigned length)
Definition: validator.h:195
static const int kIndicCodePageSize
Definition: validator.h:213
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kZeroWidthJoiner
Definition: validator.h:96