tesseract 4.1.1
Loading...
Searching...
No Matches
validator.cpp
Go to the documentation of this file.
1#include "validator.h"
2
3#include <algorithm>
4#include <unordered_map>
5#include <vector>
6#include <iterator>
7
8#include "icuerrorcode.h"
9#include "unicode/uchar.h" // From libicu
10#include "unicode/uscript.h" // From libicu
11#include "validate_grapheme.h"
12#include "validate_indic.h"
13#include "validate_javanese.h"
14#include "validate_khmer.h"
15#include "validate_myanmar.h"
16
17namespace tesseract {
18
19// Some specific but universally useful unicodes.
25const char32 Validator::kInvalid = 0xfffd;
26
27// Destructor.
28// It is defined here, so the compiler can create a single vtable
29// instead of weak vtables in every compilation unit.
30Validator::~Validator() = default;
31
32// Validates and cleans the src vector of unicodes to the *dest, according to
33// g_mode. In the case of kSingleString, a single vector containing the whole
34// result is added to *dest. With kCombined, multiple vectors are added to
35// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
36// added to *dest with a smaller unit representing a glyph in each.
37// In case of validation error, returns false and as much as possible of the
38// input, without discarding invalid text.
39/* static */
41 GraphemeNormMode g_mode, bool report_errors, const std::vector<char32>& src,
42 std::vector<std::vector<char32>>* dest) {
43 ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
44 std::vector<std::vector<char32>> graphemes;
46 bool success = true;
47 if (script == ViramaScript::kNonVirama) {
48 // The grapheme segmenter's maximum segmentation is the grapheme unit, so
49 // up the mode by 1 to get the desired effect.
50 if (g_mode == GraphemeNormMode::kCombined)
52 else if (g_mode == GraphemeNormMode::kGlyphSplit)
54 // Just do grapheme segmentation.
55 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
56 } else {
57 success = g_validator.ValidateCleanAndSegmentInternal(
58 GraphemeNormMode::kGlyphSplit, src, &graphemes);
59 std::unique_ptr<Validator> validator(
60 ScriptValidator(script, report_errors));
61 for (const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
63 success = false;
64 }
65 }
66 }
67 return success;
68}
69
70// Factory method that understands how to map script to the right subclass.
71std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
72 bool report_errors) {
73 switch (script) {
75 return std::unique_ptr<Validator>(
76 new ValidateGrapheme(script, report_errors));
78 return std::unique_ptr<Validator>(
79 new ValidateJavanese(script, report_errors));
81 return std::unique_ptr<Validator>(
82 new ValidateMyanmar(script, report_errors));
84 return std::unique_ptr<Validator>(
85 new ValidateKhmer(script, report_errors));
86 default:
87 return std::unique_ptr<Validator>(
88 new ValidateIndic(script, report_errors));
89 }
90}
91
92// Internal version of the public static ValidateCleanAndSegment.
93// Validates and cleans the src vector of unicodes to the *dest, according to
94// its type and the given g_mode.
95// In case of validation error, returns false and returns as much as possible
96// of the input, without discarding invalid text.
98 GraphemeNormMode g_mode, const std::vector<char32>& src,
99 std::vector<std::vector<char32>>* dest) {
100 Clear();
102 bool success = true;
103 for (codes_used_ = 0; codes_used_ < codes_.size();) {
104 if (!ConsumeGraphemeIfValid()) {
105 success = false;
106 ++codes_used_;
107 }
108 }
109 MoveResultsToDest(g_mode, dest);
110 return success;
111}
112
113// Moves the results from parts_ or output_ to dest according to g_mode.
115 std::vector<std::vector<char32>>* dest) {
117 // Append each element of the combined output_ that we made as a new vector
118 // in dest.
119 dest->reserve(dest->size() + output_.size());
120 for (char32 ch : output_) dest->push_back({ch});
121 } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
122 // Append all the parts_ that we made onto dest.
123 std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
124 } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
125 // Append the combined output_ that we made onto dest as one new vector.
126 dest->push_back(std::vector<char32>());
127 output_.swap(dest->back());
128 } else { // kNone.
129 // Append the combined output_ that we made onto the last existing element
130 // of dest.
131 dest->back().insert(dest->back().end(), output_.begin(), output_.end());
132 }
133}
134
// Strict "less than" ordering on the second member of two int pairs; the first
// member is ignored. Used with std::max_element to find the histogram entry
// with the largest count.
static bool CmpPairSecond(const std::pair<int, int>& p1,
                          const std::pair<int, int>& p2) {
  return p1.second < p2.second;
}
139
140// Computes and returns the ViramaScript corresponding to the most frequent
141// virama-using script in the input, or kNonVirama if none are present.
142/* static */
144 const std::vector<char32>& utf32) {
145 std::unordered_map<int, int> histogram;
146 for (char32 ch : utf32) {
147 // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
148 // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
149 // unicode code space, so use its script id.
150 int base = ch / kIndicCodePageSize;
151 IcuErrorCode err;
152 UScriptCode script_code = uscript_getScript(ch, err);
153 if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
157 base = static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
158 ++histogram[base];
159 }
160 }
161 if (!histogram.empty()) {
162 int base =
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
164 ->first;
165 char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
166 // Check for validity.
167 if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
168 codebase == static_cast<char32>(ViramaScript::kJavanese) ||
169 codebase == static_cast<char32>(ViramaScript::kKhmer) ||
170 (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
171 codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
172 return static_cast<ViramaScript>(codebase);
173 }
174 }
176}
177
178// Returns true if the given UTF-32 unicode is a "virama" character.
179/* static */
181 return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
182 (unicode & 0x7f) == 0x4d) ||
183 unicode == kSinhalaVirama ||
184 unicode == kJavaneseVirama ||
185 unicode == kMyanmarVirama ||
186 unicode == kKhmerVirama;
187}
188
189// Returns true if the given UTF-32 unicode is a vedic accent.
190/* static */
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
195}
196
197// Returns true if the script is one that uses subscripts for conjuncts.
// NOTE(review): this definition is incomplete in this listing. The signature
// line (original line 198, declared as `bool IsSubscriptScript() const`) and
// the remaining operands of the || chain (original lines 200-203) are
// missing, so only the kTelugu comparison is visible. Restore the missing
// lines from the upstream file before building.
199return script_ == ViramaScript::kTelugu ||
// NOTE(review): four further lines of the expression are missing here.
204}
205
206void Validator::ComputeClassCodes(const std::vector<char32>& text) {
207 codes_.reserve(text.size());
208 for (char32 c : text) {
209 codes_.push_back(std::make_pair(UnicodeToCharClass(c), c));
210 }
211}
212
213// Resets to the initial state.
215 codes_.clear();
216 parts_.clear();
217 output_.clear();
218 codes_used_ = 0;
219 output_used_ = 0;
220}
221
222} // namespace tesseract
signed int char32
GraphemeNormMode
Definition: validator.h:33
signed int char32
Definition: unichar.h:51
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:143
static const char32 kSinhalaVirama
Definition: validator.h:221
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
static const char32 kKhmerVirama
Definition: validator.h:223
ViramaScript script_
Definition: validator.h:229
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
Definition: validator.h:225
std::vector< char32 > output_
Definition: validator.h:235
static const char32 kInvalid
Definition: validator.h:99
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:97
static const char32 kRightToLeftMark
Definition: validator.h:98
unsigned output_used_
Definition: validator.h:239
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
unsigned codes_used_
Definition: validator.h:237
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:71
static const char32 kLeftToRightMark
Definition: validator.h:97
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
static const int kIndicCodePageSize
Definition: validator.h:213
static const char32 kZeroWidthSpace
Definition: validator.h:94
bool IsSubscriptScript() const
Definition: validator.cpp:198
std::vector< IndicPair > codes_
Definition: validator.h:231
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:114
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:206
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:40
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static const char32 kMaxJavaneseUnicode
Definition: validator.h:226
static const char32 kMaxSinhalaUnicode
Definition: validator.h:217
static const char32 kMyanmarVirama
Definition: validator.h:222
static const char32 kMinIndicUnicode
Definition: validator.h:215
std::vector< std::vector< char32 > > parts_
Definition: validator.h:233
virtual bool ConsumeGraphemeIfValid()=0