tesseract 4.1.1
Loading...
Searching...
No Matches
validator.h
Go to the documentation of this file.
/**********************************************************************
 * File: validator.h
 * Description: Base class for various text validators. Intended mainly for
 * scripts that use a virama character.
 * Author: Ray Smith
 *
 * (C) Copyright 2017, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
19
20#ifndef TESSERACT_TRAINING_VALIDATOR_H_
21#define TESSERACT_TRAINING_VALIDATOR_H_
22
23#include <memory>
24#include <vector>
25#include "unichar.h"
26
27namespace tesseract {
28
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
  // Validation result is a single string, even if input is multi-word.
  kSingleString,
  // Standard unicode graphemes are validated and output as grapheme units.
  kCombined,
  // Graphemes are validated and sub-divided. For virama-using scripts, units
  // that correspond to repeatable glyphs are generated. (Mostly single unicodes
  // but viramas and joiners are paired with the most sensible neighbor.)
  // For non-virama scripts, this means that base/accent pairs are separated,
  // ie the output is individual unicodes.
  kGlyphSplit,
  // The output is always single unicodes, regardless of the script.
  // NOTE(review): this enumerator's name is not referenced elsewhere in this
  // header; confirm the spelling against the callers.
  kIndividualUnicodes,
};
47
48// An enum representing the scripts that use a virama character. It is
49// guaranteed that the value of any element, (except kNonVirama) can be cast
50// to a unicode (char32) value that represents the start of the unicode range
51// of the corresponding script.
52enum class ViramaScript : char32 {
53 kNonVirama = 0,
54 kDevanagari = 0x900,
55 kBengali = 0x980,
56 kGurmukhi = 0xa00,
57 kGujarati = 0xa80,
58 kOriya = 0xb00,
59 kTamil = 0xb80,
60 kTelugu = 0xc00,
61 kKannada = 0xc80,
62 kMalayalam = 0xd00,
63 kSinhala = 0xd80,
64 kMyanmar = 0x1000,
65 kKhmer = 0x1780,
66 kJavanese = 0xa980,
67};
68
69// Base class offers a validation API and protected methods to allow subclasses
70// to easily build the validated/segmented output.
71class Validator {
72 public:
73 // Validates and cleans the src vector of unicodes to the *dest, according to
74 // g_mode. In the case of kSingleString, a single vector containing the whole
75 // result is added to *dest. With kCombined, multiple vectors are added to
76 // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
77 // added to *dest with a smaller unit representing a glyph in each.
78 // In case of validation error, returns false and as much as possible of the
79 // input, without discarding invalid text.
81 bool report_errors,
82 const std::vector<char32>& src,
83 std::vector<std::vector<char32>>* dest);
84
85 // Returns true if the unicode ch is a non-printing zero-width mark of no
86 // significance to OCR training or evaluation.
87 static bool IsZeroWidthMark(char32 ch) {
88 return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
89 ch == kRightToLeftMark || ch == kInvalid;
90 }
91 virtual ~Validator();
92
93 // Some specific but universally useful unicodes.
94 static const char32 kZeroWidthSpace;
99 static const char32 kInvalid;
100
101 protected:
102 // These are more or less the character class identifiers in the ISCII
103 // standard, section 8. They have been augmented with the Unicode meta
104 // characters Zero Width Joiner and Zero Width Non Joiner, and the
105 // Unicode Vedic Marks.
106 // The best sources of information on Unicode and Indic scripts are:
107 // http://varamozhi.sourceforge.net/iscii91.pdf
108 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
109 // http://unicode.org/faq/indic.html
110 // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
111 enum class CharClass {
112 // NOTE: The values of the enum members are meaningless and arbitrary, ie
113 // they are not used for sorting, or any other risky application.
114 // The reason they are what they are is they are a single character
115 // abbreviation that can be used in a regexp/BNF definition of a grammar,
116 // IN A COMMENT, and still not relied upon in the code.
117 kConsonant = 'C',
118 kVowel = 'V',
119 kVirama = 'H', // (aka Halant)
120 kMatra = 'M', // (aka Dependent Vowel)
121 kMatraPiece = 'P', // unicode provides pieces of Matras.
122 kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
123 kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
124 kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
125 kVedicMark = 'v', // Modifiers can come modify any indic syllable.
126 kNukta = 'N', // Occurs only immediately after consonants.
127 kRobat = 'R', // Khmer only.
128 kOther = 'O', // (digits, measures, non-Indic, etc)
129 // Additional classes used only by ValidateGrapheme.
130 kWhitespace = ' ',
131 kCombiner = 'c', // Combiners other than virama.
132 };
133 using IndicPair = std::pair<CharClass, char32>;
134
135 Validator(ViramaScript script, bool report_errors)
136 : script_(script),
137 codes_used_(0),
138 output_used_(0),
139 report_errors_(report_errors) {}
140
141 // Factory method that understands how to map script to the right subclass.
142 static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
143 bool report_errors);
144
145 // Internal version of the public static ValidateCleanAndSegment.
146 // Validates and cleans the src vector of unicodes to the *dest, according to
147 // its type and the given g_mode.
148 // In case of validation error, returns false and returns as much as possible
149 // of the input, without discarding invalid text.
151 const std::vector<char32>& src,
152 std::vector<std::vector<char32>>* dest);
153 // Moves the results from parts_ or output_ to dest according to g_mode.
155 std::vector<std::vector<char32>>* dest);
156
157 // Computes and returns the ViramaScript corresponding to the most frequent
158 // virama-using script in the input, or kNonVirama if none are present.
160 const std::vector<char32>& utf32);
161 // Returns true if the given UTF-32 unicode is a "virama" character.
162 static bool IsVirama(char32 unicode);
163 // Returns true if the given UTF-32 unicode is a vedic accent.
164 static bool IsVedicAccent(char32 unicode);
165 // Returns true if the script is one that uses subscripts for conjuncts.
166 bool IsSubscriptScript() const;
167
168 // Helper function appends the next element of codes_ only to output_,
169 // without touching parts_
170 // Returns true at the end of codes_.
172 output_.push_back(codes_[codes_used_].second);
173 return ++codes_used_ == codes_.size();
174 }
175
176 // Helper function adds a length-element vector to parts_ from the last length
177 // elements of output_. If there are more than length unused elements in
178 // output_, adds unicodes as single-element vectors to parts_ to catch
179 // output_used_ up to output->size() - length before adding the length-element
180 // vector.
181 void MultiCodePart(unsigned length) {
182 while (output_used_ + length < output_.size()) {
183 parts_.emplace_back(
184 std::initializer_list<char32>{output_[output_used_++]});
185 }
186 parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
187 while (++output_used_ < output_.size()) {
188 parts_.back().push_back(output_[output_used_]);
189 }
190 }
191
192 // Helper function appends the next element of codes_ to output_, and then
193 // calls MultiCodePart to add the appropriate components to parts_.
194 // Returns true at the end of codes_.
195 bool UseMultiCode(unsigned length) {
196 output_.push_back(codes_[codes_used_].second);
197 MultiCodePart(length);
198 return ++codes_used_ == codes_.size();
199 }
200
201 // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
202 // parts_ and output_. Returns true if a valid Grapheme was consumed,
203 // otherwise does not increment codes_used_.
204 virtual bool ConsumeGraphemeIfValid() = 0;
205 // Sets codes_ to the class codes for the given unicode text.
206 void ComputeClassCodes(const std::vector<char32>& text);
207 // Returns the CharClass corresponding to the given Unicode ch.
208 virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
209 // Resets to the initial state.
210 void Clear();
211
212 // Number of unicodes in each Indic codepage.
213 static const int kIndicCodePageSize = 128;
214 // Lowest unicode value of any Indic script. (Devanagari).
215 static const char32 kMinIndicUnicode = 0x900;
216 // Highest unicode value of any consistent (ISCII-based) Indic script.
217 static const char32 kMaxSinhalaUnicode = 0xdff;
218 // Highest unicode value of any virama-using script. (Khmer).
219 static const char32 kMaxViramaScriptUnicode = 0x17ff;
220 // Some special unicodes.
221 static const char32 kSinhalaVirama = 0xdca;
222 static const char32 kMyanmarVirama = 0x1039;
223 static const char32 kKhmerVirama = 0x17d2;
224 // Javanese Script - aksarajawa
225 static const char32 kJavaneseVirama = 0xa9c0;
226 static const char32 kMaxJavaneseUnicode = 0xa9df;
227
228 // Script we are operating on.
230 // Input unicodes with assigned CharClass is the data to be validated.
231 std::vector<IndicPair> codes_;
232 // Glyph-like components of the input.
233 std::vector<std::vector<char32>> parts_;
234 // Copied validated unicodes from codes_ that are OK to output.
235 std::vector<char32> output_;
236 // The number of elements of codes_ that have been processed so far.
237 unsigned codes_used_;
238 // The number of elements of output_ that have already been added to parts_.
239 unsigned output_used_;
240 // Log error messages for reasons why text is invalid.
242};
243
244} // namespace tesseract
245
246#endif // TESSERACT_TRAINING_VALIDATOR_H_
GraphemeNormMode
Definition: validator.h:33
signed int char32
Definition: unichar.h:51
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:143
static const char32 kSinhalaVirama
Definition: validator.h:221
static const char32 kZeroWidthNonJoiner
Definition: validator.h:95
static const char32 kKhmerVirama
Definition: validator.h:223
ViramaScript script_
Definition: validator.h:229
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
Definition: validator.h:225
std::vector< char32 > output_
Definition: validator.h:235
static const char32 kInvalid
Definition: validator.h:99
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:97
static const char32 kRightToLeftMark
Definition: validator.h:98
unsigned output_used_
Definition: validator.h:239
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
unsigned codes_used_
Definition: validator.h:237
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:71
bool UseMultiCode(unsigned length)
Definition: validator.h:195
void MultiCodePart(unsigned length)
Definition: validator.h:181
static const char32 kMaxViramaScriptUnicode
Definition: validator.h:219
static const char32 kLeftToRightMark
Definition: validator.h:97
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
static const int kIndicCodePageSize
Definition: validator.h:213
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:133
static const char32 kZeroWidthSpace
Definition: validator.h:94
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:87
bool IsSubscriptScript() const
Definition: validator.cpp:198
std::vector< IndicPair > codes_
Definition: validator.h:231
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:114
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:206
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
Definition: validator.cpp:40
static const char32 kZeroWidthJoiner
Definition: validator.h:96
static const char32 kMaxJavaneseUnicode
Definition: validator.h:226
static const char32 kMaxSinhalaUnicode
Definition: validator.h:217
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:135
static const char32 kMyanmarVirama
Definition: validator.h:222
static const char32 kMinIndicUnicode
Definition: validator.h:215
std::vector< std::vector< char32 > > parts_
Definition: validator.h:233
virtual bool ConsumeGraphemeIfValid()=0