tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::ValidateMyanmar Class Reference

#include <validate_myanmar.h>

Inheritance diagram for tesseract::ValidateMyanmar:
tesseract::Validator

Public Member Functions

 ValidateMyanmar (ViramaScript script, bool report_errors)
 
 ~ValidateMyanmar ()
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
Validator::CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
virtual bool ConsumeGraphemeIfValid ()=0
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
virtual CharClass UnicodeToCharClass (char32 ch) const =0
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum class  CharClass {
  kConsonant = 'C' , kVowel = 'V' , kVirama = 'H' , kMatra = 'M' ,
  kMatraPiece = 'P' , kVowelModifier = 'D' , kZeroWidthNonJoiner = 'z' , kZeroWidthJoiner = 'Z' ,
  kVedicMark = 'v' , kNukta = 'N' , kRobat = 'R' , kOther = 'O' ,
  kWhitespace = ' ' , kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 9 of file validate_myanmar.h.

Constructor & Destructor Documentation

◆ ValidateMyanmar()

tesseract::ValidateMyanmar::ValidateMyanmar ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 11 of file validate_myanmar.h.

12 : Validator(script, report_errors) {}
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:135

◆ ~ValidateMyanmar()

tesseract::ValidateMyanmar::~ValidateMyanmar ( )
inline

Definition at line 13 of file validate_myanmar.h.

13{}

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateMyanmar::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 13 of file validate_myanmar.cpp.

13 {
14 const unsigned num_codes = codes_.size();
15 if (codes_used_ == num_codes) return true;
16 // Other.
17 if (IsMyanmarOther(codes_[codes_used_].second)) {
18 UseMultiCode(1);
19 return true;
20 }
21 // Kinzi.
22 if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
23 codes_[codes_used_ + 1].second == kMyanmarAsat &&
24 codes_[codes_used_ + 2].second == kMyanmarVirama) {
25 ASSERT_HOST(!CodeOnlyToOutput());
26 ASSERT_HOST(!CodeOnlyToOutput());
27 if (UseMultiCode(3)) return true;
28 }
29 // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
30 // optional, except the base, this is the only place where invalid input can
31 // be detected and false returned.
32 if (IsMyanmarLetter(codes_[codes_used_].second)) {
33 if (UseMultiCode(1)) return true;
34 } else {
35 if (report_errors_) {
36 tprintf("Invalid start of Myanmar syllable:0x%x\n",
37 codes_[codes_used_].second);
38 }
39 return false; // One of these is required.
40 }
41 if (ConsumeSubscriptIfPresent()) return true;
42 ConsumeOptionalSignsIfPresent();
43 // What we have consumed so far is a valid syllable.
44 return true;
45}
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
unsigned codes_used_
Definition: validator.h:237
bool UseMultiCode(unsigned length)
Definition: validator.h:195
std::vector< IndicPair > codes_
Definition: validator.h:231
static const char32 kMyanmarVirama
Definition: validator.h:222

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateMyanmar::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 55 of file validate_myanmar.cpp.

55 {
56 if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
57 return CharClass::kOther;
58}

The documentation for this class was generated from the following files: