tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::UNICHAR Class Reference

#include <unichar.h>

Classes

class  const_iterator
 

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 
static const_iterator begin (const char *utf8_str, int byte_length)
 
static const_iterator end (const char *utf8_str, int byte_length)
 
static std::vector< char32 > UTF8ToUTF32 (const char *utf8_str)
 
static std::string UTF32ToUTF8 (const std::vector< char32 > &str32)
 

Detailed Description

Definition at line 57 of file unichar.h.

Constructor & Destructor Documentation

◆ UNICHAR() [1/3]

tesseract::UNICHAR::UNICHAR ( )
inline

Definition at line 59 of file unichar.h.

59 {
60 memset(chars, 0, UNICHAR_LEN);
61 }
#define UNICHAR_LEN
Definition: unichar.h:30

◆ UNICHAR() [2/3]

tesseract::UNICHAR::UNICHAR ( const char *  utf8_str,
int  len 
)

Definition at line 32 of file unichar.cpp.

32 {
33 int total_len = 0;
34 int step = 0;
35 if (len < 0) {
36 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
37 }
38 for (total_len = 0; total_len < len; total_len += step) {
39 step = utf8_step(utf8_str + total_len);
40 if (total_len + step > UNICHAR_LEN)
41 break; // Too long.
42 if (step == 0)
43 break; // Illegal first byte.
44 int i;
45 for (i = 1; i < step; ++i)
46 if ((utf8_str[total_len + i] & 0xc0) != 0x80)
47 break;
48 if (i < step)
49 break; // Illegal surrogate
50 }
51 memcpy(chars, utf8_str, total_len);
52 if (total_len < UNICHAR_LEN) {
53 chars[UNICHAR_LEN - 1] = total_len;
54 while (total_len < UNICHAR_LEN - 1)
55 chars[total_len++] = 0;
56 }
57}
char * utf8_str() const
Definition: unichar.cpp:129
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138

◆ UNICHAR() [3/3]

tesseract::UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 61 of file unichar.cpp.

61 {
62 const int bytemask = 0xBF;
63 const int bytemark = 0x80;
64
65 if (unicode < 0x80) {
66 chars[UNICHAR_LEN - 1] = 1;
67 chars[2] = 0;
68 chars[1] = 0;
69 chars[0] = static_cast<char>(unicode);
70 } else if (unicode < 0x800) {
71 chars[UNICHAR_LEN - 1] = 2;
72 chars[2] = 0;
73 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
74 unicode >>= 6;
75 chars[0] = static_cast<char>(unicode | 0xc0);
76 } else if (unicode < 0x10000) {
77 chars[UNICHAR_LEN - 1] = 3;
78 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
79 unicode >>= 6;
80 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81 unicode >>= 6;
82 chars[0] = static_cast<char>(unicode | 0xe0);
83 } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
84 chars[UNICHAR_LEN - 1] = 4;
85 chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
86 unicode >>= 6;
87 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
88 unicode >>= 6;
89 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
90 unicode >>= 6;
91 chars[0] = static_cast<char>(unicode | 0xf0);
92 } else {
93 memset(chars, 0, UNICHAR_LEN);
94 }
95}
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:24

Member Function Documentation

◆ begin()

UNICHAR::const_iterator tesseract::UNICHAR::begin ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 204 of file unichar.cpp.

204 {
205 return UNICHAR::const_iterator(utf8_str);
206}

◆ end()

UNICHAR::const_iterator tesseract::UNICHAR::end ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 208 of file unichar.cpp.

208 {
209 return UNICHAR::const_iterator(utf8_str + len);
210}

◆ first_uni()

int tesseract::UNICHAR::first_uni ( ) const

Definition at line 98 of file unichar.cpp.

98 {
99 static const int utf8_offsets[5] = {
100 0, 0, 0x3080, 0xE2080, 0x3C82080
101 };
102 int uni = 0;
103 int len = utf8_step(chars);
104 const char* src = chars;
105
106 switch (len) {
107 default:
108 break;
109 case 4:
110 uni += static_cast<unsigned char>(*src++);
111 uni <<= 6;
112 // Fall through.
113 case 3:
114 uni += static_cast<unsigned char>(*src++);
115 uni <<= 6;
116 // Fall through.
117 case 2:
118 uni += static_cast<unsigned char>(*src++);
119 uni <<= 6;
120 // Fall through.
121 case 1:
122 uni += static_cast<unsigned char>(*src++);
123 }
124 uni -= utf8_offsets[len];
125 return uni;
126}

◆ UTF32ToUTF8()

std::string tesseract::UNICHAR::UTF32ToUTF8 ( const std::vector< char32 > &  str32)
static

Definition at line 232 of file unichar.cpp.

232 {
233 std::string utf8_str;
234 for (char32 ch : str32) {
235 UNICHAR uni_ch(ch);
236 int step;
237 if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
238 utf8_str.append(uni_ch.utf8(), step);
239 } else {
240 return "";
241 }
242 }
243 return utf8_str;
244}
signed int char32

◆ utf8()

const char * tesseract::UNICHAR::utf8 ( ) const
inline

Definition at line 83 of file unichar.h.

83 {
84 return chars;
85 }

◆ utf8_len()

int tesseract::UNICHAR::utf8_len ( ) const
inline

Definition at line 77 of file unichar.h.

77 {
78 int len = chars[UNICHAR_LEN - 1];
79 return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
80 }

◆ utf8_step()

int tesseract::UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 138 of file unichar.cpp.

138 {
139 static const char utf8_bytes[256] = {
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
147 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
148 };
149
150 return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
151}

◆ utf8_str()

char * tesseract::UNICHAR::utf8_str ( ) const

Definition at line 129 of file unichar.cpp.

129 {
130 int len = utf8_len();
131 char* str = new char[len + 1];
132 memcpy(str, chars, len);
133 str[len] = 0;
134 return str;
135}
int utf8_len() const
Definition: unichar.h:77

◆ UTF8ToUTF32()

std::vector< char32 > tesseract::UNICHAR::UTF8ToUTF32 ( const char *  utf8_str)
static

Definition at line 215 of file unichar.cpp.

215 {
216 const int utf8_length = strlen(utf8_str);
217 std::vector<char32> unicodes;
218 unicodes.reserve(utf8_length);
219 const_iterator end_it(end(utf8_str, utf8_length));
220 for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
221 if (it.is_legal()) {
222 unicodes.push_back(*it);
223 } else {
224 unicodes.clear();
225 return unicodes;
226 }
227 }
228 return unicodes;
229}
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208

The documentation for this class was generated from the following files: