40 it->Orientation(&orientation, &writing_direction, &textline_order,
53static void AddBaselineCoordsTohOCR(
const PageIterator* it,
55 std::stringstream& hocr_str) {
58 hocr_str <<
"; textangle " << 360 - orientation * 90;
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2))
return;
82 double p1 = (y2 - y1) /
static_cast<double>(x2 - x1);
83 double p0 = y1 - p1 * x1;
85 hocr_str <<
"; baseline " << round(p1 * 1000.0) / 1000.0 <<
" "
86 << round(p0 * 1000.0) / 1000.0;
90 std::stringstream& hocr_str) {
91 int left, top, right, bottom;
92 it->BoundingBox(level, &left, &top, &right, &bottom);
95 hocr_str <<
" title=\"bbox " << left <<
" " << top <<
" " << right <<
" "
99 AddBaselineCoordsTohOCR(it, level, hocr_str);
101 float row_height, descenders, ascenders;
102 it->RowAttributes(&row_height, &descenders, &ascenders);
104 hocr_str <<
"; x_size " << row_height <<
"; x_descenders " << -descenders
105 <<
"; x_ascenders " << ascenders;
119char* TessBaseAPI::GetHOCRText(
int page_number) {
132char* TessBaseAPI::GetHOCRText(
ETEXT_DESC* monitor,
int page_number) {
136 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
137 int page_id = page_number + 1;
138 bool para_is_ltr =
true;
139 const char* paragraph_lang =
nullptr;
140 bool font_info =
false;
141 bool hocr_boxes =
false;
151 wchar_t* uni16_str =
new WCHAR[str16_len];
153 uni16_str, str16_len);
154 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
155 0,
nullptr,
nullptr);
156 char* utf8_str =
new char[utf8_len];
157 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
164 std::stringstream hocr_str;
166 hocr_str.imbue(std::locale::classic());
168 hocr_str.precision(8);
169 hocr_str <<
" <div class='ocr_page'";
171 <<
"page_" << page_id <<
"'";
172 hocr_str <<
" title='image \"";
176 hocr_str <<
"unknown";
182 std::unique_ptr<ResultIterator> res_it(
GetIterator());
190 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
192 hocr_str <<
" <div class='ocr_carea'"
194 <<
"block_" << page_id <<
"_" << bcnt <<
"'";
195 AddBoxTohOCR(res_it.get(),
RIL_BLOCK, hocr_str);
197 if (res_it->IsAtBeginningOf(
RIL_PARA)) {
198 hocr_str <<
"\n <p class='ocr_par'";
199 para_is_ltr = res_it->ParagraphIsLtr();
201 hocr_str <<
" dir='rtl'";
204 <<
"par_" << page_id <<
"_" << pcnt <<
"'";
205 paragraph_lang = res_it->WordRecognitionLanguage();
206 if (paragraph_lang) {
207 hocr_str <<
" lang='" << paragraph_lang <<
"'";
209 AddBoxTohOCR(res_it.get(),
RIL_PARA, hocr_str);
212 hocr_str <<
"\n <span class='";
213 switch (res_it->BlockType()) {
215 hocr_str <<
"ocr_header";
218 hocr_str <<
"ocr_textfloat";
221 hocr_str <<
"ocr_caption";
224 hocr_str <<
"ocr_line";
227 <<
"line_" << page_id <<
"_" << lcnt <<
"'";
232 std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
236 choiceMap = res_it->GetBestLSTMSymbolChoices();
238 hocr_str <<
"\n <span class='ocrx_word'"
240 <<
"word_" << page_id <<
"_" << wcnt <<
"'";
241 int left, top, right, bottom;
242 bool bold, italic, underlined, monospace, serif, smallcaps;
243 int pointsize, font_id;
244 const char* font_name;
245 res_it->BoundingBox(
RIL_WORD, &left, &top, &right, &bottom);
247 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
248 &serif, &smallcaps, &pointsize, &font_id);
249 hocr_str <<
" title='bbox " << left <<
" " << top <<
" " << right <<
" "
250 << bottom <<
"; x_wconf "
251 <<
static_cast<int>(res_it->Confidence(
RIL_WORD));
256 hocr_str <<
"; x_fsize " << pointsize;
259 const char* lang = res_it->WordRecognitionLanguage();
260 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
261 hocr_str <<
" lang='" << lang <<
"'";
263 switch (res_it->WordDirection()) {
266 if (!para_is_ltr) hocr_str <<
" dir='ltr'";
269 if (para_is_ltr) hocr_str <<
" dir='rtl'";
280 if (bold) hocr_str <<
"<strong>";
281 if (italic) hocr_str <<
"<em>";
283 const std::unique_ptr<const char[]> grapheme(
285 if (grapheme && grapheme[0] != 0) {
287 res_it->BoundingBox(
RIL_SYMBOL, &left, &top, &right, &bottom);
288 hocr_str <<
"\n <span class='ocrx_cinfo' title='x_bboxes "
289 << left <<
" " << top <<
" " << right <<
" " << bottom
290 <<
"; x_conf " << res_it->Confidence(
RIL_SYMBOL) <<
"'>";
294 hocr_str <<
"</span>";
299 if (italic) hocr_str <<
"</em>";
300 if (bold) hocr_str <<
"</strong>";
303 for (
auto timestep : *choiceMap) {
304 hocr_str <<
"\n <span class='ocrx_cinfo'"
306 <<
"timestep_" << page_id <<
"_" << wcnt <<
"_" << tcnt <<
"'"
308 for (std::pair<const char*, float> conf : timestep) {
309 hocr_str <<
"<span class='ocr_glyph'"
311 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << gcnt <<
"'"
312 <<
" title='x_confs " << int(conf.second * 100) <<
"'>"
313 << conf.first <<
"</span>";
316 hocr_str <<
"</span>";
320 for (
auto timestep : *choiceMap) {
321 if (timestep.size() > 0) {
322 hocr_str <<
"\n <span class='ocrx_cinfo'"
324 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
326 for (
auto & j : timestep) {
327 hocr_str <<
"<span class='ocr_glyph'"
329 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << gcnt
331 <<
" title='x_confs " << int(j.second * 100)
332 <<
"'>" << j.first <<
"</span>";
335 hocr_str <<
"</span>";
344 hocr_str <<
"</span>";
349 if (last_word_in_line) {
350 hocr_str <<
"\n </span>";
353 if (last_word_in_para) {
354 hocr_str <<
"\n </p>\n";
358 if (last_word_in_block) {
359 hocr_str <<
" </div>\n";
363 hocr_str <<
" </div>\n";
365 const std::string& text = hocr_str.str();
366 char* result =
new char[text.length() + 1];
367 strcpy(result, text.c_str());
374TessHOcrRenderer::TessHOcrRenderer(
const char* outputbase)
379TessHOcrRenderer::TessHOcrRenderer(
const char* outputbase,
bool font_info)
381 font_info_ = font_info;
384bool TessHOcrRenderer::BeginDocumentHandler() {
386 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
387 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
388 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
389 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
390 "lang=\"en\">\n <head>\n <title>");
394 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
395 "charset=utf-8\"/>\n"
396 " <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
398 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
399 " ocr_line ocrx_word ocrp_wconf");
400 if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
409bool TessHOcrRenderer::EndDocumentHandler() {
417 if (hocr ==
nullptr)
return false;
STRING HOcrEscape(const char *text)
int Recognize(ETEXT_DESC *monitor)
PAGE_RES * page_res_
The page-level data.
Tesseract * tesseract_
The underlying data object.
STRING * input_file_
Name used by training code.
ResultIterator * GetIterator()
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
bool GetBoolVariable(const char *name, bool *value) const
void AppendString(const char *s)
const char * title() const
const char * c_str() const
const char * string() const