tesseract 4.1.1
Loading...
Searching...
No Matches
hocrrenderer.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: hocrrenderer.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith (original code from baseapi.cpp)
5 * Author: Stefan Weil (moved to separate file and cleaned code)
6 *
7 * (C) Copyright 2006, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#include <locale> // for std::locale::classic
21#include <memory> // for std::unique_ptr
22#include <sstream> // for std::stringstream
23#include "baseapi.h" // for TessBaseAPI
24#ifdef _WIN32
25# include "host.h" // windows.h for MultiByteToWideChar, ...
26#endif
27#include "renderer.h"
28#include "tesseractclass.h" // for Tesseract
29
30namespace tesseract {
31
35static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
36 tesseract::Orientation orientation;
37 tesseract::WritingDirection writing_direction;
38 tesseract::TextlineOrder textline_order;
39 float deskew_angle;
40 it->Orientation(&orientation, &writing_direction, &textline_order,
41 &deskew_angle);
42 return orientation;
43}
44
53static void AddBaselineCoordsTohOCR(const PageIterator* it,
55 std::stringstream& hocr_str) {
56 tesseract::Orientation orientation = GetBlockTextOrientation(it);
57 if (orientation != ORIENTATION_PAGE_UP) {
58 hocr_str << "; textangle " << 360 - orientation * 90;
59 return;
60 }
61
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
64
65 // Try to get the baseline coordinates at this level.
66 int x1, y1, x2, y2;
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return;
68 // Following the description of this field of the hOCR spec, we convert the
69 // baseline coordinates so that "the bottom left of the bounding box is the
70 // origin".
71 x1 -= left;
72 x2 -= left;
73 y1 -= bottom;
74 y2 -= bottom;
75
76 // Now fit a line through the points so we can extract coefficients for the
77 // equation: y = p1 x + p0
78 if (x1 == x2) {
79 // Problem computing the polynomial coefficients.
80 return;
81 }
82 double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
83 double p0 = y1 - p1 * x1;
84
85 hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
86 << round(p0 * 1000.0) / 1000.0;
87}
88
89static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
90 std::stringstream& hocr_str) {
91 int left, top, right, bottom;
92 it->BoundingBox(level, &left, &top, &right, &bottom);
93 // This is the only place we use double quotes instead of single quotes,
94 // but it may too late to change for consistency
95 hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
96 << bottom;
97 // Add baseline coordinates & heights for textlines only.
98 if (level == RIL_TEXTLINE) {
99 AddBaselineCoordsTohOCR(it, level, hocr_str);
100 // add custom height measures
101 float row_height, descenders, ascenders; // row attributes
102 it->RowAttributes(&row_height, &descenders, &ascenders);
103 // TODO(rays): Do we want to limit these to a single decimal place?
104 hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
105 << "; x_ascenders " << ascenders;
106 }
107 hocr_str << "\">";
108}
109
119char* TessBaseAPI::GetHOCRText(int page_number) {
120 return GetHOCRText(nullptr, page_number);
121}
122
132char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
133 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
134 return nullptr;
135
136 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
137 int page_id = page_number + 1; // hOCR uses 1-based page numbers.
138 bool para_is_ltr = true; // Default direction is LTR
139 const char* paragraph_lang = nullptr;
140 bool font_info = false;
141 bool hocr_boxes = false;
142 GetBoolVariable("hocr_font_info", &font_info);
143 GetBoolVariable("hocr_char_boxes", &hocr_boxes);
144
145 if (input_file_ == nullptr) SetInputName(nullptr);
146
147#ifdef _WIN32
148 // convert input name from ANSI encoding to utf-8
149 int str16_len =
150 MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
151 wchar_t* uni16_str = new WCHAR[str16_len];
152 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
153 uni16_str, str16_len);
154 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
155 0, nullptr, nullptr);
156 char* utf8_str = new char[utf8_len];
157 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
158 nullptr, nullptr);
159 *input_file_ = utf8_str;
160 delete[] uni16_str;
161 delete[] utf8_str;
162#endif
163
164 std::stringstream hocr_str;
165 // Use "C" locale (needed for double values x_size and x_descenders).
166 hocr_str.imbue(std::locale::classic());
167 // Use 8 digits for double values.
168 hocr_str.precision(8);
169 hocr_str << " <div class='ocr_page'";
170 hocr_str << " id='"
171 << "page_" << page_id << "'";
172 hocr_str << " title='image \"";
173 if (input_file_) {
174 hocr_str << HOcrEscape(input_file_->string()).c_str();
175 } else {
176 hocr_str << "unknown";
177 }
178 hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
179 << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
180 << "'>\n";
181
182 std::unique_ptr<ResultIterator> res_it(GetIterator());
183 while (!res_it->Empty(RIL_BLOCK)) {
184 if (res_it->Empty(RIL_WORD)) {
185 res_it->Next(RIL_WORD);
186 continue;
187 }
188
189 // Open any new block/paragraph/textline.
190 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
191 para_is_ltr = true; // reset to default direction
192 hocr_str << " <div class='ocr_carea'"
193 << " id='"
194 << "block_" << page_id << "_" << bcnt << "'";
195 AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
196 }
197 if (res_it->IsAtBeginningOf(RIL_PARA)) {
198 hocr_str << "\n <p class='ocr_par'";
199 para_is_ltr = res_it->ParagraphIsLtr();
200 if (!para_is_ltr) {
201 hocr_str << " dir='rtl'";
202 }
203 hocr_str << " id='"
204 << "par_" << page_id << "_" << pcnt << "'";
205 paragraph_lang = res_it->WordRecognitionLanguage();
206 if (paragraph_lang) {
207 hocr_str << " lang='" << paragraph_lang << "'";
208 }
209 AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
210 }
211 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
212 hocr_str << "\n <span class='";
213 switch (res_it->BlockType()) {
214 case PT_HEADING_TEXT:
215 hocr_str << "ocr_header";
216 break;
217 case PT_PULLOUT_TEXT:
218 hocr_str << "ocr_textfloat";
219 break;
220 case PT_CAPTION_TEXT:
221 hocr_str << "ocr_caption";
222 break;
223 default:
224 hocr_str << "ocr_line";
225 }
226 hocr_str << "' id='"
227 << "line_" << page_id << "_" << lcnt << "'";
228 AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
229 }
230
231 // Now, process the word...
232 std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
233 nullptr;
235
236 choiceMap = res_it->GetBestLSTMSymbolChoices();
237 }
238 hocr_str << "\n <span class='ocrx_word'"
239 << " id='"
240 << "word_" << page_id << "_" << wcnt << "'";
241 int left, top, right, bottom;
242 bool bold, italic, underlined, monospace, serif, smallcaps;
243 int pointsize, font_id;
244 const char* font_name;
245 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
246 font_name =
247 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
248 &serif, &smallcaps, &pointsize, &font_id);
249 hocr_str << " title='bbox " << left << " " << top << " " << right << " "
250 << bottom << "; x_wconf "
251 << static_cast<int>(res_it->Confidence(RIL_WORD));
252 if (font_info) {
253 if (font_name) {
254 hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
255 }
256 hocr_str << "; x_fsize " << pointsize;
257 }
258 hocr_str << "'";
259 const char* lang = res_it->WordRecognitionLanguage();
260 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
261 hocr_str << " lang='" << lang << "'";
262 }
263 switch (res_it->WordDirection()) {
264 // Only emit direction if different from current paragraph direction
266 if (!para_is_ltr) hocr_str << " dir='ltr'";
267 break;
269 if (para_is_ltr) hocr_str << " dir='rtl'";
270 break;
271 case DIR_MIX:
272 case DIR_NEUTRAL:
273 default: // Do nothing.
274 break;
275 }
276 hocr_str << ">";
277 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
278 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
279 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
280 if (bold) hocr_str << "<strong>";
281 if (italic) hocr_str << "<em>";
282 do {
283 const std::unique_ptr<const char[]> grapheme(
284 res_it->GetUTF8Text(RIL_SYMBOL));
285 if (grapheme && grapheme[0] != 0) {
286 if (hocr_boxes) {
287 res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
288 hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
289 << left << " " << top << " " << right << " " << bottom
290 << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
291 }
292 hocr_str << HOcrEscape(grapheme.get()).c_str();
293 if (hocr_boxes) {
294 hocr_str << "</span>";
295 }
296 }
297 res_it->Next(RIL_SYMBOL);
298 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
299 if (italic) hocr_str << "</em>";
300 if (bold) hocr_str << "</strong>";
301 // If the lstm choice mode is required it is added here
302 if (tesseract_->lstm_choice_mode == 1 && choiceMap != nullptr) {
303 for (auto timestep : *choiceMap) {
304 hocr_str << "\n <span class='ocrx_cinfo'"
305 << " id='"
306 << "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
307 << ">";
308 for (std::pair<const char*, float> conf : timestep) {
309 hocr_str << "<span class='ocr_glyph'"
310 << " id='"
311 << "choice_" << page_id << "_" << wcnt << "_" << gcnt << "'"
312 << " title='x_confs " << int(conf.second * 100) << "'>"
313 << conf.first << "</span>";
314 gcnt++;
315 }
316 hocr_str << "</span>";
317 tcnt++;
318 }
319 } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) {
320 for (auto timestep : *choiceMap) {
321 if (timestep.size() > 0) {
322 hocr_str << "\n <span class='ocrx_cinfo'"
323 << " id='"
324 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
325 << "'>";
326 for (auto & j : timestep) {
327 hocr_str << "<span class='ocr_glyph'"
328 << " id='"
329 << "choice_" << page_id << "_" << wcnt << "_" << gcnt
330 << "'"
331 << " title='x_confs " << int(j.second * 100)
332 << "'>" << j.first << "</span>";
333 gcnt++;
334 }
335 hocr_str << "</span>";
336 tcnt++;
337 }
338 }
339 }
340 // Close ocrx_word.
341 if (hocr_boxes || tesseract_->lstm_choice_mode > 0) {
342 hocr_str << "\n ";
343 }
344 hocr_str << "</span>";
345 tcnt = 1;
346 gcnt = 1;
347 wcnt++;
348 // Close any ending block/paragraph/textline.
349 if (last_word_in_line) {
350 hocr_str << "\n </span>";
351 lcnt++;
352 }
353 if (last_word_in_para) {
354 hocr_str << "\n </p>\n";
355 pcnt++;
356 para_is_ltr = true; // back to default direction
357 }
358 if (last_word_in_block) {
359 hocr_str << " </div>\n";
360 bcnt++;
361 }
362 }
363 hocr_str << " </div>\n";
364
365 const std::string& text = hocr_str.str();
366 char* result = new char[text.length() + 1];
367 strcpy(result, text.c_str());
368 return result;
369}
370
371/**********************************************************************
372 * HOcr Text Renderer interface implementation
373 **********************************************************************/
374TessHOcrRenderer::TessHOcrRenderer(const char* outputbase)
375 : TessResultRenderer(outputbase, "hocr") {
376 font_info_ = false;
377}
378
379TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info)
380 : TessResultRenderer(outputbase, "hocr") {
381 font_info_ = font_info;
382}
383
384bool TessHOcrRenderer::BeginDocumentHandler() {
386 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
387 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
388 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
389 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
390 "lang=\"en\">\n <head>\n <title>");
393 "</title>\n"
394 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
395 "charset=utf-8\"/>\n"
396 " <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
397 "' />\n"
398 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
399 " ocr_line ocrx_word ocrp_wconf");
400 if (font_info_) AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
402 "'/>\n"
403 " </head>\n"
404 " <body>\n");
405
406 return true;
407}
408
409bool TessHOcrRenderer::EndDocumentHandler() {
410 AppendString(" </body>\n</html>\n");
411
412 return true;
413}
414
415bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
416 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
417 if (hocr == nullptr) return false;
418
419 AppendString(hocr.get());
420
421 return true;
422}
423
424} // namespace tesseract
@ PT_PULLOUT_TEXT
Definition: capi.h:132
@ PT_HEADING_TEXT
Definition: capi.h:131
@ PT_CAPTION_TEXT
Definition: capi.h:137
@ DIR_MIX
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_NEUTRAL
Definition: unichar.h:42
@ ORIENTATION_PAGE_UP
Definition: publictypes.h:121
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2310
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:830
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
ResultIterator * GetIterator()
Definition: baseapi.cpp:1324
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:271
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:306
void AppendString(const char *s)
Definition: renderer.cpp:102
const char * title() const
Definition: renderer.h:88
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194