tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::StringRenderer Class Reference

#include <stringrenderer.h>

Public Member Functions

 StringRenderer (const std::string &font_desc, int page_width, int page_height)
 
 ~StringRenderer ()
 
int RenderToImage (const char *text, int text_length, Pix **pix)
 
int RenderToGrayscaleImage (const char *text, int text_length, Pix **pix)
 
int RenderToBinaryImage (const char *text, int text_length, int threshold, Pix **pix)
 
int RenderAllFontsToImage (double min_coverage, const char *text, int text_length, std::string *font_used, Pix **pix)
 
bool set_font (const std::string &desc)
 
void set_char_spacing (int char_spacing)
 
void set_leading (int leading)
 
void set_resolution (const int resolution)
 
void set_vertical_text (bool vertical_text)
 
void set_gravity_hint_strong (bool gravity_hint_strong)
 
void set_render_fullwidth_latin (bool render_fullwidth_latin)
 
void set_underline_start_prob (const double frac)
 
void set_underline_continuation_prob (const double frac)
 
void set_underline_style (const PangoUnderline style)
 
void set_features (const char *features)
 
void set_page (int page)
 
void set_box_padding (int val)
 
void set_drop_uncovered_chars (bool val)
 
void set_strip_unrenderable_words (bool val)
 
void set_output_word_boxes (bool val)
 
void set_add_ligatures (bool add_ligatures)
 
void set_pen_color (double r, double g, double b)
 
void set_h_margin (const int h_margin)
 
void set_v_margin (const int v_margin)
 
const PangoFontInfo & font () const
 
int h_margin () const
 
int v_margin () const
 
const std::vector< BoxChar * > & GetBoxes () const
 
Boxa * GetPageBoxes () const
 
void RotatePageBoxes (float rotation)
 
void ClearBoxes ()
 
std::string GetBoxesStr ()
 
void WriteAllBoxes (const std::string &filename)
 
int StripUnrenderableWords (std::string *utf8_text) const
 

Static Public Member Functions

static std::string InsertWordJoiners (const std::string &text)
 
static std::string ConvertBasicLatinToFullwidthLatin (const std::string &text)
 
static std::string ConvertFullwidthLatinToBasicLatin (const std::string &text)
 

Protected Member Functions

void InitPangoCairo ()
 
void FreePangoCairo ()
 
void SetLayoutProperties ()
 
void SetWordUnderlineAttributes (const std::string &page_text)
 
void ComputeClusterBoxes ()
 
void CorrectBoxPositionsToLayout (std::vector< BoxChar * > *boxchars)
 
bool GetClusterStrings (std::vector< std::string > *cluster_text)
 
int FindFirstPageBreakOffset (const char *text, int text_length)
 

Protected Attributes

PangoFontInfo font_
 
int page_width_
 
int page_height_
 
int h_margin_
 
int v_margin_
 
double pen_color_ [3]
 
int char_spacing_
 
int leading_
 
int resolution_
 
bool vertical_text_
 
bool gravity_hint_strong_
 
bool render_fullwidth_latin_
 
double underline_start_prob_
 
double underline_continuation_prob_
 
PangoUnderline underline_style_
 
char * features_
 
bool drop_uncovered_chars_
 
bool strip_unrenderable_words_
 
bool add_ligatures_
 
bool output_word_boxes_
 
cairo_surface_t * surface_
 
cairo_t * cr_
 
PangoLayout * layout_
 
int start_box_
 
int page_
 
std::vector< BoxChar * > boxchars_
 
int box_padding_
 
Boxa * page_boxes_
 
std::unordered_map< char32, int64_t > char_map_
 
int total_chars_
 
unsigned int font_index_
 
int last_offset_
 

Detailed Description

Definition at line 50 of file stringrenderer.h.

Constructor & Destructor Documentation

◆ StringRenderer()

tesseract::StringRenderer::StringRenderer ( const std::string &  font_desc,
int  page_width,
int  page_height 
)

Definition at line 90 of file stringrenderer.cpp.

92 : font_(font_desc),
93 page_width_(page_width),
94 page_height_(page_height),
95 h_margin_(50),
96 v_margin_(50),
97 pen_color_{0.0, 0.0, 0.0},
99 leading_(0),
100 vertical_text_(false),
105 underline_style_(PANGO_UNDERLINE_SINGLE),
106 features_(nullptr),
109 add_ligatures_(false),
110 output_word_boxes_(false),
111 surface_(nullptr),
112 cr_(nullptr),
113 layout_(nullptr),
114 start_box_(0),
115 page_(0),
116 box_padding_(0),
117 page_boxes_(nullptr),
118 total_chars_(0),
119 font_index_(0),
120 last_offset_(0) {
121 set_resolution(kDefaultOutputResolution);
122 set_font(font_desc);
123}
bool set_font(const std::string &desc)
void set_resolution(const int resolution)
cairo_surface_t * surface_
PangoUnderline underline_style_

◆ ~StringRenderer()

tesseract::StringRenderer::~StringRenderer ( )

Definition at line 144 of file stringrenderer.cpp.

144 {
145 free(features_);
146 ClearBoxes();
148}

Member Function Documentation

◆ ClearBoxes()

void tesseract::StringRenderer::ClearBoxes ( )

Definition at line 337 of file stringrenderer.cpp.

337 {
338 for (size_t i = 0; i < boxchars_.size(); ++i) delete boxchars_[i];
339 boxchars_.clear();
340 boxaDestroy(&page_boxes_);
341}
std::vector< BoxChar * > boxchars_

◆ ComputeClusterBoxes()

void tesseract::StringRenderer::ComputeClusterBoxes ( )
protected

Definition at line 460 of file stringrenderer.cpp.

460 {
461 const char* text = pango_layout_get_text(layout_);
462 PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
463
464 // Do a first pass to store cluster start indexes.
465 std::vector<int> cluster_start_indices;
466 do {
467 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
468 tlog(3, "Added %d\n", cluster_start_indices.back());
469 } while (pango_layout_iter_next_cluster(cluster_iter));
470 pango_layout_iter_free(cluster_iter);
471 cluster_start_indices.push_back(strlen(text));
472 tlog(3, "Added last index %d\n", cluster_start_indices.back());
473 // Sort the indices and create a map from start to end indices.
474 std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
475 std::map<int, int> cluster_start_to_end_index;
476 for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
477 cluster_start_to_end_index[cluster_start_indices[i]]
478 = cluster_start_indices[i + 1];
479 }
480
481 // Iterate again to compute cluster boxes and their text with the obtained
482 // cluster extent information.
483 cluster_iter = pango_layout_get_iter(layout_);
484 // Store BoxChars* sorted by their byte start positions
485 std::map<int, BoxChar*> start_byte_to_box;
486 do {
487 PangoRectangle cluster_rect;
488 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
489 pango_extents_to_pixels(&cluster_rect, nullptr);
490 const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491 const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492 std::string cluster_text = std::string(text + start_byte_index,
493 end_byte_index - start_byte_index);
494 if (!cluster_text.empty() && cluster_text[0] == '\n') {
495 tlog(2, "Skipping newlines at start of text.\n");
496 continue;
497 }
498 if (!cluster_rect.width || !cluster_rect.height ||
499 IsUTF8Whitespace(cluster_text.c_str())) {
500 tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
501 cluster_rect.width, cluster_rect.height, cluster_text.c_str());
502 BoxChar* boxchar = new BoxChar(" ", 1);
503 boxchar->set_page(page_);
504 start_byte_to_box[start_byte_index] = boxchar;
505 continue;
506 }
507 // Prepare a boxchar for addition at this byte position.
508 tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
509 cluster_rect.x, cluster_rect.y,
510 cluster_rect.width, cluster_rect.height,
511 start_byte_index, end_byte_index,
512 cluster_text.c_str());
513 ASSERT_HOST_MSG(cluster_rect.width,
514 "cluster_text:%s start_byte_index:%d\n",
515 cluster_text.c_str(), start_byte_index);
516 ASSERT_HOST_MSG(cluster_rect.height,
517 "cluster_text:%s start_byte_index:%d\n",
518 cluster_text.c_str(), start_byte_index);
519 if (box_padding_) {
520 cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
521 cluster_rect.width += 2 * box_padding_;
522 cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
523 cluster_rect.height += 2 * box_padding_;
524 }
525 if (add_ligatures_) {
526 // Make sure the output box files have ligatured text in case the font
527 // decided to use an unmapped glyph.
528 cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
529 }
530 BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
531 boxchar->set_page(page_);
532 boxchar->AddBox(cluster_rect.x, cluster_rect.y,
533 cluster_rect.width, cluster_rect.height);
534 start_byte_to_box[start_byte_index] = boxchar;
535 } while (pango_layout_iter_next_cluster(cluster_iter));
536 pango_layout_iter_free(cluster_iter);
537
538 // There is a subtle bug in the cluster text reported by the PangoLayoutIter
539 // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
540 // around this, we use text reported using the PangoGlyphIter which is
541 // accurate.
542 // TODO(ranjith): Revisit whether this is still needed in newer versions of
543 // pango.
544 std::vector<std::string> cluster_text;
545 if (GetClusterStrings(&cluster_text)) {
546 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
547 int ind = 0;
548 for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
549 it != start_byte_to_box.end(); ++it, ++ind) {
550 it->second->mutable_ch()->swap(cluster_text[ind]);
551 }
552 }
553
554 // Append to the boxchars list in byte order.
555 std::vector<BoxChar*> page_boxchars;
556 page_boxchars.reserve(start_byte_to_box.size());
557 std::string last_ch;
558 for (std::map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
559 it != start_byte_to_box.end(); ++it) {
560 if (it->second->ch() == kWordJoinerUTF8) {
561 // Skip zero-width joiner characters (ZWJs) here.
562 delete it->second;
563 } else {
564 page_boxchars.push_back(it->second);
565 }
566 }
567 CorrectBoxPositionsToLayout(&page_boxchars);
568
569 if (render_fullwidth_latin_) {
570 for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
571 it != start_byte_to_box.end(); ++it) {
572 // Convert fullwidth Latin characters to their halfwidth forms.
573 std::string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
574 it->second->mutable_ch()->swap(half);
575 }
576 }
577
578 // Merge the character boxes into word boxes if we are rendering n-grams.
579 if (output_word_boxes_) {
580 MergeBoxCharsToWords(&page_boxchars);
581 }
582
583 boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
584
585 // Compute the page bounding box
586 Box* page_box = nullptr;
587 Boxa* all_boxes = nullptr;
588 for (size_t i = 0; i < page_boxchars.size(); ++i) {
589 if (page_boxchars[i]->box() == nullptr) continue;
590 if (all_boxes == nullptr) all_boxes = boxaCreate(0);
591 boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
592 }
593 if (all_boxes != nullptr) {
594 boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
595 boxaDestroy(&all_boxes);
596 if (page_boxes_ == nullptr) page_boxes_ = boxaCreate(0);
597 boxaAddBox(page_boxes_, page_box, L_INSERT);
598 }
599}
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:92
#define tlog(level,...)
Definition: tlog.h:33
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:229
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static LigatureTable * Get()
bool GetClusterStrings(std::vector< std::string > *cluster_text)
static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text)
void CorrectBoxPositionsToLayout(std::vector< BoxChar * > *boxchars)

◆ ConvertBasicLatinToFullwidthLatin()

std::string tesseract::StringRenderer::ConvertBasicLatinToFullwidthLatin ( const std::string &  text)
static

Definition at line 694 of file stringrenderer.cpp.

694 {
695 std::string full_str;
696 const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
697 str.length());
698 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
699 it != it_end; ++it) {
700 // Convert printable and non-space 7-bit ASCII characters to
701 // their fullwidth forms.
702 if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
703 // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
704 char32 full_char = *it + 0xFEE0;
705 full_str.append(EncodeAsUTF8(full_char));
706 } else {
707 full_str.append(it.utf8_data(), it.utf8_len());
708 }
709 }
710 return full_str;
711}
signed int char32
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208

◆ ConvertFullwidthLatinToBasicLatin()

std::string tesseract::StringRenderer::ConvertFullwidthLatinToBasicLatin ( const std::string &  text)
static

Definition at line 714 of file stringrenderer.cpp.

714 {
715 std::string half_str;
716 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
717 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
718 it != it_end; ++it) {
719 char32 half_char = FullwidthToHalfwidth(*it);
720 // Convert fullwidth Latin characters to their halfwidth forms
721 // only if halfwidth forms are printable and non-space 7-bit ASCII.
722 if (IsInterchangeValid7BitAscii(half_char) &&
723 isprint(half_char) && !isspace(half_char)) {
724 half_str.append(EncodeAsUTF8(half_char));
725 } else {
726 half_str.append(it.utf8_data(), it.utf8_len());
727 }
728 }
729 return half_str;
730}
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282

◆ CorrectBoxPositionsToLayout()

void tesseract::StringRenderer::CorrectBoxPositionsToLayout ( std::vector< BoxChar * > *  boxchars)
protected

Definition at line 602 of file stringrenderer.cpp.

603 {
604 if (vertical_text_) {
605 const double rotation = - pango_gravity_to_rotation(
606 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
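607 BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
608 BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_,
609 0, boxchars->size(), boxchars);
610 } else {
611 BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
612 }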
613}
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:299

◆ FindFirstPageBreakOffset()

int tesseract::StringRenderer::FindFirstPageBreakOffset ( const char *  text,
int  text_length 
)
protected

Definition at line 279 of file stringrenderer.cpp.

280 {
281 if (!text_length) return 0;
282 const int max_height = (page_height_ - 2 * v_margin_);
283 const int max_width = (page_width_ - 2 * h_margin_);
284 const int max_layout_height = vertical_text_ ? max_width : max_height;
285
286 UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
287 const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
288 const int kMaxUnicodeBufLength = 15000;
289 for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
290 int buf_length = it.utf8_data() - text;
291 tlog(1, "len = %d buf_len = %d\n", text_length, buf_length);
292 pango_layout_set_text(layout_, text, buf_length);
293
294 PangoLayoutIter* line_iter = nullptr;
295 { // Fontconfig caches some info here that is not freed before exit.
296 DISABLE_HEAP_LEAK_CHECK;
297 line_iter = pango_layout_get_iter(layout_);
298 }
299 bool first_page = true;
300 int page_top = 0;
301 int offset = buf_length;
302 do {
303 // Get bounding box of the current line
304 PangoRectangle line_ink_rect;
305 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);
306 pango_extents_to_pixels(&line_ink_rect, nullptr);
307 PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
308 if (first_page) {
309 page_top = line_ink_rect.y;
310 first_page = false;
311 }
312 int line_bottom = line_ink_rect.y + line_ink_rect.height;
313 if (line_bottom - page_top > max_layout_height) {
314 offset = line->start_index;
315 tlog(1, "Found offset = %d\n", offset);
316 break;
317 }
318 } while (pango_layout_iter_next_line(line_iter));
319 pango_layout_iter_free(line_iter);
320 return offset;
321}
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:61

◆ font()

const PangoFontInfo & tesseract::StringRenderer::font ( ) const
inline

Definition at line 131 of file stringrenderer.h.

131 {
132 return font_;
133 }

◆ FreePangoCairo()

void tesseract::StringRenderer::FreePangoCairo ( )
protected

Definition at line 218 of file stringrenderer.cpp.

218 {
219 if (layout_) {
220 g_object_unref(layout_);
221 layout_ = nullptr;
222 }
223 if (cr_) {
224 cairo_destroy(cr_);
225 cr_ = nullptr;
226 }
227 if (surface_) {
228 cairo_surface_destroy(surface_);
229 surface_ = nullptr;
230 }
231}

◆ GetBoxes()

const std::vector< BoxChar * > & tesseract::StringRenderer::GetBoxes ( ) const

Definition at line 323 of file stringrenderer.cpp.

323 {
324 return boxchars_;
325}

◆ GetBoxesStr()

std::string tesseract::StringRenderer::GetBoxesStr ( )

Definition at line 343 of file stringrenderer.cpp.

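343 {
344 BoxChar::PrepareToWrite(&boxchars_);
345 return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
346}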
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:97
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:327

◆ GetClusterStrings()

bool tesseract::StringRenderer::GetClusterStrings ( std::vector< std::string > *  cluster_text)
protected

Definition at line 354 of file stringrenderer.cpp.

354 {
355 std::map<int, std::string> start_byte_to_text;
356 PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
357 const char* full_text = pango_layout_get_text(layout_);
358 do {
359 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
360 if (!run) {
361 // End of line nullptr run marker
362 tlog(2, "Found end of line marker\n");
363 continue;
364 }
365 PangoGlyphItemIter cluster_iter;
366 gboolean have_cluster;
367 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
368 run, full_text);
369 have_cluster;
370 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
371 const int start_byte_index = cluster_iter.start_index;
372 const int end_byte_index = cluster_iter.end_index;
373 std::string text = std::string(full_text + start_byte_index,
374 end_byte_index - start_byte_index);
375 if (IsUTF8Whitespace(text.c_str())) {
376 tlog(2, "Found whitespace\n");
377 text = " ";
378 }
379 tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
380 end_byte_index, text.c_str());
381 if (add_ligatures_) {
382 // Make sure the output box files have ligatured text in case the font
383 // decided to use an unmapped glyph.
384 text = LigatureTable::Get()->AddLigatures(text, nullptr);
385 }
386 start_byte_to_text[start_byte_index] = text;
387 }
388 } while (pango_layout_iter_next_run(run_iter));
389 pango_layout_iter_free(run_iter);
390
391 cluster_text->clear();
392 for (std::map<int, std::string>::const_iterator it = start_byte_to_text.begin();
393 it != start_byte_to_text.end(); ++it) {
394 cluster_text->push_back(it->second);
395 }
396 return !cluster_text->empty();
397}

◆ GetPageBoxes()

Boxa * tesseract::StringRenderer::GetPageBoxes ( ) const

Definition at line 327 of file stringrenderer.cpp.

327 {
328 return page_boxes_;
329}

◆ h_margin()

int tesseract::StringRenderer::h_margin ( ) const
inline

Definition at line 134 of file stringrenderer.h.

134{ return h_margin_; }

◆ InitPangoCairo()

void tesseract::StringRenderer::InitPangoCairo ( )
protected

Definition at line 150 of file stringrenderer.cpp.

150 {
152 surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
153 page_height_);
154 cr_ = cairo_create(surface_);
155 {
157 layout_ = pango_cairo_create_layout(cr_);
158 }
159
160 if (vertical_text_) {
161 PangoContext* context = pango_layout_get_context(layout_);
162 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
163 if (gravity_hint_strong_) {
164 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
165 }
166 pango_layout_context_changed(layout_);
167 }
168
170}

◆ InsertWordJoiners()

std::string tesseract::StringRenderer::InsertWordJoiners ( const std::string &  text)
static

Definition at line 671 of file stringrenderer.cpp.

671 {
672 std::string out_str;
673 const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
674 text.length());
675 for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
676 it != it_end; ++it) {
677 // Add the symbol to the output string.
678 out_str.append(it.utf8_data(), it.utf8_len());
679 // Check the next symbol.
680 UNICHAR::const_iterator next_it = it;
681 ++next_it;
682 bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
683 bool next_char_is_combiner = (next_it == it_end) ?
684 false : IsCombiner(*next_it);
685 if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
686 !next_char_is_combiner) {
687 out_str += kWordJoinerUTF8;
688 }
689 }
690 return out_str;
691}

◆ RenderAllFontsToImage()

int tesseract::StringRenderer::RenderAllFontsToImage ( double  min_coverage,
const char *  text,
int  text_length,
std::string *  font_used,
Pix **  pix 
)

Definition at line 834 of file stringrenderer.cpp.

836 {
837 *image = nullptr;
838 // Select a suitable font to render the title with.
839 const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
840 std::string title_font;
841 if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
842 &title_font, nullptr)) {
843 tprintf("WARNING: Could not find a font to render image title with!\n");
844 title_font = "Arial";
845 }
846 title_font += " 8";
847 tlog(1, "Selected title font: %s\n", title_font.c_str());
848 if (font_used) font_used->clear();
849
850 std::string orig_font = font_.DescriptionName();
851 if (char_map_.empty()) {
852 total_chars_ = 0;
853 // Fill the hash table and use that for computing which fonts to use.
854 for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
855 it != UNICHAR::end(text, text_length); ++it) {
856 ++total_chars_;
857 ++char_map_[*it];
858 }
859 tprintf("Total chars = %d\n", total_chars_);
860 }
861 const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts();
862
863 for (size_t i = font_index_; i < all_fonts.size(); ++i) {
864 ++font_index_;
865 int raw_score = 0;
866 int ok_chars =
867 FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
868 if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
869 set_font(all_fonts[i]);
870 int offset = RenderToBinaryImage(text, text_length, 128, image);
871 ClearBoxes(); // Get rid of them as they are garbage.
872 const int kMaxTitleLength = 1024;
873 char title[kMaxTitleLength];
874 snprintf(title, kMaxTitleLength, kTitleTemplate,
875 all_fonts[i].c_str(), ok_chars,
876 100.0 * ok_chars / total_chars_, raw_score,
877 100.0 * raw_score / char_map_.size());
878 tprintf("%s\n", title);
879 // This is a good font! Store the offset to return once we've tried all
880 // the fonts.
881 if (offset) {
882 last_offset_ = offset;
883 if (font_used) *font_used = all_fonts[i];
884 }
885 // Add the font to the image.
886 set_font(title_font);
887 v_margin_ /= 8;
888 Pix* title_image = nullptr;
889 RenderToBinaryImage(title, strlen(title), 128, &title_image);
890 pixOr(*image, *image, title_image);
891 pixDestroy(&title_image);
892
893 v_margin_ *= 8;
894 set_font(orig_font);
895 // We return the real offset only after cycling through the list of fonts.
896 return 0;
897 } else {
898 tprintf("Font %s failed with %d hits = %.2f%%\n",
899 all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
900 }
901 }
902 font_index_ = 0;
903 char_map_.clear();
904 return last_offset_ == 0 ? -1 : last_offset_;
905}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
std::string DescriptionName() const
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
static const std::vector< std::string > & ListAvailableFonts()
int RenderToBinaryImage(const char *text, int text_length, int threshold, Pix **pix)
std::unordered_map< char32, int64_t > char_map_

◆ RenderToBinaryImage()

int tesseract::StringRenderer::RenderToBinaryImage ( const char *  text,
int  text_length,
int  threshold,
Pix **  pix 
)

Definition at line 653 of file stringrenderer.cpp.

654 {
655 Pix* orig_pix = nullptr;
656 int offset = RenderToImage(text, text_length, &orig_pix);
657 if (orig_pix) {
658 Pix* gray_pix = pixConvertTo8(orig_pix, false);
659 pixDestroy(&orig_pix);
660 *pix = pixThresholdToBinary(gray_pix, threshold);
661 pixDestroy(&gray_pix);
662 } else {
663 *pix = orig_pix;
664 }
665 return offset;
666}
int RenderToImage(const char *text, int text_length, Pix **pix)

◆ RenderToGrayscaleImage()

int tesseract::StringRenderer::RenderToGrayscaleImage ( const char *  text,
int  text_length,
Pix **  pix 
)

Definition at line 642 of file stringrenderer.cpp.

643 {
644 Pix* orig_pix = nullptr;
645 int offset = RenderToImage(text, text_length, &orig_pix);
646 if (orig_pix) {
647 *pix = pixConvertTo8(orig_pix, false);
648 pixDestroy(&orig_pix);
649 }
650 return offset;
651}

◆ RenderToImage()

int tesseract::StringRenderer::RenderToImage ( const char *  text,
int  text_length,
Pix **  pix 
)

Definition at line 733 of file stringrenderer.cpp.

734 {
735 if (pix && *pix) pixDestroy(pix);
737
738 const int page_offset = FindFirstPageBreakOffset(text, text_length);
739 if (!page_offset) {
740 return 0;
741 }
742 start_box_ = boxchars_.size();
743
744 if (!vertical_text_) {
745 // Translate by the specified margin
746 cairo_translate(cr_, h_margin_, v_margin_);
747 } else {
748 // Vertical text rendering is achieved by a two-step process of first
749 // performing regular horizontal layout with character orientation set to
750 // EAST, and then translating and rotating the layout before rendering onto
751 // the desired image surface. The settings required for the former step are
752 // done within InitPangoCairo().
753 //
754 // Translate to the top-right margin of page
755 cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
756 // Rotate the layout
757 double rotation = - pango_gravity_to_rotation(
758 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
759 tlog(2, "Rotating by %f radians\n", rotation);
760 cairo_rotate(cr_, rotation);
761 pango_cairo_update_layout(cr_, layout_);
762 }
763 std::string page_text(text, page_offset);
764 if (render_fullwidth_latin_) {
765 // Convert Basic Latin to their fullwidth forms.
766 page_text = ConvertBasicLatinToFullwidthLatin(page_text);
767 }
768 if (strip_unrenderable_words_) {
769 StripUnrenderableWords(&page_text);
770 }
771 if (drop_uncovered_chars_ &&
772 !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
773 int num_dropped = font_.DropUncoveredChars(&page_text);
774 if (num_dropped) {
775 tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
776 }
777 }
778 if (add_ligatures_) {
779 // Add ligatures wherever possible, including custom ligatures.
780 page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
781 }
782 if (underline_start_prob_ > 0) {
783 SetWordUnderlineAttributes(page_text);
784 }
785
786 pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
787
788 if (pix) {
789 // Set a white background for the target image surface.
790 cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
791 // Fill the surface with the active colour (if you don't do this, you will
792 // be given a surface with a transparent background to draw on)
793 cairo_paint(cr_);
794 // Set the ink color to black
795 cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
796 // If the target surface or transformation properties of the cairo instance
797 // have changed, update the pango layout to reflect this
798 pango_cairo_update_layout(cr_, layout_);
799 {
800 DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
801 // Draw the pango layout onto the cairo surface
802 pango_cairo_show_layout(cr_, layout_);
803 }
804 *pix = CairoARGB32ToPixFormat(surface_);
805 }
808 // Update internal state variables.
809 ++page_;
810 return page_offset;
811}
int DropUncoveredChars(std::string *utf8_text) const
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
void SetWordUnderlineAttributes(const std::string &page_text)
static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text)
int StripUnrenderableWords(std::string *utf8_text) const
int FindFirstPageBreakOffset(const char *text, int text_length)

◆ RotatePageBoxes()

void tesseract::StringRenderer::RotatePageBoxes ( float  rotation)

Definition at line 331 of file stringrenderer.cpp.

331 {
333 start_box_, boxchars_.size(), &boxchars_);
334}

◆ set_add_ligatures()

void tesseract::StringRenderer::set_add_ligatures ( bool  add_ligatures)
inline

Definition at line 116 of file stringrenderer.h.

116 {
117 add_ligatures_ = add_ligatures;
118 }

◆ set_box_padding()

void tesseract::StringRenderer::set_box_padding ( int  val)
inline

Definition at line 101 of file stringrenderer.h.

101 {
102 box_padding_ = val;
103 }

◆ set_char_spacing()

void tesseract::StringRenderer::set_char_spacing ( int  char_spacing)
inline

Definition at line 70 of file stringrenderer.h.

70{ char_spacing_ = char_spacing; }

◆ set_drop_uncovered_chars()

void tesseract::StringRenderer::set_drop_uncovered_chars ( bool  val)
inline

Definition at line 104 of file stringrenderer.h.

104 {
105 drop_uncovered_chars_ = val;
106 }

◆ set_features()

void tesseract::StringRenderer::set_features ( const char *  features)
inline

Definition at line 94 of file stringrenderer.h.

94 {
95 free(features_);
96 features_ = strdup(features);
97 }

◆ set_font()

bool tesseract::StringRenderer::set_font ( const std::string &  desc)

Definition at line 125 of file stringrenderer.cpp.

125 {
126 bool success = font_.ParseFontDescriptionName(desc);
127 set_resolution(resolution_);
128 return success;
129}
void set_resolution(const int resolution)
bool ParseFontDescriptionName(const std::string &name)

◆ set_gravity_hint_strong()

void tesseract::StringRenderer::set_gravity_hint_strong ( bool  gravity_hint_strong)
inline

Definition at line 78 of file stringrenderer.h.

78 {
79 gravity_hint_strong_ = gravity_hint_strong;
80 }

◆ set_h_margin()

void tesseract::StringRenderer::set_h_margin ( const int  h_margin)
inline

Definition at line 125 of file stringrenderer.h.

125 {
126 h_margin_ = h_margin;
127 }

◆ set_leading()

void tesseract::StringRenderer::set_leading ( int  leading)
inline

Definition at line 71 of file stringrenderer.h.

71 {
72 leading_ = leading;
73 }

◆ set_output_word_boxes()

void tesseract::StringRenderer::set_output_word_boxes ( bool  val)
inline

Definition at line 110 of file stringrenderer.h.

110 {
111 output_word_boxes_ = val;
112 }

◆ set_page()

void tesseract::StringRenderer::set_page ( int  page)
inline

Definition at line 98 of file stringrenderer.h.

98 {
99 page_ = page;
100 }

◆ set_pen_color()

void tesseract::StringRenderer::set_pen_color ( double  r,
double  g,
double  b 
)
inline

Definition at line 120 of file stringrenderer.h.

120 {
121 pen_color_[0] = r;
122 pen_color_[1] = g;
123 pen_color_[2] = b;
124 }

◆ set_render_fullwidth_latin()

void tesseract::StringRenderer::set_render_fullwidth_latin ( bool  render_fullwidth_latin)
inline

Definition at line 81 of file stringrenderer.h.

81 {
82 render_fullwidth_latin_ = render_fullwidth_latin;
83 }

◆ set_resolution()

void tesseract::StringRenderer::set_resolution ( const int  resolution)

Definition at line 131 of file stringrenderer.cpp.

131 {
132 resolution_ = resolution;
133 font_.set_resolution(resolution);
134}

◆ set_strip_unrenderable_words()

void tesseract::StringRenderer::set_strip_unrenderable_words ( bool  val)
inline

Definition at line 107 of file stringrenderer.h.

107 {
108 strip_unrenderable_words_ = val;
109 }

◆ set_underline_continuation_prob()

void tesseract::StringRenderer::set_underline_continuation_prob ( const double  frac)

Definition at line 140 of file stringrenderer.cpp.

140 {
141 underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
142}

◆ set_underline_start_prob()

void tesseract::StringRenderer::set_underline_start_prob ( const double  frac)

Definition at line 136 of file stringrenderer.cpp.

136 {
137 underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
138}

◆ set_underline_style()

void tesseract::StringRenderer::set_underline_style ( const PangoUnderline  style)
inline

Definition at line 91 of file stringrenderer.h.

91 {
92 underline_style_ = style;
93 }

◆ set_v_margin()

void tesseract::StringRenderer::set_v_margin ( const int  v_margin)
inline

Definition at line 128 of file stringrenderer.h.

128 {
129 v_margin_ = v_margin;
130 }

◆ set_vertical_text()

void tesseract::StringRenderer::set_vertical_text ( bool  vertical_text)
inline

Definition at line 75 of file stringrenderer.h.

75 {
76 vertical_text_ = vertical_text;
77 }

◆ SetLayoutProperties()

void tesseract::StringRenderer::SetLayoutProperties ( )
protected

Definition at line 172 of file stringrenderer.cpp.

172 {
173 std::string font_desc = font_.DescriptionName();
174 // Specify the font via a description name
175 PangoFontDescription *desc =
176 pango_font_description_from_string(font_desc.c_str());
177 // Assign the font description to the layout
178 pango_layout_set_font_description(layout_, desc);
179 pango_font_description_free(desc); // free the description
180 pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
181 resolution_);
182
183 int max_width = page_width_ - 2 * h_margin_;
184 int max_height = page_height_ - 2 * v_margin_;
185 tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
186 if (vertical_text_) {
187 using std::swap;
188 swap(max_width, max_height);
189 }
190 pango_layout_set_width(layout_, max_width * PANGO_SCALE);
191 // Ultra-wide Thai strings need to wrap at char level.
192 pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
193
194 // Adjust character spacing
195 PangoAttrList* attr_list = pango_attr_list_new();
196 if (char_spacing_) {
197 PangoAttribute* spacing_attr =
198 pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
199 spacing_attr->start_index = 0;
200 spacing_attr->end_index = static_cast<guint>(-1);
201 pango_attr_list_change(attr_list, spacing_attr);
202 }
203#if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38)
204 if (add_ligatures_) {
205 set_features("liga, clig, dlig, hlig");
206 PangoAttribute* feature_attr = pango_attr_font_features_new(features_);
207 pango_attr_list_change(attr_list, feature_attr);
208 }
209#endif
210 pango_layout_set_attributes(layout_, attr_list);
211 pango_attr_list_unref(attr_list);
212 // Adjust line spacing
213 if (leading_) {
214 pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
215 }
216}
void set_features(const char *features)

◆ SetWordUnderlineAttributes()

void tesseract::StringRenderer::SetWordUnderlineAttributes ( const std::string &  page_text)
protected

Definition at line 233 of file stringrenderer.cpp.

233 {
234 if (underline_start_prob_ == 0) return;
235 PangoAttrList* attr_list = pango_layout_get_attributes(layout_);
236
237 const char* text = page_text.c_str();
238 size_t offset = 0;
239 TRand rand;
240 bool started_underline = false;
241 PangoAttribute* und_attr = nullptr;
242
243 while (offset < page_text.length()) {
244 offset += SpanUTF8Whitespace(text + offset);
245 if (offset == page_text.length()) break;
246
247 int word_start = offset;
248 int word_len = SpanUTF8NotWhitespace(text + offset);
249 offset += word_len;
250 if (started_underline) {
251 // Should we continue the underline to the next word?
252 if (RandBool(underline_continuation_prob_, &rand)) {
253 // Continue the current underline to this word.
254 und_attr->end_index = word_start + word_len;
255 } else {
256 // Otherwise end the current underline attribute at the end of the
257 // previous word.
258 pango_attr_list_insert(attr_list, und_attr);
259 started_underline = false;
260 und_attr = nullptr;
261 }
262 }
263 if (!started_underline && RandBool(underline_start_prob_, &rand)) {
264 // Start a new underline attribute
265 und_attr = pango_attr_underline_new(underline_style_);
266 und_attr->start_index = word_start;
267 und_attr->end_index = word_start + word_len;
268 started_underline = true;
269 }
270 }
271 // Finish the current underline attribute at the end of the page.
272 if (started_underline) {
273 und_attr->end_index = page_text.length();
274 pango_attr_list_insert(attr_list, und_attr);
275 }
276}
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:243

◆ StripUnrenderableWords()

int tesseract::StringRenderer::StripUnrenderableWords ( std::string *  utf8_text) const

Definition at line 615 of file stringrenderer.cpp.

615 {
616 std::string output_text;
617 const char* text = utf8_text->c_str();
618 size_t offset = 0;
619 int num_dropped = 0;
620 while (offset < utf8_text->length()) {
621 int space_len = SpanUTF8Whitespace(text + offset);
622 output_text.append(text + offset, space_len);
623 offset += space_len;
624 if (offset == utf8_text->length()) break;
625
626 int word_len = SpanUTF8NotWhitespace(text + offset);
627 if (font_.CanRenderString(text + offset, word_len)) {
628 output_text.append(text + offset, word_len);
629 } else {
630 ++num_dropped;
631 }
632 offset += word_len;
633 }
634 utf8_text->swap(output_text);
635
636 if (num_dropped > 0) {
637 tprintf("Stripped %d unrenderable words\n", num_dropped);
638 }
639 return num_dropped;
640}
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const

◆ v_margin()

int tesseract::StringRenderer::v_margin ( ) const
inline

Definition at line 135 of file stringrenderer.h.

135{ return v_margin_; }

◆ WriteAllBoxes()

void tesseract::StringRenderer::WriteAllBoxes ( const std::string &  filename)

Definition at line 348 of file stringrenderer.cpp.

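348 {
349 BoxChar::PrepareToWrite(&boxchars_);
350 BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
351}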
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:320

Member Data Documentation

◆ add_ligatures_

bool tesseract::StringRenderer::add_ligatures_
protected

Definition at line 196 of file stringrenderer.h.

◆ box_padding_

int tesseract::StringRenderer::box_padding_
protected

Definition at line 209 of file stringrenderer.h.

◆ boxchars_

std::vector<BoxChar*> tesseract::StringRenderer::boxchars_
protected

Definition at line 208 of file stringrenderer.h.

◆ char_map_

std::unordered_map<char32, int64_t> tesseract::StringRenderer::char_map_
protected

Definition at line 214 of file stringrenderer.h.

◆ char_spacing_

int tesseract::StringRenderer::char_spacing_
protected

Definition at line 184 of file stringrenderer.h.

◆ cr_

cairo_t* tesseract::StringRenderer::cr_
protected

Definition at line 200 of file stringrenderer.h.

◆ drop_uncovered_chars_

bool tesseract::StringRenderer::drop_uncovered_chars_
protected

Definition at line 194 of file stringrenderer.h.

◆ features_

char* tesseract::StringRenderer::features_
protected

Definition at line 192 of file stringrenderer.h.

◆ font_

PangoFontInfo tesseract::StringRenderer::font_
protected

Definition at line 179 of file stringrenderer.h.

◆ font_index_

unsigned int tesseract::StringRenderer::font_index_
protected

Definition at line 216 of file stringrenderer.h.

◆ gravity_hint_strong_

bool tesseract::StringRenderer::gravity_hint_strong_
protected

Definition at line 187 of file stringrenderer.h.

◆ h_margin_

int tesseract::StringRenderer::h_margin_
protected

Definition at line 181 of file stringrenderer.h.

◆ last_offset_

int tesseract::StringRenderer::last_offset_
protected

Definition at line 217 of file stringrenderer.h.

◆ layout_

PangoLayout* tesseract::StringRenderer::layout_
protected

Definition at line 201 of file stringrenderer.h.

◆ leading_

int tesseract::StringRenderer::leading_
protected

Definition at line 185 of file stringrenderer.h.

◆ output_word_boxes_

bool tesseract::StringRenderer::output_word_boxes_
protected

Definition at line 197 of file stringrenderer.h.

◆ page_

int tesseract::StringRenderer::page_
protected

Definition at line 205 of file stringrenderer.h.

◆ page_boxes_

Boxa* tesseract::StringRenderer::page_boxes_
protected

Definition at line 211 of file stringrenderer.h.

◆ page_height_

int tesseract::StringRenderer::page_height_
protected

Definition at line 181 of file stringrenderer.h.

◆ page_width_

int tesseract::StringRenderer::page_width_
protected

Definition at line 181 of file stringrenderer.h.

◆ pen_color_

double tesseract::StringRenderer::pen_color_[3]
protected

Definition at line 183 of file stringrenderer.h.

◆ render_fullwidth_latin_

bool tesseract::StringRenderer::render_fullwidth_latin_
protected

Definition at line 188 of file stringrenderer.h.

◆ resolution_

int tesseract::StringRenderer::resolution_
protected

Definition at line 185 of file stringrenderer.h.

◆ start_box_

int tesseract::StringRenderer::start_box_
protected

Definition at line 204 of file stringrenderer.h.

◆ strip_unrenderable_words_

bool tesseract::StringRenderer::strip_unrenderable_words_
protected

Definition at line 195 of file stringrenderer.h.

◆ surface_

cairo_surface_t* tesseract::StringRenderer::surface_
protected

Definition at line 199 of file stringrenderer.h.

◆ total_chars_

int tesseract::StringRenderer::total_chars_
protected

Definition at line 215 of file stringrenderer.h.

◆ underline_continuation_prob_

double tesseract::StringRenderer::underline_continuation_prob_
protected

Definition at line 190 of file stringrenderer.h.

◆ underline_start_prob_

double tesseract::StringRenderer::underline_start_prob_
protected

Definition at line 189 of file stringrenderer.h.

◆ underline_style_

PangoUnderline tesseract::StringRenderer::underline_style_
protected

Definition at line 191 of file stringrenderer.h.

◆ v_margin_

int tesseract::StringRenderer::v_margin_
protected

Definition at line 181 of file stringrenderer.h.

◆ vertical_text_

bool tesseract::StringRenderer::vertical_text_
protected

Definition at line 186 of file stringrenderer.h.


The documentation for this class was generated from the following files: