tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

Public Member Functions

 ~ResultIterator () override=default
 
void Begin () override
 
bool Next (PageIteratorLevel level) override
 
bool IsAtBeginningOf (PageIteratorLevel level) const override
 
bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const override
 
int BlanksBeforeWord () const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices () const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
 ~LTRResultIterator () override
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
void RowAttributes (float *row_height, float *descenders, float *ascenders) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
int BlanksBeforeWord () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
const void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
bool HasTruthString () const
 
bool EquivalentToTruth (const char *str) const
 
char * WordTruthUTF8Text () const
 
char * WordNormedUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void Begin ()
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
virtual bool Next (PageIteratorLevel level)
 
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
 
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
 
int Cmp (const PageIterator &other) const
 
void SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBox (PageIteratorLevel level, int padding, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pta * BlockPolygon () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 
bool SetWordBlamerBundle (BlamerBundle *blamer_bundle)
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void BeginWord (int offset)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
bool include_upper_dots_
 
bool include_lower_dots_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Detailed Description

Definition at line 41 of file resultiterator.h.

Constructor & Destructor Documentation

◆ ~ResultIterator()

tesseract::ResultIterator::~ResultIterator ( )
override default

ResultIterator is copy constructible! The default copy constructor works just fine for us.

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator & resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This is protected (not public) because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 35 of file resultiterator.cpp.

36 : LTRResultIterator(resit) {
37 in_minor_direction_ = false;
38 at_beginning_of_minor_run_ = false;
39 preserve_interword_spaces_ = false;
40
41 auto *p = ParamUtils::FindParam<BoolParam>(
42 "preserve_interword_spaces", GlobalParams()->bool_params,
43 tesseract_->params()->bool_params);
44 if (p != nullptr) preserve_interword_spaces_ = (bool)(*p);
45
46 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
47 MoveToLogicalStartOfTextline();
48}
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:32
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
ParamsVectors * params()
Definition: ccutil.h:67
GenericVector< BoolParam * > bool_params
Definition: params.h:44

Member Function Documentation

◆ Begin()

void tesseract::ResultIterator::Begin ( )
override virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 415 of file resultiterator.cpp.

415 {
417 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
418 in_minor_direction_ = false;
419 at_beginning_of_minor_run_ = false;
420 MoveToLogicalStartOfTextline();
421}

◆ BlanksBeforeWord()

int tesseract::ResultIterator::BlanksBeforeWord ( ) const

Definition at line 556 of file resultiterator.cpp.

556 {
557 if (CurrentParagraphIsLtr()) return LTRResultIterator::BlanksBeforeWord();
558 return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
559}
bool IsAtBeginningOf(PageIteratorLevel level) const override

◆ CalculateTextlineOrder()

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values:
- kMinorRunStart: Start of minor direction text.
- kMinorRunEnd: End of minor direction text.
- kComplexWord: The next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 257 of file resultiterator.cpp.

260 {
261 reading_order->truncate(0);
262 if (word_dirs.size() == 0) return;
263
264 // Take all of the runs of minor direction words and insert them
265 // in reverse order.
266 int minor_direction, major_direction, major_step, start, end;
267 if (paragraph_is_ltr) {
268 start = 0;
269 end = word_dirs.size();
270 major_step = 1;
271 major_direction = DIR_LEFT_TO_RIGHT;
272 minor_direction = DIR_RIGHT_TO_LEFT;
273 } else {
274 start = word_dirs.size() - 1;
275 end = -1;
276 major_step = -1;
277 major_direction = DIR_RIGHT_TO_LEFT;
278 minor_direction = DIR_LEFT_TO_RIGHT;
279 // Special rule: if there are neutral words at the right most side
280 // of a line adjacent to a left-to-right word in the middle of the
281 // line, we interpret the end of the line as a single LTR sequence.
282 if (word_dirs[start] == DIR_NEUTRAL) {
283 int neutral_end = start;
284 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
285 neutral_end--;
286 }
287 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
288 // LTR followed by neutrals.
289 // Scan for the beginning of the minor left-to-right run.
290 int left = neutral_end;
291 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
292 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
293 }
294 reading_order->push_back(kMinorRunStart);
295 for (int i = left; i < word_dirs.size(); i++) {
296 reading_order->push_back(i);
297 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
298 }
299 reading_order->push_back(kMinorRunEnd);
300 start = left - 1;
301 }
302 }
303 }
304 for (int i = start; i != end;) {
305 if (word_dirs[i] == minor_direction) {
306 int j = i;
307 while (j != end && word_dirs[j] != major_direction)
308 j += major_step;
309 if (j == end) j -= major_step;
310 while (j != i && word_dirs[j] != minor_direction)
311 j -= major_step;
312 // [j..i] is a minor direction run.
313 reading_order->push_back(kMinorRunStart);
314 for (int k = j; k != i; k -= major_step) {
315 reading_order->push_back(k);
316 }
317 reading_order->push_back(i);
318 reading_order->push_back(kMinorRunEnd);
319 i = j + major_step;
320 } else {
321 reading_order->push_back(i);
322 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
323 i += major_step;
324 }
325 }
326}
@ DIR_MIX
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_NEUTRAL
Definition: unichar.h:42
int push_back(T object)
int size() const
Definition: genericvector.h:72
void truncate(int size)
static const int kMinorRunEnd
static const int kMinorRunStart
static const int kComplexWord

◆ GetBestLSTMSymbolChoices()

std::vector< std::vector< std::pair< const char *, float > > > * tesseract::ResultIterator::GetBestLSTMSymbolChoices ( ) const
virtual

Returns the LSTM choices for every LSTM timestep for the current word.

Definition at line 609 of file resultiterator.cpp.

609 {
610 if (it_->word() != nullptr) {
611 return &it_->word()->timesteps;
612 } else {
613 return nullptr;
614 }
615}
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:221
WERD_RES * word() const
Definition: pageres.h:754

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Definition at line 565 of file resultiterator.cpp.

565 {
566 if (it_->word() == nullptr) return nullptr; // Already at the end!
567 STRING text;
568 switch (level) {
569 case RIL_BLOCK:
570 {
571 ResultIterator pp(*this);
572 do {
573 pp.AppendUTF8ParagraphText(&text);
574 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
575 }
576 break;
577 case RIL_PARA:
578 AppendUTF8ParagraphText(&text);
579 break;
580 case RIL_TEXTLINE:
581 {
582 ResultIterator it(*this);
583 it.MoveToLogicalStartOfTextline();
584 it.IterateAndAppendUTF8TextlineText(&text);
585 }
586 break;
587 case RIL_WORD:
588 AppendUTF8WordText(&text);
589 break;
590 case RIL_SYMBOL:
591 {
592 bool reading_direction_is_ltr =
593 current_paragraph_is_ltr_ ^ in_minor_direction_;
594 if (at_beginning_of_minor_run_) {
595 text += reading_direction_is_ltr ? kLRM : kRLM;
596 }
597 text = it_->word()->BestUTF8(blob_index_, false);
598 if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
599 }
600 break;
601 }
602 int length = text.length() + 1;
603 char* result = new char[length];
604 strncpy(result, text.string(), length);
605 return result;
606}
const char *const kLRM
Left-to-Right Mark.
Definition: unicodes.cpp:23
const char *const kRLM
Right-to-Left Mark.
Definition: unicodes.cpp:24
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:363
BLOCK_RES * block() const
Definition: pageres.h:760
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
override virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

Definition at line 499 of file resultiterator.cpp.

499 {
500 if (it_->block() == nullptr) return false; // Already at the end!
501 if (it_->word() == nullptr) return true; // In an image block.
502 if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
503
504 bool at_word_start = IsAtFirstSymbolOfWord();
505 if (level == RIL_WORD) return at_word_start;
506
507 ResultIterator line_start(*this);
508 // move to the first word in the line...
509 line_start.MoveToLogicalStartOfTextline();
510
511 bool at_textline_start = at_word_start && *line_start.it_ == *it_;
512 if (level == RIL_TEXTLINE) return at_textline_start;
513
514 // now we move to the left-most word...
515 line_start.RestartRow();
516 bool at_block_start = at_textline_start &&
517 line_start.it_->block() != line_start.it_->prev_block();
518 if (level == RIL_BLOCK) return at_block_start;
519
520 bool at_para_start = at_block_start ||
521 (at_textline_start &&
522 line_start.it_->row()->row->para() !=
523 line_start.it_->prev_row()->row->para());
524 if (level == RIL_PARA) return at_para_start;
525
526 ASSERT_HOST(false); // shouldn't happen.
527 return false;
528}
#define ASSERT_HOST(x)
Definition: errcode.h:88

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
override virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 535 of file resultiterator.cpp.

536 {
537 if (Empty(element)) return true; // Already at the end!
538 // The result is true if we step forward by element and find we are
539 // at the end of the page or at beginning of *all* levels in:
540 // [level, element).
541 // When there is more than one level difference between element and level,
542 // we could for instance move forward one symbol and still be at the first
543 // word on a line, so we also have to be at the first symbol in a word.
544 ResultIterator next(*this);
545 next.Next(element);
546 if (next.Empty(element)) return true; // Reached the end of the page.
547 while (element > level) {
548 element = static_cast<PageIteratorLevel>(element - 1);
549 if (!next.IsAtBeginningOf(element))
550 return false;
551 }
552 return true;
553}
bool Empty(PageIteratorLevel level) const

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
override virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 423 of file resultiterator.cpp.

423 {
424 if (it_->block() == nullptr) return false; // already at end!
425 switch (level) {
426 case RIL_BLOCK: // explicit fall-through
427 case RIL_PARA: // explicit fall-through
428 case RIL_TEXTLINE:
429 if (!PageIterator::Next(level)) return false;
430 if (IsWithinFirstTextlineOfParagraph()) {
431 // if we've advanced to a new paragraph,
432 // recalculate current_paragraph_is_ltr_
433 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
434 }
435 in_minor_direction_ = false;
436 MoveToLogicalStartOfTextline();
437 return it_->block() != nullptr;
438 case RIL_SYMBOL:
439 {
440 GenericVector<int> blob_order;
441 CalculateBlobOrder(&blob_order);
442 int next_blob = 0;
443 while (next_blob < blob_order.size() &&
444 blob_index_ != blob_order[next_blob])
445 next_blob++;
446 next_blob++;
447 if (next_blob < blob_order.size()) {
448 // we're in the same word; simply advance one blob.
449 BeginWord(blob_order[next_blob]);
450 at_beginning_of_minor_run_ = false;
451 return true;
452 }
453 level = RIL_WORD; // we've fallen through to the next word.
454 }
455 // Fall through.
456 case RIL_WORD: // explicit fall-through.
457 {
458 if (it_->word() == nullptr) return Next(RIL_BLOCK);
459 GenericVectorEqEq<int> word_indices;
460 int this_word_index = LTRWordIndex();
461 CalculateTextlineOrder(current_paragraph_is_ltr_,
462 *this,
463 &word_indices);
464 int final_real_index = word_indices.size() - 1;
465 while (final_real_index > 0 && word_indices[final_real_index] < 0)
466 final_real_index--;
467 for (int i = 0; i < final_real_index; i++) {
468 if (word_indices[i] == this_word_index) {
469 int j = i + 1;
470 for (; j < final_real_index && word_indices[j] < 0; j++) {
471 if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
472 if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
473 }
474 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
475 // awesome, we move to word_indices[j]
476 if (BidiDebug(3)) {
477 tprintf("Next(RIL_WORD): %d -> %d\n",
478 this_word_index, word_indices[j]);
479 }
480 PageIterator::RestartRow();
481 for (int k = 0; k < word_indices[j]; k++) {
482 PageIterator::Next(RIL_WORD);
483 }
484 MoveToLogicalStartOfWord();
485 return true;
486 }
487 }
488 if (BidiDebug(3)) {
489 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
490 }
491 // we're going off the end of the text line.
492 return Next(RIL_TEXTLINE);
493 }
494 }
495 ASSERT_HOST(false); // shouldn't happen.
496 return false;
497}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
TESS_LOCAL void BeginWord(int offset)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
bool Next(PageIteratorLevel level) override

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 55 of file resultiterator.cpp.

55 {
56 return current_paragraph_is_ltr_;
57}

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator & resit)
static

Definition at line 50 of file resultiterator.cpp.

51 {
52 return new ResultIterator(resit);
53}

Member Data Documentation

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3
static

Definition at line 143 of file resultiterator.h.

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

Definition at line 142 of file resultiterator.h.

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1
static

Definition at line 141 of file resultiterator.h.


The documentation for this class was generated from the following files: