tesseract 4.1.1
Loading...
Searching...
No Matches
resultiterator.cpp
Go to the documentation of this file.
1
2// File: resultiterator.cpp
3// Description: Iterator for tesseract results that is capable of
4// iterating in proper reading order over Bi Directional
5// (e.g. mixed Hebrew and English) text.
6// Author: David Eger
7// Created: Fri May 27 13:58:06 PST 2011
8//
9// (C) Copyright 2011, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#include "resultiterator.h"
23
24#include "allheaders.h"
25#include "pageres.h"
26#include "strngs.h"
27#include "tesseractclass.h"
28#include "unicharset.h"
29#include "unicodes.h"
30#include <set>
31#include <vector>
32
33namespace tesseract {
34
36 : LTRResultIterator(resit) {
37 in_minor_direction_ = false;
38 at_beginning_of_minor_run_ = false;
39 preserve_interword_spaces_ = false;
40
41 auto *p = ParamUtils::FindParam<BoolParam>(
42 "preserve_interword_spaces", GlobalParams()->bool_params,
44 if (p != nullptr) preserve_interword_spaces_ = (bool)(*p);
45
46 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
47 MoveToLogicalStartOfTextline();
48}
49
51 const LTRResultIterator &resit) {
52 return new ResultIterator(resit);
53}
54
56 return current_paragraph_is_ltr_;
57}
58
59bool ResultIterator::CurrentParagraphIsLtr() const {
60 if (!it_->word())
61 return true; // doesn't matter.
62 LTRResultIterator it(*this);
63 it.RestartParagraph();
64 // Try to figure out the ltr-ness of the paragraph. The rules below
65 // make more sense in the context of a difficult paragraph example.
66 // Here we denote {ltr characters, RTL CHARACTERS}:
67 //
68 // "don't go in there!" DAIS EH
69 // EHT OTNI DEPMUJ FELSMIH NEHT DNA
70 // .GNIDLIUB GNINRUB
71 //
72 // On the first line, the left-most word is LTR and the rightmost word
73 // is RTL. Thus, we are better off taking the majority direction for
74 // the whole paragraph contents. So instead of "the leftmost word is LTR"
75 // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
76 // would not do: Typically an RTL paragraph would *not* start with an LTR
77 // word. So our heuristics are as follows:
78 //
79 // (1) If the first text line has an RTL word in the left-most position
80 // it is RTL.
81 // (2) If the first text line has an LTR word in the right-most position
82 // it is LTR.
83 // (3) If neither of the above is true, take the majority count for the
84 // paragraph -- if there are more rtl words, it is RTL. If there
85 // are more LTR words, it's LTR.
86 bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
87 bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
88 int num_ltr, num_rtl;
89 num_rtl = leftmost_rtl ? 1 : 0;
90 num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
91 for (it.Next(RIL_WORD);
92 !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
93 it.Next(RIL_WORD)) {
94 StrongScriptDirection dir = it.WordDirection();
95 rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
96 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
97 num_ltr += rightmost_ltr ? 1 : 0;
98 }
99 if (leftmost_rtl)
100 return false;
101 if (rightmost_ltr)
102 return true;
103 // First line is ambiguous. Take statistics on the whole paragraph.
104 if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
105 StrongScriptDirection dir = it.WordDirection();
106 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
107 num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
108 } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
109 return num_ltr >= num_rtl;
110}
111
112const int ResultIterator::kMinorRunStart = -1;
113const int ResultIterator::kMinorRunEnd = -2;
114const int ResultIterator::kComplexWord = -3;
115
116void ResultIterator::CalculateBlobOrder(
117 GenericVector<int> *blob_indices) const {
118 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
119 blob_indices->clear();
120 if (Empty(RIL_WORD)) return;
121 if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
122 // Easy! just return the blobs in order;
123 for (int i = 0; i < word_length_; i++)
124 blob_indices->push_back(i);
125 return;
126 }
127
128 // The blobs are in left-to-right order, but the current reading context
129 // is right-to-left.
130 const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
131 const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
132 const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
133 const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
134 const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
135 const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
136 const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
137
138 // Step 1: Scan for and mark European Number sequences
139 // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
140 GenericVector<int> letter_types;
141 for (int i = 0; i < word_length_; i++) {
142 letter_types.push_back(it_->word()->SymbolDirection(i));
143 }
144 // Convert a single separtor sandwiched between two EN's into an EN.
145 for (int i = 0; i + 2 < word_length_; i++) {
146 if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
147 (letter_types[i + 1] == U_EURO_NUM_SEP ||
148 letter_types[i + 1] == U_COMMON_NUM_SEP)) {
149 letter_types[i + 1] = U_EURO_NUM;
150 }
151 }
152 // Scan for sequences of European Number Terminators around ENs and convert
153 // them to ENs.
154 for (int i = 0; i < word_length_; i++) {
155 if (letter_types[i] == U_EURO_NUM_TERM) {
156 int j = i + 1;
157 while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
158 if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
159 // The sequence [i..j] should be converted to all European Numbers.
160 for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
161 }
162 j = i - 1;
163 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
164 if (j > -1 && letter_types[j] == U_EURO_NUM) {
165 // The sequence [j..i] should be converted to all European Numbers.
166 for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
167 }
168 }
169 }
170 // Step 2: Convert all remaining types to either L or R.
171 // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
172 // All other are R.
173 for (int i = 0; i < word_length_;) {
174 int ti = letter_types[i];
175 if (ti == U_LTR || ti == U_EURO_NUM) {
176 // Left to right sequence; scan to the end of it.
177 int last_good = i;
178 for (int j = i + 1; j < word_length_; j++) {
179 int tj = letter_types[j];
180 if (tj == U_LTR || tj == U_EURO_NUM) {
181 last_good = j;
182 } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
183 // do nothing.
184 } else {
185 break;
186 }
187 }
188 // [i..last_good] is the L sequence
189 for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
190 i = last_good + 1;
191 } else {
192 letter_types[i] = U_RTL;
193 i++;
194 }
195 }
196
197 // At this point, letter_types is entirely U_LTR or U_RTL.
198 for (int i = word_length_ - 1; i >= 0;) {
199 if (letter_types[i] == U_RTL) {
200 blob_indices->push_back(i);
201 i--;
202 } else {
203 // left to right sequence. scan to the beginning.
204 int j = i - 1;
205 for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
206 // Now (j, i] is LTR
207 for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
208 i = j;
209 }
210 }
211 ASSERT_HOST(blob_indices->size() == word_length_);
212}
213
214static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
215 for (int i = 0; i < dirs.size(); i++) {
216 switch (dirs[i]) {
217 case DIR_NEUTRAL: tprintf ("N "); break;
218 case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
219 case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
220 case DIR_MIX: tprintf("Z "); break;
221 default: tprintf("? "); break;
222 }
223 }
224 tprintf("\n");
225}
226
228 bool paragraph_is_ltr,
229 const LTRResultIterator &resit,
230 GenericVectorEqEq<int> *word_indices) const {
232 CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
233}
234
236 bool paragraph_is_ltr,
237 const LTRResultIterator &resit,
239 GenericVectorEqEq<int> *word_indices) const {
242 directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
243 directions->truncate(0);
244
245 // A LTRResultIterator goes strictly left-to-right word order.
246 LTRResultIterator ltr_it(resit);
247 ltr_it.RestartRow();
248 if (ltr_it.Empty(RIL_WORD)) return;
249 do {
250 directions->push_back(ltr_it.WordDirection());
251 } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
252
253 word_indices->truncate(0);
254 CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
255}
256
258 bool paragraph_is_ltr,
260 GenericVectorEqEq<int> *reading_order) {
261 reading_order->truncate(0);
262 if (word_dirs.size() == 0) return;
263
264 // Take all of the runs of minor direction words and insert them
265 // in reverse order.
266 int minor_direction, major_direction, major_step, start, end;
267 if (paragraph_is_ltr) {
268 start = 0;
269 end = word_dirs.size();
270 major_step = 1;
271 major_direction = DIR_LEFT_TO_RIGHT;
272 minor_direction = DIR_RIGHT_TO_LEFT;
273 } else {
274 start = word_dirs.size() - 1;
275 end = -1;
276 major_step = -1;
277 major_direction = DIR_RIGHT_TO_LEFT;
278 minor_direction = DIR_LEFT_TO_RIGHT;
279 // Special rule: if there are neutral words at the right most side
280 // of a line adjacent to a left-to-right word in the middle of the
281 // line, we interpret the end of the line as a single LTR sequence.
282 if (word_dirs[start] == DIR_NEUTRAL) {
283 int neutral_end = start;
284 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
285 neutral_end--;
286 }
287 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
288 // LTR followed by neutrals.
289 // Scan for the beginning of the minor left-to-right run.
290 int left = neutral_end;
291 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
292 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
293 }
294 reading_order->push_back(kMinorRunStart);
295 for (int i = left; i < word_dirs.size(); i++) {
296 reading_order->push_back(i);
297 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
298 }
299 reading_order->push_back(kMinorRunEnd);
300 start = left - 1;
301 }
302 }
303 }
304 for (int i = start; i != end;) {
305 if (word_dirs[i] == minor_direction) {
306 int j = i;
307 while (j != end && word_dirs[j] != major_direction)
308 j += major_step;
309 if (j == end) j -= major_step;
310 while (j != i && word_dirs[j] != minor_direction)
311 j -= major_step;
312 // [j..i] is a minor direction run.
313 reading_order->push_back(kMinorRunStart);
314 for (int k = j; k != i; k -= major_step) {
315 reading_order->push_back(k);
316 }
317 reading_order->push_back(i);
318 reading_order->push_back(kMinorRunEnd);
319 i = j + major_step;
320 } else {
321 reading_order->push_back(i);
322 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
323 i += major_step;
324 }
325 }
326}
327
328int ResultIterator::LTRWordIndex() const {
329 int this_word_index = 0;
330 LTRResultIterator textline(*this);
331 textline.RestartRow();
332 while (!textline.PositionedAtSameWord(it_)) {
333 this_word_index++;
334 textline.Next(RIL_WORD);
335 }
336 return this_word_index;
337}
338
339void ResultIterator::MoveToLogicalStartOfWord() {
340 if (word_length_ == 0) {
341 BeginWord(0);
342 return;
343 }
344 GenericVector<int> blob_order;
345 CalculateBlobOrder(&blob_order);
346 if (blob_order.size() == 0 || blob_order[0] == 0) return;
347 BeginWord(blob_order[0]);
348}
349
350bool ResultIterator::IsAtFinalSymbolOfWord() const {
351 if (!it_->word()) return true;
352 GenericVector<int> blob_order;
353 CalculateBlobOrder(&blob_order);
354 return blob_order.size() == 0 || blob_order.back() == blob_index_;
355}
356
357bool ResultIterator::IsAtFirstSymbolOfWord() const {
358 if (!it_->word()) return true;
359 GenericVector<int> blob_order;
360 CalculateBlobOrder(&blob_order);
361 return blob_order.size() == 0 || blob_order[0] == blob_index_;
362}
363
364void ResultIterator::AppendSuffixMarks(STRING *text) const {
365 if (!it_->word()) return;
366 bool reading_direction_is_ltr =
367 current_paragraph_is_ltr_ ^ in_minor_direction_;
368 // scan forward to see what meta-information the word ordering algorithm
369 // left us.
370 // If this word is at the *end* of a minor run, insert the other
371 // direction's mark; else if this was a complex word, insert the
372 // current reading order's mark.
373 GenericVectorEqEq<int> textline_order;
374 CalculateTextlineOrder(current_paragraph_is_ltr_,
375 *this, &textline_order);
376 int this_word_index = LTRWordIndex();
377 int i = textline_order.get_index(this_word_index);
378 if (i < 0) return;
379
380 int last_non_word_mark = 0;
381 for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
382 last_non_word_mark = textline_order[i];
383 }
384 if (last_non_word_mark == kComplexWord) {
385 *text += reading_direction_is_ltr ? kLRM : kRLM;
386 } else if (last_non_word_mark == kMinorRunEnd) {
387 if (current_paragraph_is_ltr_) {
388 *text += kLRM;
389 } else {
390 *text += kRLM;
391 }
392 }
393}
394
395void ResultIterator::MoveToLogicalStartOfTextline() {
396 GenericVectorEqEq<int> word_indices;
397 RestartRow();
398 CalculateTextlineOrder(current_paragraph_is_ltr_,
399 dynamic_cast<const LTRResultIterator&>(*this),
400 &word_indices);
401 int i = 0;
402 for (; i < word_indices.size() && word_indices[i] < 0; i++) {
403 if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
404 else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
405 }
406 if (in_minor_direction_) at_beginning_of_minor_run_ = true;
407 if (i >= word_indices.size()) return;
408 int first_word_index = word_indices[i];
409 for (int j = 0; j < first_word_index; j++) {
411 }
412 MoveToLogicalStartOfWord();
413}
414
417 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
418 in_minor_direction_ = false;
419 at_beginning_of_minor_run_ = false;
420 MoveToLogicalStartOfTextline();
421}
422
424 if (it_->block() == nullptr) return false; // already at end!
425 switch (level) {
426 case RIL_BLOCK: // explicit fall-through
427 case RIL_PARA: // explicit fall-through
428 case RIL_TEXTLINE:
429 if (!PageIterator::Next(level)) return false;
431 // if we've advanced to a new paragraph,
432 // recalculate current_paragraph_is_ltr_
433 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
434 }
435 in_minor_direction_ = false;
436 MoveToLogicalStartOfTextline();
437 return it_->block() != nullptr;
438 case RIL_SYMBOL:
439 {
440 GenericVector<int> blob_order;
441 CalculateBlobOrder(&blob_order);
442 int next_blob = 0;
443 while (next_blob < blob_order.size() &&
444 blob_index_ != blob_order[next_blob])
445 next_blob++;
446 next_blob++;
447 if (next_blob < blob_order.size()) {
448 // we're in the same word; simply advance one blob.
449 BeginWord(blob_order[next_blob]);
450 at_beginning_of_minor_run_ = false;
451 return true;
452 }
453 level = RIL_WORD; // we've fallen through to the next word.
454 }
455 // Fall through.
456 case RIL_WORD: // explicit fall-through.
457 {
458 if (it_->word() == nullptr) return Next(RIL_BLOCK);
459 GenericVectorEqEq<int> word_indices;
460 int this_word_index = LTRWordIndex();
461 CalculateTextlineOrder(current_paragraph_is_ltr_,
462 *this,
463 &word_indices);
464 int final_real_index = word_indices.size() - 1;
465 while (final_real_index > 0 && word_indices[final_real_index] < 0)
466 final_real_index--;
467 for (int i = 0; i < final_real_index; i++) {
468 if (word_indices[i] == this_word_index) {
469 int j = i + 1;
470 for (; j < final_real_index && word_indices[j] < 0; j++) {
471 if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
472 if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
473 }
474 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
475 // awesome, we move to word_indices[j]
476 if (BidiDebug(3)) {
477 tprintf("Next(RIL_WORD): %d -> %d\n",
478 this_word_index, word_indices[j]);
479 }
481 for (int k = 0; k < word_indices[j]; k++) {
483 }
484 MoveToLogicalStartOfWord();
485 return true;
486 }
487 }
488 if (BidiDebug(3)) {
489 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
490 }
491 // we're going off the end of the text line.
492 return Next(RIL_TEXTLINE);
493 }
494 }
495 ASSERT_HOST(false); // shouldn't happen.
496 return false;
497}
498
500 if (it_->block() == nullptr) return false; // Already at the end!
501 if (it_->word() == nullptr) return true; // In an image block.
502 if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
503
504 bool at_word_start = IsAtFirstSymbolOfWord();
505 if (level == RIL_WORD) return at_word_start;
506
507 ResultIterator line_start(*this);
508 // move to the first word in the line...
509 line_start.MoveToLogicalStartOfTextline();
510
511 bool at_textline_start = at_word_start && *line_start.it_ == *it_;
512 if (level == RIL_TEXTLINE) return at_textline_start;
513
514 // now we move to the left-most word...
515 line_start.RestartRow();
516 bool at_block_start = at_textline_start &&
517 line_start.it_->block() != line_start.it_->prev_block();
518 if (level == RIL_BLOCK) return at_block_start;
519
520 bool at_para_start = at_block_start ||
521 (at_textline_start &&
522 line_start.it_->row()->row->para() !=
523 line_start.it_->prev_row()->row->para());
524 if (level == RIL_PARA) return at_para_start;
525
526 ASSERT_HOST(false); // shouldn't happen.
527 return false;
528}
529
536 PageIteratorLevel element) const {
537 if (Empty(element)) return true; // Already at the end!
538 // The result is true if we step forward by element and find we are
539 // at the the end of the page or at beginning of *all* levels in:
540 // [level, element).
541 // When there is more than one level difference between element and level,
542 // we could for instance move forward one symbol and still be at the first
543 // word on a line, so we also have to be at the first symbol in a word.
544 ResultIterator next(*this);
545 next.Next(element);
546 if (next.Empty(element)) return true; // Reached the end of the page.
547 while (element > level) {
548 element = static_cast<PageIteratorLevel>(element - 1);
549 if (!next.IsAtBeginningOf(element))
550 return false;
551 }
552 return true;
553}
554
555// Returns the number of blanks before the current word.
557 if (CurrentParagraphIsLtr()) return LTRResultIterator::BlanksBeforeWord();
558 return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
559}
560
566 if (it_->word() == nullptr) return nullptr; // Already at the end!
567 STRING text;
568 switch (level) {
569 case RIL_BLOCK:
570 {
571 ResultIterator pp(*this);
572 do {
573 pp.AppendUTF8ParagraphText(&text);
574 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
575 }
576 break;
577 case RIL_PARA:
578 AppendUTF8ParagraphText(&text);
579 break;
580 case RIL_TEXTLINE:
581 {
582 ResultIterator it(*this);
583 it.MoveToLogicalStartOfTextline();
584 it.IterateAndAppendUTF8TextlineText(&text);
585 }
586 break;
587 case RIL_WORD:
588 AppendUTF8WordText(&text);
589 break;
590 case RIL_SYMBOL:
591 {
592 bool reading_direction_is_ltr =
593 current_paragraph_is_ltr_ ^ in_minor_direction_;
594 if (at_beginning_of_minor_run_) {
595 text += reading_direction_is_ltr ? kLRM : kRLM;
596 }
597 text = it_->word()->BestUTF8(blob_index_, false);
598 if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
599 }
600 break;
601 }
602 int length = text.length() + 1;
603 char* result = new char[length];
604 strncpy(result, text.string(), length);
605 return result;
606}
607
608std::vector<std::vector<std::pair<const char*, float>>>*
610 if (it_->word() != nullptr) {
611 return &it_->word()->timesteps;
612 } else {
613 return nullptr;
614 }
615}
616
617void ResultIterator::AppendUTF8WordText(STRING *text) const {
618 if (!it_->word()) return;
619 ASSERT_HOST(it_->word()->best_choice != nullptr);
620 bool reading_direction_is_ltr =
621 current_paragraph_is_ltr_ ^ in_minor_direction_;
622 if (at_beginning_of_minor_run_) {
623 *text += reading_direction_is_ltr ? kLRM : kRLM;
624 }
625
626 GenericVector<int> blob_order;
627 CalculateBlobOrder(&blob_order);
628 for (int i = 0; i < blob_order.size(); i++) {
629 *text += it_->word()->BestUTF8(blob_order[i], false);
630 }
631 AppendSuffixMarks(text);
632}
633
634void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
635 if (Empty(RIL_WORD)) {
636 Next(RIL_WORD);
637 return;
638 }
639 if (BidiDebug(1)) {
640 GenericVectorEqEq<int> textline_order;
642 CalculateTextlineOrder(current_paragraph_is_ltr_,
643 *this, &dirs, &textline_order);
644 tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
645 current_paragraph_is_ltr_ ? "ltr" : "rtl");
646 PrintScriptDirs(dirs);
647 tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
648 current_paragraph_is_ltr_ ? "ltr" : "rtl");
649 for (int i = 0; i < textline_order.size(); i++) {
650 tprintf("%d ", textline_order[i]);
651 }
652 tprintf("\n");
653 }
654
655 int words_appended = 0;
656 do {
657 int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
658 : (words_appended > 0);
659 for (int i = 0; i < numSpaces; ++i) {
660 *text += " ";
661 }
662 AppendUTF8WordText(text);
663 words_appended++;
664 if (BidiDebug(2)) {
665 tprintf("Num spaces=%d, text=%s\n", numSpaces, text->string());
666 }
668 if (BidiDebug(1)) {
669 tprintf("%d words printed\n", words_appended);
670 }
671 *text += line_separator_;
672 // If we just finished a paragraph, add an extra newline.
674 *text += paragraph_separator_;
675 }
676}
677
678void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
679 ResultIterator it(*this);
680 it.RestartParagraph();
681 it.MoveToLogicalStartOfTextline();
682 if (it.Empty(RIL_WORD)) return;
683 do {
684 it.IterateAndAppendUTF8TextlineText(text);
685 } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
686}
687
688bool ResultIterator::BidiDebug(int min_level) const {
689 int debug_level = 1;
690 auto *p = ParamUtils::FindParam<IntParam>(
691 "bidi_debug", GlobalParams()->int_params,
693 if (p != nullptr) debug_level = (int32_t)(*p);
694 return debug_level >= min_level;
695}
696
697} // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:88
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:32
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
StrongScriptDirection
Definition: unichar.h:41
@ DIR_MIX
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_NEUTRAL
Definition: unichar.h:42
const char *const kLRM
Left-to-Right Mark.
Definition: unicodes.cpp:23
const char *const kRLM
Right-to-Left Mark.
Definition: unicodes.cpp:24
int push_back(T object)
int size() const
Definition: genericvector.h:72
T & back() const
int get_index(const T &object) const
void truncate(int size)
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
bool Empty(PageIteratorLevel level) const
TESS_LOCAL void BeginWord(int offset)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
static const int kMinorRunEnd
static const int kMinorRunStart
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
bool Next(PageIteratorLevel level) override
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
static const int kComplexWord
PARA * para() const
Definition: ocrrow.h:118
ROW * row
Definition: pageres.h:140
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:385
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:363
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:221
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool UnicharsInReadingOrder() const
Definition: pageres.h:427
WERD * word
Definition: pageres.h:186
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
BLOCK_RES * prev_block() const
Definition: pageres.h:751
ROW_RES * prev_row() const
Definition: pageres.h:748
BLOCK_RES * block() const
Definition: pageres.h:760
uint8_t space()
Definition: werd.h:99
ParamsVectors * params()
Definition: ccutil.h:67
GenericVector< IntParam * > int_params
Definition: params.h:43
GenericVector< BoolParam * > bool_params
Definition: params.h:44
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
@ U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
@ U_OTHER_NEUTRAL
Definition: unicharset.h:167
@ U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
@ U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
@ U_EUROPEAN_NUMBER
Definition: unicharset.h:159
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
@ U_LEFT_TO_RIGHT
Definition: unicharset.h:157