tesseract 4.1.1
paragraphs.cpp
1/**********************************************************************
2 * File: paragraphs.cpp
3 * Description: Paragraph detection for tesseract.
4 * Author: David Eger
5 *
6 * (C) Copyright 2011, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include "paragraphs.h"
20#include <cctype> // for isspace
21#include <cmath> // for abs
22#include <cstdio> // for snprintf
23#include <cstdlib> // for abs
24#include <cstring> // for strchr, strlen
25#include <algorithm> // for max
26#include <memory> // for unique_ptr
27#include "genericvector.h" // for GenericVector, GenericVectorEqEq
28#include "helpers.h" // for UpdateRange, ClipToRange
29#include "host.h" // for NearlyEqual
30#include "mutableiterator.h" // for MutableIterator
31#include "ocrblock.h" // for BLOCK
32#include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
33#include "ocrrow.h" // for ROW
34#include "pageiterator.h" // for PageIterator
35#include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
36#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
37#include "pdblock.h" // for PDBLK
38#include "polyblk.h" // for POLY_BLOCK
39#include "publictypes.h" // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40#include "ratngs.h" // for WERD_CHOICE
41#include "rect.h" // for TBOX
42#include "statistc.h" // for STATS
43#include "strngs.h" // for STRING
44#include "tprintf.h" // for tprintf
45#include "unichar.h" // for UNICHAR, UNICHAR_ID
46#include "unicharset.h" // for UNICHARSET
47#include "unicodes.h" // for kPDF, kRLE
48#include "werd.h" // for WERD, W_REP_CHAR
49
50namespace tesseract {
51
52// Special "weak" ParagraphModels.
53const ParagraphModel *kCrownLeft
54 = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
55const ParagraphModel *kCrownRight
56 = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
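// These are sentinel pointer values (note the 0xDEAD... bit patterns), used
// only for identity comparisons and never dereferenced: rows tagged with
// kCrownLeft/kCrownRight are flush "crown" paragraphs whose real model is
// chosen later (see DowngradeWeakestToCrowns below).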
57
58// Do the text and geometry of two rows support a paragraph break between them?
59static bool LikelyParagraphStart(const RowScratchRegisters &before,
 60 const RowScratchRegisters &after,
 61 tesseract::ParagraphJustification justification);
62
63// Given the width of a typical space between words, what is the threshold
 64// by which we think left and right alignments for paragraphs
65// can vary and still be aligned.
66static int Epsilon(int space_pix) {
67 return space_pix * 4 / 5;
68}
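// For example, with a typical interword space of 10 pixels, Epsilon() returns
// 8, so two alignments within 8 pixels of each other count as the same tab stop.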
69
70static bool AcceptableRowArgs(
71 int debug_level, int min_num_rows, const char *function_name,
 72 const GenericVector<RowScratchRegisters> *rows,
 73 int row_start, int row_end) {
74 if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
75 tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
76 row_start, row_end, rows->size());
77 return false;
78 }
79 if (row_end - row_start < min_num_rows) {
80 if (debug_level > 1) {
81 tprintf("# Too few rows[%d, %d) for %s.\n",
82 row_start, row_end, function_name);
83 }
84 return false;
85 }
86 return true;
87}
88
89// =============================== Debug Code ================================
90
91// Convert an integer to a decimal string.
92static STRING StrOf(int num) {
93 char buffer[30];
94 snprintf(buffer, sizeof(buffer), "%d", num);
95 return STRING(buffer);
96}
97
98// Given a row-major matrix of unicode text and a column separator, print
99// a formatted table. For ASCII, we get good column alignment.
100static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
101 const STRING &colsep) {
102 GenericVector<int> max_col_widths;
103 for (int r = 0; r < rows.size(); r++) {
104 int num_columns = rows[r].size();
105 for (int c = 0; c < num_columns; c++) {
106 int num_unicodes = 0;
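 // Count Unicode code points rather than bytes: bytes of the form 10xxxxxx
 // are UTF-8 continuation bytes and do not start a new code point.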
107 for (int i = 0; i < rows[r][c].size(); i++) {
108 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
109 }
110 if (c >= max_col_widths.size()) {
111 max_col_widths.push_back(num_unicodes);
112 } else {
113 if (num_unicodes > max_col_widths[c])
114 max_col_widths[c] = num_unicodes;
115 }
116 }
117 }
118
119 GenericVector<STRING> col_width_patterns;
120 for (int c = 0; c < max_col_widths.size(); c++) {
121 col_width_patterns.push_back(
122 STRING("%-") + StrOf(max_col_widths[c]) + "s");
123 }
124
125 for (int r = 0; r < rows.size(); r++) {
126 for (int c = 0; c < rows[r].size(); c++) {
127 if (c > 0)
128 tprintf("%s", colsep.string());
129 tprintf(col_width_patterns[c].string(), rows[r][c].string());
130 }
131 tprintf("\n");
132 }
133}
134
135static STRING RtlEmbed(const STRING &word, bool rtlify) {
136 if (rtlify)
137 return STRING(kRLE) + word + STRING(kPDF);
138 return word;
139}
140
141// Print the current thoughts of the paragraph detector.
142static void PrintDetectorState(const ParagraphTheory &theory,
 143 const GenericVector<RowScratchRegisters> &rows) {
 144 GenericVector<GenericVector<STRING> > output;
 145 output.push_back(GenericVector<STRING>());
 146 output.back().push_back("#row");
147 output.back().push_back("space");
148 output.back().push_back("..");
149 output.back().push_back("lword[widthSEL]");
150 output.back().push_back("rword[widthSEL]");
 151 RowScratchRegisters::AppendDebugHeaderFields(&output.back());
 152 output.back().push_back("text");
153
154 for (int i = 0; i < rows.size(); i++) {
 155 output.push_back(GenericVector<STRING>());
 156 GenericVector<STRING> &row = output.back();
157 const RowInfo& ri = *rows[i].ri_;
158 row.push_back(StrOf(i));
159 row.push_back(StrOf(ri.average_interword_space));
160 row.push_back(ri.has_leaders ? ".." : " ");
161 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
162 "[" + StrOf(ri.lword_box.width()) +
163 (ri.lword_likely_starts_idea ? "S" : "s") +
164 (ri.lword_likely_ends_idea ? "E" : "e") +
165 (ri.lword_indicates_list_item ? "L" : "l") +
166 "]");
167 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
168 "[" + StrOf(ri.rword_box.width()) +
169 (ri.rword_likely_starts_idea ? "S" : "s") +
170 (ri.rword_likely_ends_idea ? "E" : "e") +
171 (ri.rword_indicates_list_item ? "L" : "l") +
172 "]");
173 rows[i].AppendDebugInfo(theory, &row);
174 row.push_back(RtlEmbed(ri.text, !ri.ltr));
175 }
176 PrintTable(output, " ");
177
178 tprintf("Active Paragraph Models:\n");
179 for (int m = 0; m < theory.models().size(); m++) {
180 tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
181 }
182}
183
184static void DebugDump(
185 bool should_print,
186 const STRING &phase,
187 const ParagraphTheory &theory,
 188 const GenericVector<RowScratchRegisters> &rows) {
 189 if (!should_print)
190 return;
191 tprintf("# %s\n", phase.string());
192 PrintDetectorState(theory, rows);
193}
194
195// Print out the text for rows[row_start, row_end)
196static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
197 int row_start, int row_end) {
198 tprintf("======================================\n");
199 for (int row = row_start; row < row_end; row++) {
200 tprintf("%s\n", rows[row].ri_->text.string());
201 }
202 tprintf("======================================\n");
203}
204
205// ============= Brain Dead Language Model (ASCII Version) ===================
206
207static bool IsLatinLetter(int ch) {
208 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
209}
210
211static bool IsDigitLike(int ch) {
212 return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
213}
214
215static bool IsOpeningPunct(int ch) {
216 return strchr("'\"({[", ch) != nullptr;
217}
218
219static bool IsTerminalPunct(int ch) {
220 return strchr(":'\".?!]})", ch) != nullptr;
221}
222
 223// Return a pointer just past the leading run of characters found in toskip.
224static const char *SkipChars(const char *str, const char *toskip) {
225 while (*str != '\0' && strchr(toskip, *str)) { str++; }
226 return str;
227}
228
229static const char *SkipChars(const char *str, bool (*skip)(int)) {
230 while (*str != '\0' && skip(*str)) { str++; }
231 return str;
232}
233
234static const char *SkipOne(const char *str, const char *toskip) {
235 if (*str != '\0' && strchr(toskip, *str)) return str + 1;
236 return str;
237}
238
239// Return whether it is very likely that this is a numeral marker that could
240// start a list item. Some examples include:
241// A I iii. VI (2) 3.5. [C-4]
242static bool LikelyListNumeral(const STRING &word) {
243 const char *kRomans = "ivxlmdIVXLMD";
 244 const char *kDigits = "0123456789";
245 const char *kOpen = "[{(";
246 const char *kSep = ":;-.,";
247 const char *kClose = "]})";
248
249 int num_segments = 0;
250 const char *pos = word.string();
251 while (*pos != '\0' && num_segments < 3) {
252 // skip up to two open parens.
253 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
254 const char *numeral_end = SkipChars(numeral_start, kRomans);
255 if (numeral_end != numeral_start) {
256 // Got Roman Numeral. Great.
257 } else {
258 numeral_end = SkipChars(numeral_start, kDigits);
259 if (numeral_end == numeral_start) {
260 // If there's a single latin letter, we can use that.
261 numeral_end = SkipChars(numeral_start, IsLatinLetter);
262 if (numeral_end - numeral_start != 1)
263 break;
264 }
265 }
266 // We got some sort of numeral.
267 num_segments++;
268 // Skip any trailing parens or punctuation.
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270 if (pos == numeral_end)
271 break;
272 }
273 return *pos == '\0';
274}
275
276static bool LikelyListMark(const STRING &word) {
277 const char *kListMarks = "0Oo*.,+.";
278 return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
279}
280
281bool AsciiLikelyListItem(const STRING &word) {
282 return LikelyListMark(word) || LikelyListNumeral(word);
283}
284
285// ========== Brain Dead Language Model (Tesseract Version) ================
286
287// Return the first Unicode Codepoint from werd[pos].
288int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
289 if (!u || !werd || pos > werd->length())
290 return 0;
291 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
292}
293
294// A useful helper class for finding the first j >= i so that word[j]
295// does not have given character type.
 296class UnicodeSpanSkipper {
 297 public:
298 UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
299 : u_(unicharset), word_(word) { wordlen_ = word->length(); }
300
301 // Given an input position, return the first position >= pos not punc.
302 int SkipPunc(int pos);
303 // Given an input position, return the first position >= pos not digit.
304 int SkipDigits(int pos);
305 // Given an input position, return the first position >= pos not roman.
306 int SkipRomans(int pos);
307 // Given an input position, return the first position >= pos not alpha.
308 int SkipAlpha(int pos);
309
310 private:
311 const UNICHARSET *u_;
312 const WERD_CHOICE *word_;
313 int wordlen_;
314};
315
 316int UnicodeSpanSkipper::SkipPunc(int pos) {
 317 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
318 return pos;
319}
320
 321int UnicodeSpanSkipper::SkipDigits(int pos) {
 322 while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
323 IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
324 return pos;
325}
326
 327int UnicodeSpanSkipper::SkipRomans(int pos) {
 328 const char *kRomans = "ivxlmdIVXLMD";
329 while (pos < wordlen_) {
330 int ch = UnicodeFor(u_, word_, pos);
331 if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break;
332 pos++;
333 }
334 return pos;
335}
336
 337int UnicodeSpanSkipper::SkipAlpha(int pos) {
 338 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
339 return pos;
340}
341
342static bool LikelyListMarkUnicode(int ch) {
343 if (ch < 0x80) {
344 STRING single_ch;
345 single_ch += ch;
346 return LikelyListMark(single_ch);
347 }
348 switch (ch) {
349 // TODO(eger) expand this list of unicodes as needed.
350 case 0x00B0: // degree sign
351 case 0x2022: // bullet
352 case 0x25E6: // white bullet
353 case 0x00B7: // middle dot
354 case 0x25A1: // white square
355 case 0x25A0: // black square
356 case 0x25AA: // black small square
357 case 0x2B1D: // black very small square
358 case 0x25BA: // black right-pointing pointer
359 case 0x25CF: // black circle
360 case 0x25CB: // white circle
361 return true;
362 default:
363 break; // fall through
364 }
365 return false;
366}
367
368// Return whether it is very likely that this is a numeral marker that could
369// start a list item. Some examples include:
370// A I iii. VI (2) 3.5. [C-4]
371static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
372 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
373 return true;
374
375 UnicodeSpanSkipper m(u, werd);
376 int num_segments = 0;
377 int pos = 0;
378 while (pos < werd->length() && num_segments < 3) {
379 int numeral_start = m.SkipPunc(pos);
380 if (numeral_start > pos + 1) break;
381 int numeral_end = m.SkipRomans(numeral_start);
382 if (numeral_end == numeral_start) {
383 numeral_end = m.SkipDigits(numeral_start);
384 if (numeral_end == numeral_start) {
385 // If there's a single latin letter, we can use that.
386 numeral_end = m.SkipAlpha(numeral_start);
387 if (numeral_end - numeral_start != 1)
388 break;
389 }
390 }
391 // We got some sort of numeral.
392 num_segments++;
393 // Skip any trailing punctuation.
394 pos = m.SkipPunc(numeral_end);
395 if (pos == numeral_end)
396 break;
397 }
398 return pos == werd->length();
399}
400
401// ========= Brain Dead Language Model (combined entry points) ================
402
403// Given the leftmost word of a line either as a Tesseract unicharset + werd
404// or a utf8 string, set the following attributes for it:
405// is_list - this word might be a list number or bullet.
406// starts_idea - this word is likely to start a sentence.
407// ends_idea - this word is likely to end a sentence.
408void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
409 const STRING &utf8,
410 bool *is_list, bool *starts_idea, bool *ends_idea) {
411 *is_list = false;
412 *starts_idea = false;
413 *ends_idea = false;
414 if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
415 *ends_idea = true;
416 return;
417 }
418
419 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
420 if (UniLikelyListItem(unicharset, werd)) {
421 *is_list = true;
422 *starts_idea = true;
423 *ends_idea = true;
424 }
425 if (unicharset->get_isupper(werd->unichar_id(0))) {
426 *starts_idea = true;
427 }
428 if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
429 *starts_idea = true;
430 *ends_idea = true;
431 }
432 } else { // Assume utf8 is mostly ASCII
433 if (AsciiLikelyListItem(utf8)) {
434 *is_list = true;
435 *starts_idea = true;
436 }
437 int start_letter = utf8[0];
438 if (IsOpeningPunct(start_letter)) {
439 *starts_idea = true;
440 }
441 if (IsTerminalPunct(start_letter)) {
442 *ends_idea = true;
443 }
444 if (start_letter >= 'A' && start_letter <= 'Z') {
445 *starts_idea = true;
446 }
447 }
448}
449
450// Given the rightmost word of a line either as a Tesseract unicharset + werd
451// or a utf8 string, set the following attributes for it:
452// is_list - this word might be a list number or bullet.
453// starts_idea - this word is likely to start a sentence.
454// ends_idea - this word is likely to end a sentence.
455void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
456 const STRING &utf8,
457 bool *is_list, bool *starts_idea, bool *ends_idea) {
458 *is_list = false;
459 *starts_idea = false;
460 *ends_idea = false;
461 if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
462 *ends_idea = true;
463 return;
464 }
465
466 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
467 if (UniLikelyListItem(unicharset, werd)) {
468 *is_list = true;
469 *starts_idea = true;
470 }
471 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
472 if (unicharset->get_ispunctuation(last_letter)) {
473 *ends_idea = true;
474 }
475 } else { // Assume utf8 is mostly ASCII
476 if (AsciiLikelyListItem(utf8)) {
477 *is_list = true;
478 *starts_idea = true;
479 }
480 int last_letter = utf8[utf8.size() - 1];
481 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
482 *ends_idea = true;
483 }
484 }
485}
486
487// =============== Implementation of RowScratchRegisters =====================
488/* static */
 489void RowScratchRegisters::AppendDebugHeaderFields(
 490 GenericVector<STRING> *header) {
491 header->push_back("[lmarg,lind;rind,rmarg]");
492 header->push_back("model");
493}
494
 495void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
 496 GenericVector<STRING> *dbg) const {
497 char s[30];
498 snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
 499 lmargin_, lindent_, rindent_, rmargin_);
 500 dbg->push_back(s);
501 STRING model_string;
502 model_string += static_cast<char>(GetLineType());
503 model_string += ":";
504
505 int model_numbers = 0;
506 for (int h = 0; h < hypotheses_.size(); h++) {
507 if (hypotheses_[h].model == nullptr)
508 continue;
509 if (model_numbers > 0)
510 model_string += ",";
511 if (StrongModel(hypotheses_[h].model)) {
512 model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
513 } else if (hypotheses_[h].model == kCrownLeft) {
514 model_string += "CrL";
515 } else if (hypotheses_[h].model == kCrownRight) {
516 model_string += "CrR";
517 }
518 model_numbers++;
519 }
520 if (model_numbers == 0)
521 model_string += "0";
522
523 dbg->push_back(model_string);
524}
525
 526void RowScratchRegisters::Init(const RowInfo &row) {
 527 ri_ = &row;
 528 lmargin_ = 0;
 529 lindent_ = row.pix_ldistance;
 530 rmargin_ = 0;
 531 rindent_ = row.pix_rdistance;
 532}
533
 534LineType RowScratchRegisters::GetLineType() const {
 535 if (hypotheses_.empty())
536 return LT_UNKNOWN;
537 bool has_start = false;
538 bool has_body = false;
539 for (int i = 0; i < hypotheses_.size(); i++) {
540 switch (hypotheses_[i].ty) {
541 case LT_START: has_start = true; break;
542 case LT_BODY: has_body = true; break;
543 default:
544 tprintf("Encountered bad value in hypothesis list: %c\n",
545 hypotheses_[i].ty);
546 break;
547 }
548 }
549 if (has_start && has_body)
550 return LT_MULTIPLE;
551 return has_start ? LT_START : LT_BODY;
552}
553
 554LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
 555 if (hypotheses_.empty())
556 return LT_UNKNOWN;
557 bool has_start = false;
558 bool has_body = false;
559 for (int i = 0; i < hypotheses_.size(); i++) {
560 if (hypotheses_[i].model != model)
561 continue;
562 switch (hypotheses_[i].ty) {
563 case LT_START: has_start = true; break;
564 case LT_BODY: has_body = true; break;
565 default:
566 tprintf("Encountered bad value in hypothesis list: %c\n",
567 hypotheses_[i].ty);
568 break;
569 }
570 }
571 if (has_start && has_body)
572 return LT_MULTIPLE;
573 return has_start ? LT_START : LT_BODY;
574}
575
 576void RowScratchRegisters::SetStartLine() {
 577 LineType current_lt = GetLineType();
578 if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
579 tprintf("Trying to set a line to be START when it's already BODY.\n");
580 }
581 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
582 hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
583 }
584}
585
 586void RowScratchRegisters::SetBodyLine() {
 587 LineType current_lt = GetLineType();
588 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
589 tprintf("Trying to set a line to be BODY when it's already START.\n");
590 }
591 if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
592 hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
593 }
594}
595
 596void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
 597 hypotheses_.push_back_new(LineHypothesis(LT_START, model));
598 int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
599 if (old_idx >= 0)
600 hypotheses_.remove(old_idx);
601}
602
 603void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
 604 hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
605 int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
606 if (old_idx >= 0)
607 hypotheses_.remove(old_idx);
608}
609
 610void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
 611 for (int h = 0; h < hypotheses_.size(); h++) {
612 if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
613 models->push_back_new(hypotheses_[h].model);
614 }
615}
616
 617void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
 618 for (int h = 0; h < hypotheses_.size(); h++) {
619 if (StrongModel(hypotheses_[h].model))
620 models->push_back_new(hypotheses_[h].model);
621 }
622}
623
 624void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
 625 for (int h = 0; h < hypotheses_.size(); h++) {
626 if (hypotheses_[h].model != nullptr)
627 models->push_back_new(hypotheses_[h].model);
628 }
629}
630
 631const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
 632 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
633 return nullptr;
634 return hypotheses_[0].model;
635}
636
 637const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
 638 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
639 return nullptr;
640 return hypotheses_[0].model;
641}
642
643// Discard any hypotheses whose model is not in the given list.
 644void RowScratchRegisters::DiscardNonMatchingHypotheses(
 645 const SetOfModels &models) {
646 if (models.empty())
647 return;
648 for (int h = hypotheses_.size() - 1; h >= 0; h--) {
649 if (!models.contains(hypotheses_[h].model)) {
650 hypotheses_.remove(h);
651 }
652 }
653}
654
655// ============ Geometry based Paragraph Detection Algorithm =================
656
657struct Cluster {
658 Cluster() : center(0), count(0) {}
659 Cluster(int cen, int num) : center(cen), count(num) {}
660
661 int center; // The center of the cluster.
662 int count; // The number of entries within the cluster.
663};
664
 665class SimpleClusterer {
 666 public:
667 explicit SimpleClusterer(int max_cluster_width)
668 : max_cluster_width_(max_cluster_width) {}
669 void Add(int value) { values_.push_back(value); }
670 int size() const { return values_.size(); }
671 void GetClusters(GenericVector<Cluster> *clusters);
672
673 private:
674 int max_cluster_width_;
 675 GenericVector<int> values_;
 676};
677
678// Return the index of the cluster closest to value.
679static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
680 int best_index = 0;
681 for (int i = 0; i < clusters.size(); i++) {
682 if (abs(value - clusters[i].center) <
683 abs(value - clusters[best_index].center))
684 best_index = i;
685 }
686 return best_index;
687}
688
 689void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
 690 clusters->clear();
691 values_.sort();
692 for (int i = 0; i < values_.size();) {
693 int orig_i = i;
694 int lo = values_[i];
695 int hi = lo;
696 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
697 hi = values_[i];
698 }
699 clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
700 }
701}
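// Example: with max_cluster_width_ = 8, the values {0, 2, 5, 40, 43} produce
// two clusters, one centered at 2 (count 3) and one centered at 41 (count 2).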
702
703// Calculate left- and right-indent tab stop values seen in
704// rows[row_start, row_end) given a tolerance of tolerance.
705static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
706 int row_start, int row_end, int tolerance,
707 GenericVector<Cluster> *left_tabs,
708 GenericVector<Cluster> *right_tabs) {
709 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
710 return;
711 // First pass: toss all left and right indents into clusterers.
712 SimpleClusterer initial_lefts(tolerance);
713 SimpleClusterer initial_rights(tolerance);
714 GenericVector<Cluster> initial_left_tabs;
715 GenericVector<Cluster> initial_right_tabs;
716 for (int i = row_start; i < row_end; i++) {
717 initial_lefts.Add((*rows)[i].lindent_);
718 initial_rights.Add((*rows)[i].rindent_);
719 }
720 initial_lefts.GetClusters(&initial_left_tabs);
721 initial_rights.GetClusters(&initial_right_tabs);
722
723 // Second pass: cluster only lines that are not "stray"
724 // An example of a stray line is a page number -- a line whose start
725 // and end tab-stops are far outside the typical start and end tab-stops
726 // for the block.
727 // Put another way, we only cluster data from lines whose start or end
728 // tab stop is frequent.
729 SimpleClusterer lefts(tolerance);
730 SimpleClusterer rights(tolerance);
731
732 // Outlier elimination. We might want to switch this to test outlier-ness
733 // based on how strange a position an outlier is in instead of or in addition
734 // to how rare it is. These outliers get re-added if we end up having too
 735 // few tab stops to work with, however.
736 int infrequent_enough_to_ignore = 0;
737 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
738 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
739
740 for (int i = row_start; i < row_end; i++) {
741 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
742 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
743 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
744 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
745 lefts.Add((*rows)[i].lindent_);
746 rights.Add((*rows)[i].rindent_);
747 }
748 }
749 lefts.GetClusters(left_tabs);
750 rights.GetClusters(right_tabs);
751
752 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
753 (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
754 // One side is really ragged, and the other only has one tab stop,
755 // so those "insignificant outliers" are probably important, actually.
756 // This often happens on a page of an index. Add back in the ones
757 // we omitted in the first pass.
758 for (int i = row_start; i < row_end; i++) {
759 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
760 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
761 if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
762 initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
763 lefts.Add((*rows)[i].lindent_);
764 rights.Add((*rows)[i].rindent_);
765 }
766 }
767 }
768 lefts.GetClusters(left_tabs);
769 rights.GetClusters(right_tabs);
770
771 // If one side is almost a two-indent aligned side, and the other clearly
772 // isn't, try to prune out the least frequent tab stop from that side.
773 if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
774 int to_prune = -1;
775 for (int i = left_tabs->size() - 1; i >= 0; i--) {
776 if (to_prune < 0 ||
777 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
778 to_prune = i;
779 }
780 }
781 if (to_prune >= 0 &&
782 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
783 left_tabs->remove(to_prune);
784 }
785 }
786 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
787 int to_prune = -1;
788 for (int i = right_tabs->size() - 1; i >= 0; i--) {
789 if (to_prune < 0 ||
790 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
791 to_prune = i;
792 }
793 }
794 if (to_prune >= 0 &&
795 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
796 right_tabs->remove(to_prune);
797 }
798 }
799}
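// In short: cluster every indent, re-cluster using only rows whose indents hit
// a frequent tab stop, add the rare ones back if the result looks like an index
// page, and finally prune a rare third stop from a side that otherwise looks
// like a clean two-indent (first line / body) side.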
800
 801// Given a paragraph model, mark rows[row_start, row_end) as said model
802// start or body lines.
803//
804// Case 1: model->first_indent_ != model->body_indent_
805// Differentiating the paragraph start lines from the paragraph body lines in
806// this case is easy, we just see how far each line is indented.
807//
808// Case 2: model->first_indent_ == model->body_indent_
809// Here, we find end-of-paragraph lines by looking for "short lines."
 810// What constitutes a "short line" changes depending on whether the text is
 811// ragged-right[left] or fully justified (aligned left and right).
812//
813// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
 814// We have a new paragraph if the first word would have fit at the end
 815// of the previous line.
816//
817// Case 2b: Fully Justified. (eop_threshold > 0)
818// We mark a line as short (end of paragraph) if the offside indent
819// is greater than eop_threshold.
820static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
821 int row_start, int row_end,
822 const ParagraphModel *model,
823 bool ltr, int eop_threshold) {
824 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
825 return;
826 for (int row = row_start; row < row_end; row++) {
827 bool valid_first = ValidFirstLine(rows, row, model);
828 bool valid_body = ValidBodyLine(rows, row, model);
829 if (valid_first && !valid_body) {
830 (*rows)[row].AddStartLine(model);
831 } else if (valid_body && !valid_first) {
832 (*rows)[row].AddBodyLine(model);
833 } else if (valid_body && valid_first) {
834 bool after_eop = (row == row_start);
835 if (row > row_start) {
836 if (eop_threshold > 0) {
837 if (model->justification() == JUSTIFICATION_LEFT) {
838 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
839 } else {
840 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
841 }
842 } else {
843 after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
844 model->justification());
845 }
846 }
847 if (after_eop) {
848 (*rows)[row].AddStartLine(model);
849 } else {
850 (*rows)[row].AddBodyLine(model);
851 }
852 } else {
853 // Do nothing. Stray row.
854 }
855 }
856}
857
858// GeometricClassifierState holds all of the information we'll use while
859// trying to determine a paragraph model for the text lines in a block of
860// text:
861// + the rows under consideration [row_start, row_end)
862// + the common left- and right-indent tab stops
863// + does the block start out left-to-right or right-to-left
864// Further, this struct holds the data we amass for the (single) ParagraphModel
865// we'll assign to the text lines (assuming we get that far).
 866struct GeometricClassifierState {
 867 GeometricClassifierState(int dbg_level,
 868 GenericVector<RowScratchRegisters> *r,
 869 int r_start, int r_end)
870 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
871 tolerance = InterwordSpace(*r, r_start, r_end);
 872 CalculateTabStops(r, r_start, r_end, tolerance,
 873 &left_tabs, &right_tabs);
874 if (debug_level >= 3) {
875 tprintf("Geometry: TabStop cluster tolerance = %d; "
876 "%d left tabs; %d right tabs\n",
877 tolerance, left_tabs.size(), right_tabs.size());
878 }
879 ltr = (*r)[r_start].ri_->ltr;
880 }
 881
 882 void AssumeLeftJustification() {
 883 just = tesseract::JUSTIFICATION_LEFT;
 884 margin = (*rows)[row_start].lmargin_;
 885 }
 886
 887 void AssumeRightJustification() {
 888 just = tesseract::JUSTIFICATION_RIGHT;
 889 margin = (*rows)[row_start].rmargin_;
 890 }
891
892 // Align tabs are the tab stops the text is aligned to.
 893 const GenericVector<Cluster> &AlignTabs() const {
 894 if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
 895 return left_tabs;
896 }
897
898 // Offside tabs are the tab stops opposite the tabs used to align the text.
899 //
900 // Note that for a left-to-right text which is aligned to the right such as
901 // this function comment, the offside tabs are the horizontal tab stops
902 // marking the beginning of ("Note", "this" and "marking").
 903 const GenericVector<Cluster> &OffsideTabs() const {
 904 if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
 905 return right_tabs;
906 }
907
908 // Return whether the i'th row extends from the leftmost left tab stop
 909 // to the rightmost right tab stop.
910 bool IsFullRow(int i) const {
911 return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
912 ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
913 }
914
915 int AlignsideTabIndex(int row_idx) const {
916 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
917 }
918
919 // Given what we know about the paragraph justification (just), would the
920 // first word of row_b have fit at the end of row_a?
921 bool FirstWordWouldHaveFit(int row_a, int row_b) {
922 return ::tesseract::FirstWordWouldHaveFit(
923 (*rows)[row_a], (*rows)[row_b], just);
924 }
925
926 void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
927
928 void Fail(int min_debug_level, const char *why) const {
929 if (debug_level < min_debug_level) return;
930 tprintf("# %s\n", why);
931 PrintRows();
932 }
933
 934 ParagraphModel Model() const {
 935 return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
 936 }
937
938 // We print out messages with a debug level at least as great as debug_level.
939 int debug_level = 0;
940
941 // The Geometric Classifier was asked to find a single paragraph model
942 // to fit the text rows (*rows)[row_start, row_end)
 943 GenericVector<RowScratchRegisters> *rows = nullptr;
 944 int row_start = 0;
945 int row_end = 0;
946
947 // The amount by which we expect the text edge can vary and still be aligned.
948 int tolerance = 0;
949
950 // Is the script in this text block left-to-right?
951 // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
952 bool ltr = false;
953
954 // These left and right tab stops were determined to be the common tab
955 // stops for the given text.
 956 GenericVector<Cluster> left_tabs;
 957 GenericVector<Cluster> right_tabs;
 958
959 // These are parameters we must determine to create a ParagraphModel.
 960 tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
 961 int margin = 0;
 962 int first_indent = 0;
963 int body_indent = 0;
964
965 // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
 966 int eop_threshold = 0;
 967};
968
969// Given a section of text where strong textual clues did not help identifying
970// paragraph breaks, and for which the left and right indents have exactly
971// three tab stops between them, attempt to find the paragraph breaks based
972// solely on the outline of the text and whether the script is left-to-right.
973//
974// Algorithm Detail:
975// The selected rows are in the form of a rectangle except
976// for some number of "short lines" of the same length:
977//
978// (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
979// xxxxxxxxxxx xxxxxxxxxx # A "short" line.
980// xxxxxxxxxxxxx xxxxxxxxxxxx
981// xxxxxxxxxxxxx xxxxxxxxxxxx
982//
983// We have a slightly different situation if the only short
984// line is at the end of the excerpt.
985//
986// (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
987// xxxxxxxxxxxxx xxxxxxxxxxxx
988// xxxxxxxxxxxxx xxxxxxxxxxxx
989// xxxxxxxxxxx xxxxxxxxxx # A "short" line.
990//
991// We'll interpret these as follows based on the reasoning in the comment for
992// GeometricClassify():
993// [script direction: first indent, body indent]
994// (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
995// (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
996static void GeometricClassifyThreeTabStopTextBlock(
997 int debug_level,
 998 GeometricClassifierState &s,
 999 ParagraphTheory *theory) {
1000 int num_rows = s.row_end - s.row_start;
1001 int num_full_rows = 0;
1002 int last_row_full = 0;
1003 for (int i = s.row_start; i < s.row_end; i++) {
1004 if (s.IsFullRow(i)) {
1005 num_full_rows++;
1006 if (i == s.row_end - 1) last_row_full++;
1007 }
1008 }
1009
1010 if (num_full_rows < 0.7 * num_rows) {
1011 s.Fail(1, "Not enough full lines to know which lines start paras.");
1012 return;
1013 }
1014
1015 // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1016 s.eop_threshold = 0;
1017
1018 if (s.ltr) {
1019 s.AssumeLeftJustification();
1020 } else {
1021 s.AssumeRightJustification();
1022 }
1023
1024 if (debug_level > 0) {
1025 tprintf("# Not enough variety for clear outline classification. "
1026 "Guessing these are %s aligned based on script.\n",
1027 s.ltr ? "left" : "right");
1028 s.PrintRows();
1029 }
1030
1031 if (s.AlignTabs().size() == 2) { // case A1 or A2
1032 s.first_indent = s.AlignTabs()[1].center;
1033 s.body_indent = s.AlignTabs()[0].center;
1034 } else { // case B1 or B2
1035 if (num_rows - 1 == num_full_rows - last_row_full) {
1036 // case B2
1037 const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1038 (*s.rows)[s.row_start].AddStartLine(model);
1039 for (int i = s.row_start + 1; i < s.row_end; i++) {
1040 (*s.rows)[i].AddBodyLine(model);
1041 }
1042 return;
1043 } else {
1044 // case B1
1045 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1046 s.eop_threshold = (s.OffsideTabs()[0].center +
1047 s.OffsideTabs()[1].center) / 2;
1048 }
1049 }
1050 const ParagraphModel *model = theory->AddModel(s.Model());
1051 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
1052 s.ltr, s.eop_threshold);
1053 return;
1054}
1055
1056// This function is called if strong textual clues were not available, but
1057// the caller hopes that the paragraph breaks will be super obvious just
1058// by the outline of the text.
1059//
1060// The particularly difficult case is figuring out what's going on if you
1061// don't have enough short paragraph-end lines to go on.
1062//
1063// For instance, let's say you have the following outline:
1064//
1065// (A1) xxxxxxxxxxxxxxxxxxxxxx
1066// xxxxxxxxxxxxxxxxxxxx
1067// xxxxxxxxxxxxxxxxxxxxxx
1068// xxxxxxxxxxxxxxxxxxxxxx
1069//
1070// Even if we know that the text is left-to-right and so will probably be
1071// left-aligned, both of the following are possible texts:
1072//
1073// (A1a) 1. Here our list item
1074// with two full lines.
1075// 2. Here a second item.
1076// 3. Here our third one.
1077//
1078// (A1b) so ends paragraph one.
1079// Here starts another
1080// paragraph we want to
1081// read. This continues
1082//
1083// These examples are obvious from the text and should have been caught
1084// by the StrongEvidenceClassify pass. However, for languages where we don't
1085// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1086// it's worth guessing that (A1b) is the correct interpretation if there are
1087// far more "full" lines than "short" lines.
1088static void GeometricClassify(int debug_level,
1089 GenericVector<RowScratchRegisters> *rows,
1090 int row_start, int row_end,
1091 ParagraphTheory *theory) {
1092 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1093 return;
1094 if (debug_level > 1) {
1095 tprintf("###############################################\n");
1096 tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1097 row_start, row_end);
1098 tprintf("###############################################\n");
1099 }
1100 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1101
1102 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1103 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1104 s.Fail(2, "Too much variety for simple outline classification.");
1105 return;
1106 }
1107 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1108 s.Fail(1, "Not enough variety for simple outline classification.");
1109 return;
1110 }
1111 if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1112 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1113 return;
1114 }
1115
1116 // At this point, we know that one side has at least two tab stops, and the
1117 // other side has one or two tab stops.
1118 // Left to determine:
1119 // (1) Which is the body indent and which is the first line indent?
1120 // (2) Is the text fully justified?
1121
1122 // If one side happens to have three or more tab stops, assume that side
1123 // is opposite of the aligned side.
1124 if (s.right_tabs.size() > 2) {
1125 s.AssumeLeftJustification();
1126 } else if (s.left_tabs.size() > 2) {
1127 s.AssumeRightJustification();
1128 } else if (s.ltr) { // guess based on script direction
1129 s.AssumeLeftJustification();
1130 } else {
1131 s.AssumeRightJustification();
1132 }
1133
1134 if (s.AlignTabs().size() == 2) {
1135 // For each tab stop on the aligned side, how many of them appear
1136 // to be paragraph start lines? [first lines]
1137 int firsts[2] = {0, 0};
1138 // Count the first line as a likely paragraph start line.
1139 firsts[s.AlignsideTabIndex(s.row_start)]++;
1140 // For each line, if the first word would have fit on the previous
1141 // line count it as a likely paragraph start line.
1142 bool jam_packed = true;
1143 for (int i = s.row_start + 1; i < s.row_end; i++) {
1144 if (s.FirstWordWouldHaveFit(i - 1, i)) {
1145 firsts[s.AlignsideTabIndex(i)]++;
1146 jam_packed = false;
1147 }
1148 }
1149 // Make an extra accounting for the last line of the paragraph just
1150 // in case it's the only short line in the block. That is, take its
1151 // first word as typical and see if this looks like the *last* line
1152 // of a paragraph. If so, mark the *other* indent as probably a first.
1153 if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1154 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1155 }
1156
1157 int percent0firsts, percent1firsts;
1158 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1159 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1160
1161 // TODO(eger): Tune these constants if necessary.
1162 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1163 percent0firsts + 30 < percent1firsts) {
1164 s.first_indent = s.AlignTabs()[1].center;
1165 s.body_indent = s.AlignTabs()[0].center;
1166 } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1167 percent1firsts + 30 < percent0firsts) {
1168 s.first_indent = s.AlignTabs()[0].center;
1169 s.body_indent = s.AlignTabs()[1].center;
1170 } else {
1171 // Ambiguous! Probably lineated (poetry)
1172 if (debug_level > 1) {
1173 tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1174 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1175 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1176 s.AlignTabs()[0].center, percent0firsts);
1177 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1178 s.AlignTabs()[1].center, percent1firsts);
1179 s.PrintRows();
1180 }
1181 return;
1182 }
1183 } else {
1184 // There's only one tab stop for the "aligned to" side.
1185 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1186 }
1187
1188 // At this point, we have our model.
1189 const ParagraphModel *model = theory->AddModel(s.Model());
1190
1191 // Now all we have to do is figure out if the text is fully justified or not.
1192 // eop_threshold: default to fully justified unless we see evidence below.
1193 // See description on MarkRowsWithModel()
1194 s.eop_threshold =
1195 (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1196 // If the text is not fully justified, re-set the eop_threshold to 0.
1197 if (s.AlignTabs().size() == 2) {
1198 // Paragraphs with a paragraph-start indent.
1199 for (int i = s.row_start; i < s.row_end - 1; i++) {
1200 if (ValidFirstLine(s.rows, i + 1, model) &&
1201 !NearlyEqual(s.OffsideTabs()[0].center,
1202 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1203 // We found a non-end-of-paragraph short line: not fully justified.
1204 s.eop_threshold = 0;
1205 break;
1206 }
1207 }
1208 } else {
1209 // Paragraphs with no paragraph-start indent.
1210 for (int i = s.row_start; i < s.row_end - 1; i++) {
1211 if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1212 !NearlyEqual(s.OffsideTabs()[0].center,
1213 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1214 // We found a non-end-of-paragraph short line: not fully justified.
1215 s.eop_threshold = 0;
1216 break;
1217 }
1218 }
1219 }
1220 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1221}
1222
1223// =============== Implementation of ParagraphTheory =====================
1224
1226 for (int i = 0; i < models_->size(); i++) {
1227 if ((*models_)[i]->Comparable(model))
1228 return (*models_)[i];
1229 }
1230 auto *m = new ParagraphModel(model);
1231 models_->push_back(m);
1232 models_we_added_.push_back_new(m);
1233 return m;
1234}
1235
1236void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1237 for (int i = models_->size() - 1; i >= 0; i--) {
1238 ParagraphModel *m = (*models_)[i];
1239 if (!used_models.contains(m) && models_we_added_.contains(m)) {
1240 models_->remove(i);
1241 models_we_added_.remove(models_we_added_.get_index(m));
1242 delete m;
1243 }
1244 }
1245}
1246
1247// Examine rows[start, end) and try to determine if an existing non-centered
1248// paragraph model would fit them perfectly. If so, return a pointer to it.
1249// If not, return nullptr.
1250const ParagraphModel *ParagraphTheory::Fits(
1251 const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
1252 for (int m = 0; m < models_->size(); m++) {
1253 const ParagraphModel *model = (*models_)[m];
1254 if (model->justification() != JUSTIFICATION_CENTER &&
1255 RowsFitModel(rows, start, end, model))
1256 return model;
1257 }
1258 return nullptr;
1259}
1260
1261void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1262 for (int m = 0; m < models_->size(); m++) {
1263 const ParagraphModel *model = (*models_)[m];
1264 if (model->justification() != JUSTIFICATION_CENTER)
1265 models->push_back_new(model);
1266 }
1267}
1268
1269int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1270 for (int i = 0; i < models_->size(); i++) {
1271 if ((*models_)[i] == model)
1272 return i;
1273 }
1274 return -1;
1275}
1276
1277bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
1278 int row, const ParagraphModel *model) {
1279 if (!StrongModel(model)) {
1280 tprintf("ValidFirstLine() should only be called with strong models!\n");
1281 }
1282 return StrongModel(model) &&
1283 model->ValidFirstLine(
1284 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1285 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1286}
1287
1288bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
1289 int row, const ParagraphModel *model) {
1290 if (!StrongModel(model)) {
1291 tprintf("ValidBodyLine() should only be called with strong models!\n");
1292 }
1293 return StrongModel(model) &&
1294 model->ValidBodyLine(
1295 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1296 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1297}
1298
1299bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
1300 int a, int b, const ParagraphModel *model) {
1301 if (model != kCrownRight && model != kCrownLeft) {
1302 tprintf("CrownCompatible() should only be called with crown models!\n");
1303 return false;
1304 }
1305 RowScratchRegisters &row_a = (*rows)[a];
1306 RowScratchRegisters &row_b = (*rows)[b];
1307 if (model == kCrownRight) {
1308 return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1309 row_b.rindent_ + row_b.rmargin_,
1310 Epsilon(row_a.ri_->average_interword_space));
1311 }
1312 return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1313 row_b.lindent_ + row_b.lmargin_,
1314 Epsilon(row_a.ri_->average_interword_space));
1315}
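// In other words, two rows are "crown compatible" when their flush edges
// (margin plus indent on the aligned side) differ by less than roughly 80% of
// the first row's average interword space.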
1316
1317
1318// =============== Implementation of ParagraphModelSmearer ====================
1319
1320ParagraphModelSmearer::ParagraphModelSmearer(
1321 GenericVector<RowScratchRegisters> *rows,
1322 int row_start, int row_end, ParagraphTheory *theory)
1323 : theory_(theory), rows_(rows), row_start_(row_start),
1324 row_end_(row_end) {
1325 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1326 row_start_ = 0;
1327 row_end_ = 0;
1328 return;
1329 }
1330 SetOfModels no_models;
1331 for (int row = row_start - 1; row <= row_end; row++) {
1332 open_models_.push_back(no_models);
1333 }
1334}
1335
1336// see paragraphs_internal.h
1337void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1338 SetOfModels no_models;
1339 if (row_start < row_start_) row_start = row_start_;
1340 if (row_end > row_end_) row_end = row_end_;
1341
1342 for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1343 row++) {
1344 if ((*rows_)[row].ri_->num_words == 0) {
1345 OpenModels(row + 1) = no_models;
1346 } else {
1347 SetOfModels &opened = OpenModels(row);
1348 (*rows_)[row].StartHypotheses(&opened);
1349
1350 // Which models survive the transition from row to row + 1?
1351 SetOfModels still_open;
1352 for (int m = 0; m < opened.size(); m++) {
1353 if (ValidFirstLine(rows_, row, opened[m]) ||
1354 ValidBodyLine(rows_, row, opened[m])) {
1355 // This is basic filtering; whether a row actually looks like a likely
1356 // paragraph start (e.g., whether the first word would have fit on the
1357 // previous line) is checked below in Smear().
1358 still_open.push_back_new(opened[m]);
1359 }
1360 }
1361 OpenModels(row + 1) = still_open;
1362 }
1363 }
1364}
1365
1366// see paragraphs_internal.h
1367void ParagraphModelSmearer::Smear() {
1368 CalculateOpenModels(row_start_, row_end_);
1369
1370 // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1371 // we have multiple LT_START hypotheses), see if there's a model that
1372 // was recently used (an "open" model) which might model it well.
1373 for (int i = row_start_; i < row_end_; i++) {
1374 RowScratchRegisters &row = (*rows_)[i];
1375 if (row.ri_->num_words == 0)
1376 continue;
1377
1378 // Step One:
1379 // Figure out if there are "open" models which are left-aligned or
1380 // right-aligned. This is important for determining whether the
1381 // "first" word in a row would fit at the "end" of the previous row.
1382 bool left_align_open = false;
1383 bool right_align_open = false;
1384 for (int m = 0; m < OpenModels(i).size(); m++) {
1385 switch (OpenModels(i)[m]->justification()) {
1386 case JUSTIFICATION_LEFT: left_align_open = true; break;
1387 case JUSTIFICATION_RIGHT: right_align_open = true; break;
1388 default: left_align_open = right_align_open = true;
1389 }
1390 }
1391 // Step Two:
1392 // Use that knowledge to figure out if this row is likely to
1393 // start a paragraph.
1394 bool likely_start;
1395 if (i == 0) {
1396 likely_start = true;
1397 } else {
1398 if ((left_align_open && right_align_open) ||
1399 (!left_align_open && !right_align_open)) {
1400 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1401 JUSTIFICATION_LEFT) ||
1402 LikelyParagraphStart((*rows_)[i - 1], row,
1403 JUSTIFICATION_RIGHT);
1404 } else if (left_align_open) {
1405 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1406 JUSTIFICATION_LEFT);
1407 } else {
1408 likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1409 JUSTIFICATION_RIGHT);
1410 }
1411 }
1412
1413 // Step Three:
1414 // If this text line seems like an obvious first line of an
1415 // open model, or an obvious continuation of an existing
1416 // modelled paragraph, mark it up.
1417 if (likely_start) {
1418 // Add Start Hypotheses for all Open models that fit.
1419 for (int m = 0; m < OpenModels(i).size(); m++) {
1420 if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1421 row.AddStartLine(OpenModels(i)[m]);
1422 }
1423 }
1424 } else {
1425 // Add relevant body line hypotheses.
1426 SetOfModels last_line_models;
1427 if (i > 0) {
1428 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1429 } else {
1430 theory_->NonCenteredModels(&last_line_models);
1431 }
1432 for (int m = 0; m < last_line_models.size(); m++) {
1433 const ParagraphModel *model = last_line_models[m];
1434 if (ValidBodyLine(rows_, i, model))
1435 row.AddBodyLine(model);
1436 }
1437 }
1438
1439 // Step Four:
1440 // If we're still quite unsure about this line, go through all
1441 // models in our theory and see if this row could be the start
1442 // of any of our models.
1443 if (row.GetLineType() == LT_UNKNOWN ||
1444 (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1445 SetOfModels all_models;
1446 theory_->NonCenteredModels(&all_models);
1447 for (int m = 0; m < all_models.size(); m++) {
1448 if (ValidFirstLine(rows_, i, all_models[m])) {
1449 row.AddStartLine(all_models[m]);
1450 }
1451 }
1452 }
1453 // Step Five:
1454 // Since we may have updated the hypotheses about this row, we need
1455 // to recalculate the Open models for the rest of rows[i + 1, row_end)
1456 if (row.GetLineType() != LT_UNKNOWN) {
1457 CalculateOpenModels(i + 1, row_end_);
1458 }
1459 }
1460}
1461
1462// ================ Main Paragraph Detection Algorithm =======================
1463
1464// Find out what ParagraphModels are actually used, and discard any
1465// that are not.
1466static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
1467 ParagraphTheory *theory) {
1468 SetOfModels used_models;
1469 for (int i = 0; i < rows.size(); i++) {
1470 rows[i].StrongHypotheses(&used_models);
1471 }
1472 theory->DiscardUnusedModels(used_models);
1473}
1474
1475// DowngradeWeakestToCrowns:
1476// Forget any flush-{left, right} models unless we see two or more
1477// of them in sequence.
1478//
1479// In pass 3, we start to classify even flush-left paragraphs (paragraphs
1480// where the first line and body indent are the same) as having proper Models.
1481// This is generally dangerous, since if you start imagining that flush-left
1482// is a typical paragraph model when it is not, it will lead you to chop normal
1483// indented paragraphs in the middle whenever a sentence happens to start on a
1484// new line (see "This" above). What to do?
1485// What we do is to take any paragraph which is flush left and is not
1486// preceded by another paragraph of the same model and convert it to a "Crown"
1487// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1488// for later. It means that the paragraph is flush, but it would be desirable
1489// to mark it as the same model as following text if it fits. This downgrade
1490// path, FlushLeft -> CrownLeft -> Model of following paragraph, means that we
1491// avoid making flush-left Paragraph Models whenever we see a top-of-the-page
1492// half-of-a-paragraph, and instead mark it the same as normal body text.
1493//
1494// Implementation:
1495//
1496// Comb backwards through the row scratch registers, and turn any
1497// sequences of body lines of equivalent type abutted against the beginning
1498// or a body or start line of a different type into a crown paragraph.
1499static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1500 GenericVector<RowScratchRegisters> *rows) {
1501 int start;
1502 for (int end = rows->size(); end > 0; end = start) {
1503 // Search back for a body line of a unique type.
1504 const ParagraphModel *model = nullptr;
1505 while (end > 0 &&
1506 (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1507 end--;
1508 }
1509 if (end == 0) break;
1510 start = end - 1;
1511 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1512 start--; // walk back to the first line that is not the same body type.
1513 }
1514 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1515 StrongModel(model) &&
1516 NearlyEqual(model->first_indent(), model->body_indent(),
1517 model->tolerance())) {
1518 start--;
1519 }
1520 start++;
1521 // Now rows[start, end) is a sequence of unique body hypotheses of model.
1522 if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1523 continue;
1524 if (!StrongModel(model)) {
1525 while (start > 0 &&
1526 CrownCompatible(rows, start - 1, start, model))
1527 start--;
1528 }
1529 if (start == 0 ||
1530 (!StrongModel(model)) ||
1531 (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1532 // crownify rows[start, end)
1533 const ParagraphModel *crown_model = model;
1534 if (StrongModel(model)) {
1535 if (model->justification() == JUSTIFICATION_LEFT)
1536 crown_model = kCrownLeft;
1537 else
1538 crown_model = kCrownRight;
1539 }
1540 (*rows)[start].SetUnknown();
1541 (*rows)[start].AddStartLine(crown_model);
1542 for (int row = start + 1; row < end; row++) {
1543 (*rows)[row].SetUnknown();
1544 (*rows)[row].AddBodyLine(crown_model);
1545 }
1546 }
1547 }
1548 DiscardUnusedModels(*rows, theory);
1549}
1550
1551
1552// Clear all hypotheses about lines [start, end) and reset margins.
1553//
1554// The empty space between the left of a row and the block boundary (and
1555// similarly for the right) is split into two pieces: margin and indent.
1556// In initial processing, we assume the block is tight and the margin for
1557// all lines is set to zero. However, if our first pass does not yield
1558// models for everything, it may be due to an inset paragraph like a
1559// block-quote. In that case, we make a second pass over that unmarked
1560// section of the page and reset the "margin" portion of the empty space
1561// to the common amount of space at the ends of the lines under consid-
1562// eration. This would be equivalent to percentile set to 0. However,
1563// sometimes we have a single character sticking out in the right margin
1564// of a text block (like the 'r' in 'for' on line 3 above), and we can
1565// really just ignore it as an outlier. To express this, we allow the
1566// user to specify the percentile (0..100) of indent values to use as
1567// the common margin for each row in the run of rows[start, end).
1568void RecomputeMarginsAndClearHypotheses(
1569 GenericVector<RowScratchRegisters> *rows, int start, int end,
1570 int percentile) {
1571 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1572 return;
1573
1574 int lmin, lmax, rmin, rmax;
1575 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1576 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1577 for (int i = start; i < end; i++) {
1578 RowScratchRegisters &sr = (*rows)[i];
1579 sr.SetUnknown();
1580 if (sr.ri_->num_words == 0)
1581 continue;
1582 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1583 UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1584 }
1585 STATS lefts(lmin, lmax + 1);
1586 STATS rights(rmin, rmax + 1);
1587 for (int i = start; i < end; i++) {
1588 RowScratchRegisters &sr = (*rows)[i];
1589 if (sr.ri_->num_words == 0)
1590 continue;
1591 lefts.add(sr.lmargin_ + sr.lindent_, 1);
1592 rights.add(sr.rmargin_ + sr.rindent_, 1);
1593 }
1594 int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1595 int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1596 for (int i = start; i < end; i++) {
1597 RowScratchRegisters &sr = (*rows)[i];
1598 int ldelta = ignorable_left - sr.lmargin_;
1599 sr.lmargin_ += ldelta;
1600 sr.lindent_ -= ldelta;
1601 int rdelta = ignorable_right - sr.rmargin_;
1602 sr.rmargin_ += rdelta;
1603 sr.rindent_ -= rdelta;
1604 }
1605}
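// For example, if most rows in the run start about 10 px from the block edge
// but one stray row starts 60 px in, a low percentile makes ~10 px the shared
// lmargin_ for every row, leaving lindent_ near zero for the typical rows and
// about 50 px for the stray one.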
1606
1607// Return the median inter-word space in rows[row_start, row_end).
1608int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
1609 int row_start, int row_end) {
1610 if (row_end < row_start + 1) return 1;
1611 int word_height = (rows[row_start].ri_->lword_box.height() +
1612 rows[row_end - 1].ri_->lword_box.height()) / 2;
1613 int word_width = (rows[row_start].ri_->lword_box.width() +
1614 rows[row_end - 1].ri_->lword_box.width()) / 2;
1615 STATS spacing_widths(0, 5 + word_width);
1616 for (int i = row_start; i < row_end; i++) {
1617 if (rows[i].ri_->num_words > 1) {
1618 spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1619 }
1620 }
1621 int minimum_reasonable_space = word_height / 3;
1622 if (minimum_reasonable_space < 2)
1623 minimum_reasonable_space = 2;
1624 int median = spacing_widths.median();
1625 return (median > minimum_reasonable_space)
1626 ? median : minimum_reasonable_space;
1627}
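// For example, if the rows report a median interword space of 4 pixels but the
// words are about 30 pixels tall, the returned spacing is max(4, 30 / 3) = 10.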
1628
1629// Return whether the first word on the after line can fit in the space at
1630// the end of the before line (knowing which way the text is aligned and read).
1631bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1632 const RowScratchRegisters &after,
1633 tesseract::ParagraphJustification justification) {
1634 if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1635 return true;
1636
1637 if (justification == JUSTIFICATION_UNKNOWN) {
1638 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1639 }
1640 int available_space;
1641 if (justification == JUSTIFICATION_CENTER) {
1642 available_space = before.lindent_ + before.rindent_;
1643 } else {
1644 available_space = before.OffsideIndent(justification);
1645 }
1646 available_space -= before.ri_->average_interword_space;
1647
1648 if (before.ri_->ltr)
1649 return after.ri_->lword_box.width() < available_space;
1650 return after.ri_->rword_box.width() < available_space;
1651}
1652
1653// Return whether the first word on the after line can fit in the space at
1654// the end of the before line (not knowing which way the text goes) in a left
1655// or right alignment.
1656static bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
1657 const RowScratchRegisters &after) {
1658 if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1659 return true;
1660
1661 int available_space = before.lindent_;
1662 if (before.rindent_ > available_space)
1663 available_space = before.rindent_;
1664 available_space -= before.ri_->average_interword_space;
1665
1666 if (before.ri_->ltr)
1667 return after.ri_->lword_box.width() < available_space;
1668 return after.ri_->rword_box.width() < available_space;
1669}
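// Since the justification is not known here, the more generous of the two
// indents is taken as the candidate space, i.e. the test passes if the word
// would have fit under either a left-aligned or a right-aligned reading.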
1670
1671static bool TextSupportsBreak(const RowScratchRegisters &before,
1672 const RowScratchRegisters &after) {
1673 if (before.ri_->ltr) {
1674 return before.ri_->rword_likely_ends_idea &&
1675 after.ri_->lword_likely_starts_idea;
1676 } else {
1677 return before.ri_->lword_likely_ends_idea &&
1678 after.ri_->rword_likely_starts_idea;
1679 }
1680}
1681
1682static bool LikelyParagraphStart(const RowScratchRegisters &before,
1683 const RowScratchRegisters &after,
1684 tesseract::ParagraphJustification j) {
1685 return before.ri_->num_words == 0 ||
1686 (FirstWordWouldHaveFit(before, after, j) &&
1687 TextSupportsBreak(before, after));
1688}
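// In other words, a row likely starts a paragraph if the row above it is
// empty, or if (a) its first word would have fit at the end of the row above,
// so the line break looks deliberate, and (b) the row above ends the way
// sentences end while this row begins the way sentences begin.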
1689
1690// Examine rows[start, end) and try to determine what sort of ParagraphModel
1691// would fit them as a single paragraph.
1692// If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
1693// If the rows given could be a consistent start to a paragraph, set
1694// *consistent to true.
1695static ParagraphModel InternalParagraphModelByOutline(
1696 const GenericVector<RowScratchRegisters> *rows,
1697 int start, int end, int tolerance, bool *consistent) {
1698 int ltr_line_count = 0;
1699 for (int i = start; i < end; i++) {
1700 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1701 }
1702 bool ltr = (ltr_line_count >= (end - start) / 2);
1703
1704 *consistent = true;
1705 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1706 return ParagraphModel();
1707
1708 // Ensure the caller only passed us a region with a common rmargin and
1709 // lmargin.
1710 int lmargin = (*rows)[start].lmargin_;
1711 int rmargin = (*rows)[start].rmargin_;
1712 int lmin, lmax, rmin, rmax, cmin, cmax;
1713 lmin = lmax = (*rows)[start + 1].lindent_;
1714 rmin = rmax = (*rows)[start + 1].rindent_;
1715 cmin = cmax = 0;
1716 for (int i = start + 1; i < end; i++) {
1717 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1718 tprintf("Margins don't match! Software error.\n");
1719 *consistent = false;
1720 return ParagraphModel();
1721 }
1722 UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1723 UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1724 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1725 }
1726 int ldiff = lmax - lmin;
1727 int rdiff = rmax - rmin;
1728 int cdiff = cmax - cmin;
1729 if (rdiff > tolerance && ldiff > tolerance) {
1730 if (cdiff < tolerance * 2) {
1731 if (end - start < 3)
1732 return ParagraphModel();
1733 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1734 }
1735 *consistent = false;
1736 return ParagraphModel();
1737 }
1738 if (end - start < 3) // Don't return a model for two line paras.
1739 return ParagraphModel();
1740
1741 // These booleans keep us from saying something is aligned left when the body
1742 // left variance is too large.
1743 bool body_admits_left_alignment = ldiff < tolerance;
1744 bool body_admits_right_alignment = rdiff < tolerance;
1745
1746 ParagraphModel left_model =
1747 ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1748 (lmin + lmax) / 2, tolerance);
1749 ParagraphModel right_model =
1750 ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1751 (rmin + rmax) / 2, tolerance);
1752
1753 // These booleans keep us from having an indent on the "wrong side" for the
1754 // first line.
1755 bool text_admits_left_alignment = ltr || left_model.is_flush();
1756 bool text_admits_right_alignment = !ltr || right_model.is_flush();
1757
1758 // At least one of the edges is less than tolerance in variance.
1759 // If the other is obviously ragged, it can't be the one aligned to.
1760 // [Note the last line is included in this raggedness.]
1761 if (tolerance < rdiff) {
1762 if (body_admits_left_alignment && text_admits_left_alignment)
1763 return left_model;
1764 *consistent = false;
1765 return ParagraphModel();
1766 }
1767 if (tolerance < ldiff) {
1768 if (body_admits_right_alignment && text_admits_right_alignment)
1769 return right_model;
1770 *consistent = false;
1771 return ParagraphModel();
1772 }
1773
1774 // At this point, we know the body text doesn't vary much on either side.
1775
1776 // If the first line juts out oddly in one direction or the other,
1777 // that likely indicates the side aligned to.
1778 int first_left = (*rows)[start].lindent_;
1779 int first_right = (*rows)[start].rindent_;
1780
1781 if (ltr && body_admits_left_alignment &&
1782 (first_left < lmin || first_left > lmax))
1783 return left_model;
1784 if (!ltr && body_admits_right_alignment &&
1785 (first_right < rmin || first_right > rmax))
1786 return right_model;
1787
1788 *consistent = false;
1789 return ParagraphModel();
1790}
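// Illustrative example (hypothetical numbers): with tolerance = 8, a run of
// four left-to-right rows whose body lines have left indents {0, 1, 0}
// (ldiff = 1) and right indents {3, 27, 55} (rdiff = 52) only varies on the
// right, so the run comes back as a JUSTIFICATION_LEFT model. If both sides
// had varied by more than 8 while the difference rindent_ - lindent_ stayed
// within 2 * 8 = 16, the run would come back as JUSTIFICATION_CENTER instead.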
1791
1792// Examine rows[start, end) and try to determine what sort of ParagraphModel
1793// would fit them as a single paragraph. If nothing fits,
1794// the returned model has justification_ = JUSTIFICATION_UNKNOWN, and the
1795// paragraph is printed to debug output if we're debugging.
1796static ParagraphModel ParagraphModelByOutline(
1797 int debug_level,
1798 const GenericVector<RowScratchRegisters> *rows,
1799 int start, int end, int tolerance) {
1800 bool unused_consistent;
1801 ParagraphModel retval = InternalParagraphModelByOutline(
1802 rows, start, end, tolerance, &unused_consistent);
1803 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1804 tprintf("Could not determine a model for this paragraph:\n");
1805 PrintRowRange(*rows, start, end);
1806 }
1807 return retval;
1808}
1809
1810// Do rows[start, end) form a single instance of the given paragraph model?
1811bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
1812 int start, int end, const ParagraphModel *model) {
1813 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1814 return false;
1815 if (!ValidFirstLine(rows, start, model)) return false;
1816 for (int i = start + 1 ; i < end; i++) {
1817 if (!ValidBodyLine(rows, i, model)) return false;
1818 }
1819 return true;
1820}
1821
1822// Examine rows[row_start, row_end) as an independent section of text,
1823// and mark rows that are exceptionally clear as start-of-paragraph
1824// and paragraph-body lines.
1825//
1826// We presume that any lines surrounding rows[row_start, row_end) may
1827// have wildly different paragraph models, so we don't key any data off
1828// of those lines.
1829//
1830// We only take the very strongest signals, as we don't want to get
1831// confused and mark up centered text, poetry, or source code as
1832// clearly part of a typical paragraph.
1833static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
1834 int row_start, int row_end) {
1835 // Record patently obvious body text.
1836 for (int i = row_start + 1; i < row_end; i++) {
1837 const RowScratchRegisters &prev = (*rows)[i - 1];
1838 RowScratchRegisters &curr = (*rows)[i];
1839 tesseract::ParagraphJustification typical_justification =
1840 prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1841 if (!curr.ri_->rword_likely_starts_idea &&
1842 !curr.ri_->lword_likely_starts_idea &&
1843 !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1844 curr.SetBodyLine();
1845 }
1846 }
1847
1848 // Record patently obvious start paragraph lines.
1849 //
1850 // It's an extremely good signal of the start of a paragraph that
1851 // the first word would have fit on the end of the previous line.
1852 // However, applying just that signal would have us mark random
1853 // start lines of lineated text (poetry and source code) and some
1854 // centered headings as paragraph start lines. Therefore, we use
1855 // a second qualification for a paragraph start: Not only should
1856 // the first word of this line have fit on the previous line,
1857 // but also, this line should go full to the right of the block,
1858 // disallowing a subsequent word from having fit on this line.
1859
1860 // First row:
1861 {
1862 RowScratchRegisters &curr = (*rows)[row_start];
1863 RowScratchRegisters &next = (*rows)[row_start + 1];
1864 tesseract::ParagraphJustification j =
1865 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1866 if (curr.GetLineType() == LT_UNKNOWN &&
1867 !FirstWordWouldHaveFit(curr, next, j) &&
1868 (curr.ri_->lword_likely_starts_idea ||
1869 curr.ri_->rword_likely_starts_idea)) {
1870 curr.SetStartLine();
1871 }
1872 }
1873 // Middle rows
1874 for (int i = row_start + 1; i < row_end - 1; i++) {
1875 RowScratchRegisters &prev = (*rows)[i - 1];
1876 RowScratchRegisters &curr = (*rows)[i];
1877 RowScratchRegisters &next = (*rows)[i + 1];
1878 tesseract::ParagraphJustification j =
1879 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1880 if (curr.GetLineType() == LT_UNKNOWN &&
1881 !FirstWordWouldHaveFit(curr, next, j) &&
1882 LikelyParagraphStart(prev, curr, j)) {
1883 curr.SetStartLine();
1884 }
1885 }
1886 // Last row
1887 { // the short circuit at the top means we have at least two lines.
1888 RowScratchRegisters &prev = (*rows)[row_end - 2];
1889 RowScratchRegisters &curr = (*rows)[row_end - 1];
1890 tesseract::ParagraphJustification j =
1891 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1892 if (curr.GetLineType() == LT_UNKNOWN &&
1893 !FirstWordWouldHaveFit(curr, curr, j) &&
1894 LikelyParagraphStart(prev, curr, j)) {
1895 curr.SetStartLine();
1896 }
1897 }
1898}
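// Putting the two signals together: in a left-to-right block a still-unmarked
// middle row is set to LT_START here only if (a) its own text runs essentially
// to the right edge, so the following row's first word could not have fit on
// it, and (b) relative to the previous row it looks like a deliberate start:
// the previous row is empty, or it ended short of the edge and closed an idea
// while this row opens one. Rows that pass neither this test nor the
// body-line test above stay LT_UNKNOWN for later passes.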
1899
1900// Look for sequences of a start line followed by some body lines in
1901// rows[row_start, row_end) and create ParagraphModels for them if
1902// they seem coherent.
1903static void ModelStrongEvidence(int debug_level,
1904 GenericVector<RowScratchRegisters> *rows,
1905 int row_start, int row_end,
1906 bool allow_flush_models,
1907 ParagraphTheory *theory) {
1908 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1909 return;
1910
1911 int start = row_start;
1912 while (start < row_end) {
1913 while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1914 start++;
1915 if (start >= row_end - 1)
1916 break;
1917
1918 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1919 int end = start;
1920 ParagraphModel last_model;
1921 bool next_consistent;
1922 do {
1923 ++end;
1924 // rows[row, end) was consistent.
1925 // If rows[row, end + 1) is not consistent,
1926 // just model rows[row, end)
1927 if (end < row_end - 1) {
1928 RowScratchRegisters &next = (*rows)[end];
1929 LineType lt = next.GetLineType();
1930 next_consistent = lt == LT_BODY ||
1931 (lt == LT_UNKNOWN &&
1932 !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1933 } else {
1934 next_consistent = false;
1935 }
1936 if (next_consistent) {
1937 ParagraphModel next_model = InternalParagraphModelByOutline(
1938 rows, start, end + 1, tolerance, &next_consistent);
1939 if (((*rows)[start].ri_->ltr &&
1940 last_model.justification() == JUSTIFICATION_LEFT &&
1941 next_model.justification() != JUSTIFICATION_LEFT) ||
1942 (!(*rows)[start].ri_->ltr &&
1943 last_model.justification() == JUSTIFICATION_RIGHT &&
1944 next_model.justification() != JUSTIFICATION_RIGHT)) {
1945 next_consistent = false;
1946 }
1947 last_model = next_model;
1948 } else {
1949 next_consistent = false;
1950 }
1951 } while (next_consistent && end < row_end);
1952 // At this point, rows[start, end) looked like it could have been a
1953 // single paragraph. If we can make a good ParagraphModel for it,
1954 // do so and mark this sequence with that model.
1955 if (end > start + 1) {
1956 // emit a new paragraph if we have more than one line.
1957 const ParagraphModel *model = nullptr;
1958 ParagraphModel new_model = ParagraphModelByOutline(
1959 debug_level, rows, start, end,
1960 Epsilon(InterwordSpace(*rows, start, end)));
1961 if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1962 // couldn't create a good model, oh well.
1963 } else if (new_model.is_flush()) {
1964 if (end == start + 2) {
1965 // It's very likely we just got two paragraph starts in a row.
1966 end = start + 1;
1967 } else if (start == row_start) {
1968 // Mark this as a Crown.
1969 if (new_model.justification() == JUSTIFICATION_LEFT) {
1970 model = kCrownLeft;
1971 } else {
1972 model = kCrownRight;
1973 }
1974 } else if (allow_flush_models) {
1975 model = theory->AddModel(new_model);
1976 }
1977 } else {
1978 model = theory->AddModel(new_model);
1979 }
1980 if (model) {
1981 (*rows)[start].AddStartLine(model);
1982 for (int i = start + 1; i < end; i++) {
1983 (*rows)[i].AddBodyLine(model);
1984 }
1985 }
1986 }
1987 start = end;
1988 }
1989}
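// Note on the flush-model cases above: a flush run (one whose first line is
// not indented relative to its body) that begins at the very first row of the
// region is tagged with the placeholder kCrownLeft / kCrownRight rather than
// a committed model, unless it is only two lines long (treated as two
// separate starts); the crown handling in
// ConvertHypothesizedModelRunsToParagraphs() below decides what such a run
// really is.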
1990
1991// We examine rows[row_start, row_end) and do the following:
1992// (1) Clear all existing hypotheses for the rows being considered.
1993// (2) Mark up any rows as exceptionally likely to be paragraph starts
1994// or paragraph body lines as such using both geometric and textual
1995// clues.
1996// (3) Form models for any sequence of start + continuation lines.
1997// (4) Smear the paragraph models to cover surrounding text.
1998static void StrongEvidenceClassify(int debug_level,
1999 GenericVector<RowScratchRegisters> *rows,
2000 int row_start, int row_end,
2001 ParagraphTheory *theory) {
2002 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
2003 return;
2004
2005 if (debug_level > 1) {
2006 tprintf("#############################################\n");
2007 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2008 tprintf("#############################################\n");
2009 }
2010
2011 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2012 MarkStrongEvidence(rows, row_start, row_end);
2013
2014 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2015
2016 // Create paragraph models.
2017 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2018
2019 DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
2020
2021 // At this point, some rows are marked up as paragraphs with model numbers,
2022 // and some rows are marked up as either LT_START or LT_BODY. Now let's
2023 // smear any good paragraph hypotheses forward and backward.
2024 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2025 smearer.Smear();
2026}
2027
2028static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
2029 int row_start, int row_end,
2030 ParagraphTheory *theory) {
2031 for (int i = row_start + 1; i < row_end - 1; i++) {
2032 if ((*rows)[i - 1].ri_->has_leaders &&
2033 (*rows)[i].ri_->has_leaders &&
2034 (*rows)[i + 1].ri_->has_leaders) {
2035 const ParagraphModel *model = theory->AddModel(
2037 (*rows)[i].AddStartLine(model);
2038 }
2039 }
2040}
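// Illustrative example: in a table of contents such as
//   Chapter 1 ..................... 7
//   Chapter 2 .................... 23
//   Chapter 3 .................... 41
// all three rows have leaders, so the middle row ("Chapter 2") gets its own
// start-line hypothesis instead of being merged with its neighbours.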
2041
2042// Collect sequences of unique hypotheses in row registers and create proper
2043// paragraphs for them, referencing the paragraphs in row_owners.
2044static void ConvertHypothesizedModelRunsToParagraphs(
2045 int debug_level,
2046 GenericVector<RowScratchRegisters> &rows,
2047 GenericVector<PARA *> *row_owners,
2048 ParagraphTheory *theory) {
2049 int end = rows.size();
2050 int start;
2051 for (; end > 0; end = start) {
2052 start = end - 1;
2053 const ParagraphModel *model = nullptr;
2054 // TODO(eger): Be smarter about dealing with multiple hypotheses.
2055 bool single_line_paragraph = false;
2056 SetOfModels models;
2057 rows[start].NonNullHypotheses(&models);
2058 if (!models.empty()) {
2059 model = models[0];
2060 if (rows[start].GetLineType(model) != LT_BODY)
2061 single_line_paragraph = true;
2062 }
2063 if (model && !single_line_paragraph) {
2064 // walk back looking for more body lines and then a start line.
2065 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2066 // do nothing
2067 }
2068 if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2069 model = nullptr;
2070 }
2071 }
2072 if (model == nullptr) {
2073 continue;
2074 }
2075 // rows[start, end) should be a paragraph.
2076 PARA *p = new PARA();
2077 if (model == kCrownLeft || model == kCrownRight) {
2078 p->is_very_first_or_continuation = true;
2079 // Crown paragraph.
2080 // If we can find an existing ParagraphModel that fits, use it,
2081 // else create a new one.
2082 for (int row = end; row < rows.size(); row++) {
2083 if ((*row_owners)[row] &&
2084 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2085 (start == 0 ||
2086 ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2087 model = (*row_owners)[row]->model;
2088 break;
2089 }
2090 }
2091 if (model == kCrownLeft) {
2092 // No subsequent model fits, so cons one up.
2093 model = theory->AddModel(ParagraphModel(
2094 JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2095 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2096 } else if (model == kCrownRight) {
2097 // No subsequent model fits, so cons one up.
2098 model = theory->AddModel(ParagraphModel(
2099 JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
2100 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2101 }
2102 }
2103 rows[start].SetUnknown();
2104 rows[start].AddStartLine(model);
2105 for (int i = start + 1; i < end; i++) {
2106 rows[i].SetUnknown();
2107 rows[i].AddBodyLine(model);
2108 }
2109 p->model = model;
2110 p->has_drop_cap = rows[start].ri_->has_drop_cap;
2111 p->is_list_item =
2112 model->justification() == JUSTIFICATION_RIGHT
2113 ? rows[start].ri_->rword_indicates_list_item
2114 : rows[start].ri_->lword_indicates_list_item;
2115 for (int row = start; row < end; row++) {
2116 if ((*row_owners)[row] != nullptr) {
2117 tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() called "
2118 "more than once!\n");
2119 delete (*row_owners)[row];
2120 }
2121 (*row_owners)[row] = p;
2122 }
2123 }
2124}
2125
2126struct Interval {
2127 Interval() : begin(0), end(0) {}
2128 Interval(int b, int e) : begin(b), end(e) {}
2129
2130 int begin;
2131 int end;
2132};
2133
2134// Return whether rows[row] appears to be stranded, meaning that the evidence
2135// for this row is very weak due to context. For instance, two lines of source
2136// code may happen to be indented at the same tab vector as body text starts,
2137// leading us to think they are two start-of-paragraph lines. This is not
2138// optimal. However, we also don't want to mark a sequence of short dialog
2139// as "weak," so our heuristic is:
2140// (1) If a line is surrounded by lines of unknown type, it's weak.
2141// (2) If two lines in a row are start lines for a given paragraph type, but
2142// after that the same paragraph type does not continue, they're weak.
2143static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows,
2144 int row) {
2145 SetOfModels row_models;
2146 rows[row].StrongHypotheses(&row_models);
2147
2148 for (int m = 0; m < row_models.size(); m++) {
2149 bool all_starts = rows[row].GetLineType(row_models[m]) == LT_START;
2150 int run_length = 1;
2151 bool continues = true;
2152 for (int i = row - 1; i >= 0 && continues; i--) {
2153 SetOfModels models;
2154 rows[i].NonNullHypotheses(&models);
2155 switch (rows[i].GetLineType(row_models[m])) {
2156 case LT_START: run_length++; break;
2157 case LT_MULTIPLE: // explicit fall-through
2158 case LT_BODY: run_length++; all_starts = false; break;
2159 case LT_UNKNOWN: // explicit fall-through
2160 default: continues = false;
2161 }
2162 }
2163 continues = true;
2164 for (int i = row + 1; i < rows.size() && continues; i++) {
2165 SetOfModels models;
2166 rows[i].NonNullHypotheses(&models);
2167 switch (rows[i].GetLineType(row_models[m])) {
2168 case LT_START: run_length++; break;
2169 case LT_MULTIPLE: // explicit fall-through
2170 case LT_BODY: run_length++; all_starts = false; break;
2171 case LT_UNKNOWN: // explicit fall-through
2172 default: continues = false;
2173 }
2174 }
2175 if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2176 }
2177 return true;
2178}
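// Illustrative example: two adjacent source-code lines that happen to sit at
// the same indent as body text can each pick up a start-line hypothesis for
// the body-text model. Scanning up and down from either of them, the run of
// rows sharing that model has length 2 and consists only of start lines, so
// neither "run_length > 2" nor "(!all_starts && run_length > 1)" holds and
// both rows are reported as stranded.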
2179
2180// Go through rows[row_start, row_end) and gather up sequences that need better
2181// classification.
2182// + Sequences of non-empty rows without hypotheses.
2183// + Crown paragraphs not immediately followed by a strongly modeled line.
2184// + Single line paragraphs surrounded by text that doesn't match the
2185// model.
2186static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
2187 GenericVector<Interval> *to_fix,
2188 int row_start, int row_end) {
2189 to_fix->clear();
2190 for (int i = row_start; i < row_end; i++) {
2191 bool needs_fixing = false;
2192
2193 SetOfModels models;
2194 SetOfModels models_w_crowns;
2195 rows[i].StrongHypotheses(&models);
2196 rows[i].NonNullHypotheses(&models_w_crowns);
2197 if (models.empty() && !models_w_crowns.empty()) {
2198 // Crown paragraph. Is it followed by a modeled line?
2199 for (int end = i + 1; end < rows.size(); end++) {
2200 SetOfModels end_models;
2201 SetOfModels strong_end_models;
2202 rows[end].NonNullHypotheses(&end_models);
2203 rows[end].StrongHypotheses(&strong_end_models);
2204 if (end_models.empty()) {
2205 needs_fixing = true;
2206 break;
2207 } else if (!strong_end_models.empty()) {
2208 needs_fixing = false;
2209 break;
2210 }
2211 }
2212 } else if (models.empty() && rows[i].ri_->num_words > 0) {
2213 // No models at all.
2214 needs_fixing = true;
2215 }
2216
2217 if (!needs_fixing && !models.empty()) {
2218 needs_fixing = RowIsStranded(rows, i);
2219 }
2220
2221 if (needs_fixing) {
2222 if (!to_fix->empty() && to_fix->back().end == i - 1)
2223 to_fix->back().end = i;
2224 else
2225 to_fix->push_back(Interval(i, i));
2226 }
2227 }
2228 // Convert inclusive intervals to half-open intervals.
2229 for (int i = 0; i < to_fix->size(); i++) {
2230 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2231 }
2232}
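// Illustrative example: if rows 4, 5 and 9 need fixing, the loop above first
// records the inclusive intervals [4, 5] and [9, 9]; the final pass then turns
// them into the half-open intervals [4, 6) and [9, 10) that callers iterate
// over.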
2233
2234// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2235// normalize each row_owner to point to an actual PARA, and output the
2236// paragraphs in order onto paragraphs.
2237void CanonicalizeDetectionResults(
2238 GenericVector<PARA *> *row_owners,
2239 PARA_LIST *paragraphs) {
2240 GenericVector<PARA *> &rows = *row_owners;
2241 paragraphs->clear();
2242 PARA_IT out(paragraphs);
2243 PARA *formerly_null = nullptr;
2244 for (int i = 0; i < rows.size(); i++) {
2245 if (rows[i] == nullptr) {
2246 if (i == 0 || rows[i - 1] != formerly_null) {
2247 rows[i] = formerly_null = new PARA();
2248 } else {
2249 rows[i] = formerly_null;
2250 continue;
2251 }
2252 } else if (i > 0 && rows[i - 1] == rows[i]) {
2253 continue;
2254 }
2255 out.add_after_then_move(rows[i]);
2256 }
2257}
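// Illustrative example: given row_owners {A, nullptr, nullptr, B, B}, the two
// orphaned rows are assigned one fresh PARA between A and B, so the output
// list becomes {A, new PARA, B} and every row ends up pointing at a real
// paragraph.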
2258
2259// Main entry point for Paragraph Detection Algorithm.
2260//
2261// Given a set of equally spaced textlines (described by row_infos),
2262// split them into paragraphs.
2263//
2264// Output:
2265// row_owners - one pointer for each row, to the paragraph it belongs to.
2266// paragraphs - this is the actual list of PARA objects.
2267// models - the list of paragraph models referenced by the PARA objects.
2268// caller is responsible for deleting the models.
2269void DetectParagraphs(int debug_level,
2270 GenericVector<RowInfo> *row_infos,
2271 GenericVector<PARA *> *row_owners,
2272 PARA_LIST *paragraphs,
2273 GenericVector<ParagraphModel *> *models) {
2274 GenericVector<RowScratchRegisters> rows;
2275 ParagraphTheory theory(models);
2276
2277 // Initialize row_owners to be a bunch of nullptr pointers.
2278 row_owners->init_to_size(row_infos->size(), nullptr);
2279
2280 // Set up row scratch registers for the main algorithm.
2281 rows.init_to_size(row_infos->size(), RowScratchRegisters());
2282 for (int i = 0; i < row_infos->size(); i++) {
2283 rows[i].Init((*row_infos)[i]);
2284 }
2285
2286 // Pass 1:
2287 // Detect sequences of lines that all contain leader dots (.....)
2288 // These are likely Tables of Contents. If there are three text lines in
2289 // a row with leader dots, it's pretty safe to say the middle one should
2290 // be a paragraph of its own.
2291 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2292
2293 DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2294
2295 GenericVector<Interval> leftovers;
2296 LeftoverSegments(rows, &leftovers, 0, rows.size());
2297 for (int i = 0; i < leftovers.size(); i++) {
2298 // Pass 2a:
2299 // Find any strongly evidenced start-of-paragraph lines. If they're
2300 // followed by two lines that look like body lines, make a paragraph
2301 // model for that and see if that model applies throughout the text
2302 // (that is, "smear" it).
2303 StrongEvidenceClassify(debug_level, &rows,
2304 leftovers[i].begin, leftovers[i].end, &theory);
2305
2306 // Pass 2b:
2307// If we had any luck in pass 2a, we may have modeled part of the page but
2308// still not know how to classify a few runs of rows. Take the segments
2309// that didn't find a model and reprocess them individually.
2310 GenericVector<Interval> leftovers2;
2311 LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2312 bool pass2a_was_useful = leftovers2.size() > 1 ||
2313 (leftovers2.size() == 1 &&
2314 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2315 if (pass2a_was_useful) {
2316 for (int j = 0; j < leftovers2.size(); j++) {
2317 StrongEvidenceClassify(debug_level, &rows,
2318 leftovers2[j].begin, leftovers2[j].end,
2319 &theory);
2320 }
2321 }
2322 }
2323
2324 DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2325
2326 // Pass 3:
2327 // These are the dregs for which we didn't have enough strong textual
2328// and geometric clues to form matching models. Let's see if
2329 // the geometric clues are simple enough that we could just use those.
2330 LeftoverSegments(rows, &leftovers, 0, rows.size());
2331 for (int i = 0; i < leftovers.size(); i++) {
2332 GeometricClassify(debug_level, &rows,
2333 leftovers[i].begin, leftovers[i].end, &theory);
2334 }
2335
2336 // Undo any flush models for which there's little evidence.
2337 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2338
2339 DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2340
2341 // Pass 4:
2342 // Take everything that's still not marked up well and clear all markings.
2343 LeftoverSegments(rows, &leftovers, 0, rows.size());
2344 for (int i = 0; i < leftovers.size(); i++) {
2345 for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2346 rows[j].SetUnknown();
2347 }
2348 }
2349
2350 DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2351
2352 // Convert all of the unique hypothesis runs to PARAs.
2353 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2354 &theory);
2355
2356 DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2357
2358 // Finally, clean up any dangling nullptr row paragraph parents.
2359 CanonicalizeDetectionResults(row_owners, paragraphs);
2360}
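// A minimal calling sketch (hypothetical caller; the wrapper at the bottom of
// this file is the real Tesseract call site):
//
//   GenericVector<RowInfo> row_infos;   // one RowInfo per textline, filled in
//   GenericVector<PARA *> row_owners;
//   PARA_LIST paragraphs;
//   GenericVector<ParagraphModel *> models;
//   DetectParagraphs(0, &row_infos, &row_owners, &paragraphs, &models);
//   // ... use row_owners / paragraphs ...
//   // The caller owns the ParagraphModel pointers left in models and must
//   // delete them when done.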
2361
2362// ============ Code interfacing with the rest of Tesseract ==================
2363
2364static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
2365 RowInfo *info) {
2366 // Set up text, lword_text, and rword_text (mostly for debug printing).
2367 STRING fake_text;
2368 PageIterator pit(static_cast<const PageIterator&>(it));
2369 bool first_word = true;
2370 if (!pit.Empty(RIL_WORD)) {
2371 do {
2372 fake_text += "x";
2373 if (first_word) info->lword_text += "x";
2374 info->rword_text += "x";
2375 if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2376 !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2377 fake_text += " ";
2378 info->rword_text = "";
2379 first_word = false;
2380 }
2381 } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2382 pit.Next(RIL_SYMBOL));
2383 }
2384 if (fake_text.size() == 0) return;
2385
2386 int lspaces = info->pix_ldistance / info->average_interword_space;
2387 for (int i = 0; i < lspaces; i++) {
2388 info->text += ' ';
2389 }
2390 info->text += fake_text;
2391
2392 // Set up lword_box, rword_box, and num_words.
2393 PAGE_RES_IT page_res_it = *it.PageResIt();
2394 WERD_RES *word_res = page_res_it.restart_row();
2395 ROW_RES *this_row = page_res_it.row();
2396
2397 WERD_RES *lword = nullptr;
2398 WERD_RES *rword = nullptr;
2399 info->num_words = 0;
2400 do {
2401 if (word_res) {
2402 if (!lword) lword = word_res;
2403 if (rword != word_res) info->num_words++;
2404 rword = word_res;
2405 }
2406 word_res = page_res_it.forward();
2407 } while (page_res_it.row() == this_row);
2408
2409 if (lword) info->lword_box = lword->word->bounding_box();
2410 if (rword) info->rword_box = rword->word->bounding_box();
2411}
2412
2413
2414// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2415// detector RowInfo with all relevant information from the row.
2416static void InitializeRowInfo(bool after_recognition,
2417 const MutableIterator &it, RowInfo *info) {
2418 if (it.PageResIt()->row() != nullptr) {
2419 ROW *row = it.PageResIt()->row()->row;
2420 info->pix_ldistance = row->lmargin();
2421 info->pix_rdistance = row->rmargin();
2422 info->average_interword_space =
2423 row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2424 info->pix_xheight = row->x_height();
2425 info->has_leaders = false;
2426 info->has_drop_cap = row->has_drop_cap();
2427 info->ltr = true; // set below depending on word scripts
2428 } else {
2429 info->pix_ldistance = info->pix_rdistance = 0;
2430 info->average_interword_space = 1;
2431 info->pix_xheight = 1.0;
2432 info->has_leaders = false;
2433 info->has_drop_cap = false;
2434 info->ltr = true;
2435 }
2436
2437 info->num_words = 0;
2438 info->lword_indicates_list_item = false;
2439 info->lword_likely_starts_idea = false;
2440 info->lword_likely_ends_idea = false;
2441 info->rword_indicates_list_item = false;
2442 info->rword_likely_starts_idea = false;
2443 info->rword_likely_ends_idea = false;
2444 info->has_leaders = false;
2445 info->ltr = true;
2446
2447 if (!after_recognition) {
2448 InitializeTextAndBoxesPreRecognition(it, info);
2449 return;
2450 }
2451 info->text = "";
2452 const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2453 int trailing_ws_idx = strlen(text.get()); // strip trailing space
2454 while (trailing_ws_idx > 0 &&
2455 // isspace() only takes ASCII
2456 isascii(text[trailing_ws_idx - 1]) &&
2457 isspace(text[trailing_ws_idx - 1]))
2458 trailing_ws_idx--;
2459 if (trailing_ws_idx > 0) {
2460 int lspaces = info->pix_ldistance / info->average_interword_space;
2461 for (int i = 0; i < lspaces; i++)
2462 info->text += ' ';
2463 for (int i = 0; i < trailing_ws_idx; i++)
2464 info->text += text[i];
2465 }
2466
2467 if (info->text.size() == 0) {
2468 return;
2469 }
2470
2471 PAGE_RES_IT page_res_it = *it.PageResIt();
2472 GenericVector<WERD_RES *> werds;
2473 WERD_RES *word_res = page_res_it.restart_row();
2474 ROW_RES *this_row = page_res_it.row();
2475 int num_leaders = 0;
2476 int ltr = 0;
2477 int rtl = 0;
2478 do {
2479 if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2480 werds.push_back(word_res);
2481 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2482 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2483 if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2484 }
2485 word_res = page_res_it.forward();
2486 } while (page_res_it.row() == this_row);
2487 info->ltr = ltr >= rtl;
2488 info->has_leaders = num_leaders > 3;
2489 info->num_words = werds.size();
2490 if (!werds.empty()) {
2491 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2492 info->lword_text = lword->best_choice->unichar_string().string();
2493 info->rword_text = rword->best_choice->unichar_string().string();
2494 info->lword_box = lword->word->bounding_box();
2495 info->rword_box = rword->word->bounding_box();
2496 LeftWordAttributes(lword->uch_set, lword->best_choice,
2497 info->lword_text,
2498 &info->lword_indicates_list_item,
2499 &info->lword_likely_starts_idea,
2500 &info->lword_likely_ends_idea);
2501 RightWordAttributes(rword->uch_set, rword->best_choice,
2502 info->rword_text,
2503 &info->rword_indicates_list_item,
2504 &info->rword_likely_starts_idea,
2505 &info->rword_likely_ends_idea);
2506 }
2507}
2508
2509// This is called after rows have been identified and words are recognized.
2510// Much of this could be implemented before word recognition, but text helps
2511// to identify bulleted lists and gives good signals for sentence boundaries.
2512void DetectParagraphs(int debug_level,
2513 bool after_text_recognition,
2514 const MutableIterator *block_start,
2515 GenericVector<ParagraphModel *> *models) {
2516 // Clear out any preconceived notions.
2517 if (block_start->Empty(RIL_TEXTLINE)) {
2518 return;
2519 }
2520 BLOCK *block = block_start->PageResIt()->block()->block;
2521 block->para_list()->clear();
2522 bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2523
2524 // Convert the Tesseract structures to RowInfos
2525 // for the paragraph detection algorithm.
2526 MutableIterator row(*block_start);
2527 if (row.Empty(RIL_TEXTLINE))
2528 return; // end of input already.
2529
2530 GenericVector<RowInfo> row_infos;
2531 do {
2532 if (!row.PageResIt()->row())
2533 continue; // empty row.
2534 row.PageResIt()->row()->row->set_para(nullptr);
2535 row_infos.push_back(RowInfo());
2536 RowInfo &ri = row_infos.back();
2537 InitializeRowInfo(after_text_recognition, row, &ri);
2538 } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2539 row.Next(RIL_TEXTLINE));
2540
2541 // If we're called before text recognition, we might not have
2542 // tight block bounding boxes, so trim by the minimum on each side.
2543 if (!row_infos.empty()) {
2544 int min_lmargin = row_infos[0].pix_ldistance;
2545 int min_rmargin = row_infos[0].pix_rdistance;
2546 for (int i = 1; i < row_infos.size(); i++) {
2547 if (row_infos[i].pix_ldistance < min_lmargin)
2548 min_lmargin = row_infos[i].pix_ldistance;
2549 if (row_infos[i].pix_rdistance < min_rmargin)
2550 min_rmargin = row_infos[i].pix_rdistance;
2551 }
2552 if (min_lmargin > 0 || min_rmargin > 0) {
2553 for (int i = 0; i < row_infos.size(); i++) {
2554 row_infos[i].pix_ldistance -= min_lmargin;
2555 row_infos[i].pix_rdistance -= min_rmargin;
2556 }
2557 }
2558 }
2559
2560 // Run the paragraph detection algorithm.
2561 GenericVector<PARA *> row_owners;
2562 GenericVector<PARA *> the_paragraphs;
2563 if (!is_image_block) {
2564 DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2565 models);
2566 } else {
2567 row_owners.init_to_size(row_infos.size(), nullptr);
2568 CanonicalizeDetectionResults(&row_owners, block->para_list());
2569 }
2570
2571 // Now stitch in the row_owners into the rows.
2572 row = *block_start;
2573 for (int i = 0; i < row_owners.size(); i++) {
2574 while (!row.PageResIt()->row())
2575 row.Next(RIL_TEXTLINE);
2576 row.PageResIt()->row()->row->set_para(row_owners[i]);
2577 row.Next(RIL_TEXTLINE);
2578 }
2579}
2580
2581} // namespace tesseract
Definition: unicharset.cpp:291