tesseract 4.1.1
Loading...
Searching...
No Matches
paragraphs_internal.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: paragraphs_internal.h
3 * Description: Paragraph Detection internal data structures.
4 * Author: David Eger
5 *
6 * (C) Copyright 2011, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
20#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21
22#include "paragraphs.h"
23#include "publictypes.h" // for ParagraphJustification
24
25// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
26// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
27
28class UNICHARSET;
29class WERD_CHOICE;
30
31namespace tesseract {
32
33// Return whether the given word is likely to be a list item start word.
34bool AsciiLikelyListItem(const STRING &word);
35
36// Return the first Unicode Codepoint from werd[pos].
37int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
38
39// Set right word attributes given either a unicharset and werd or a utf8
40// string.
41void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
42 const STRING &utf8,
43 bool *is_list, bool *starts_idea, bool *ends_idea);
44
45// Set left word attributes given either a unicharset and werd or a utf8 string.
46void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
47 const STRING &utf8,
48 bool *is_list, bool *starts_idea, bool *ends_idea);
49
51 LT_START = 'S', // First line of a paragraph.
52 LT_BODY = 'C', // Continuation line of a paragraph.
53 LT_UNKNOWN = 'U', // No clues.
54 LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
55};
56
57// The first paragraph in a page of body text is often un-indented.
58// This is a typographic convention which is common to indicate either that:
59// (1) The paragraph is the continuation of a previous paragraph, or
60// (2) The paragraph is the first paragraph in a chapter.
61//
62// I refer to such paragraphs as "crown"s, and the output of the paragraph
63// detection algorithm attempts to give them the same paragraph model as
64// the rest of the body text.
65//
66// Nonetheless, while building hypotheses, it is useful to mark the lines
67// of crown paragraphs temporarily as crowns, either aligned left or right.
68extern const ParagraphModel *kCrownLeft;
69extern const ParagraphModel *kCrownRight;
70
71inline bool StrongModel(const ParagraphModel *model) {
72 return model != nullptr && model != kCrownLeft && model != kCrownRight;
73}
74
78 : ty(line_type), model(m) {}
80 : ty(other.ty), model(other.model) {}
81
82 // Copy assignment operator.
84 ty = other.ty;
85 model = other.model;
86 return *this;
87 }
88
89 bool operator==(const LineHypothesis &other) const {
90 return ty == other.ty && model == other.model;
91 }
92
95};
96
97class ParagraphTheory; // Forward Declaration
98
100
101// Row Scratch Registers are data generated by the paragraph detection
102// algorithm based on a RowInfo input.
104 public:
105 // We presume row will outlive us.
106 void Init(const RowInfo &row);
107
108 LineType GetLineType() const;
109
110 LineType GetLineType(const ParagraphModel *model) const;
111
112 // Mark this as a start line type, sans model. This is useful for the
113 // initial marking of probable body lines or paragraph start lines.
114 void SetStartLine();
115
116 // Mark this as a body line type, sans model. This is useful for the
117 // initial marking of probable body lines or paragraph start lines.
118 void SetBodyLine();
119
120 // Record that this row fits as a paragraph start line in the given model.
121 void AddStartLine(const ParagraphModel *model);
122 // Record that this row fits as a paragraph body line in the given model.
123 void AddBodyLine(const ParagraphModel *model);
124
125 // Clear all hypotheses about this line.
126 void SetUnknown() { hypotheses_.truncate(0); }
127
128 // Append all hypotheses of strong models that match this row as a start.
129 void StartHypotheses(SetOfModels *models) const;
130
131 // Append all hypotheses of strong models matching this row.
132 void StrongHypotheses(SetOfModels *models) const;
133
134 // Append all hypotheses for this row.
135 void NonNullHypotheses(SetOfModels *models) const;
136
137 // Discard any hypotheses whose model is not in the given list.
138 void DiscardNonMatchingHypotheses(const SetOfModels &models);
139
140 // If we have only one hypothesis and that is that this line is a paragraph
141 // start line of a certain model, return that model. Else return nullptr.
143
144 // If we have only one hypothesis and that is that this line is a paragraph
145 // body line of a certain model, return that model. Else return nullptr.
147
148 // Return the indentation for the side opposite of the aligned side.
150 switch (just) {
153 default: return lindent_ > rindent_ ? lindent_ : rindent_;
154 }
155 }
156
157 // Return the indentation for the side the text is aligned to.
159 switch (just) {
162 default: return lindent_ > rindent_ ? lindent_ : rindent_;
163 }
164 }
165
166 // Append header fields to a vector of row headings.
168
169 // Append data for this row to a vector of debug strings.
170 void AppendDebugInfo(const ParagraphTheory &theory,
171 GenericVector<STRING> *dbg) const;
172
173 const RowInfo *ri_;
174
175 // These four values form a horizontal box model for the white space
176 // on the edges of each line. At each point in the algorithm, the following
177 // shall hold:
178 // ri_->pix_ldistance = lmargin_ + lindent_
179 // ri_->pix_rdistance = rindent_ + rmargin_
184
185 private:
186 // Hypotheses of either LT_START or LT_BODY
188};
189
190// A collection of convenience functions for wrapping the set of
191// Paragraph Models we believe correctly model the paragraphs in the image.
193 public:
194 // We presume models will outlive us, and that models will take ownership
195 // of any ParagraphModel *'s we add.
197 : models_(models) {}
199 const GenericVector<ParagraphModel *> &models() const { return *models_; }
200
201 // Return an existing model if one that is Comparable() can be found.
202 // Else, allocate a new copy of model to save and return a pointer to it.
203 const ParagraphModel *AddModel(const ParagraphModel &model);
204
205 // Discard any models we've made that are not in the list of used models.
206 void DiscardUnusedModels(const SetOfModels &used_models);
207
208 // Return the set of all non-centered models.
210
211 // If any of the non-centered paragraph models we know about fit
212 // rows[start, end), return it. Else nullptr.
214 int start, int end) const;
215
216 int IndexOf(const ParagraphModel *model) const;
217
218 private:
221};
222
224 int row, const ParagraphModel *model);
226 int row, const ParagraphModel *model);
228 int a, int b, const ParagraphModel *model);
229
230// A class for smearing Paragraph Model hypotheses to surrounding rows.
231// The idea here is that StrongEvidenceClassify first marks only exceedingly
232// obvious start and body rows and constructs models of them. Thereafter,
233// we may have left over unmarked lines (mostly end-of-paragraph lines) which
234// were too short to have much confidence about, but which fit the models we've
235// constructed perfectly and which we ought to mark. This class is used to
236// "smear" our models over the text.
238 public:
240 int row_start, int row_end,
241 ParagraphTheory *theory);
242
243 // Smear forward paragraph models from existing row markings to subsequent
244 // text lines if they fit, and mark any thereafter still unmodeled rows
245 // with any model in the theory that fits them.
246 void Smear();
247
248 private:
249 // Record in open_models_ for rows [row_start, row_end) the list of models
250 // currently open at each row.
251 // A model is still open in a row if some previous row has said model as a
252 // start hypothesis, and all rows since (including this row) would fit as
253 // either a body or start line in that model.
254 void CalculateOpenModels(int row_start, int row_end);
255
256 SetOfModels &OpenModels(int row) {
257 return open_models_[row - row_start_ + 1];
258 }
259
260 ParagraphTheory *theory_;
262 int row_start_;
263 int row_end_;
264
265 // open_models_ corresponds to rows[row_start_ - 1, row_end_]
266 //
267 // open_models_: Contains models for which there was an active (open)
268 // paragraph as of the previous line and for which the left and right
269 // indents admit the possibility that this text line continues
270 // to fit the same model.
271 // TODO(eger): Think about whether we can get rid of "Open" models and just
272 // use the current hypotheses on RowScratchRegisters.
273 GenericVector<SetOfModels> open_models_;
274};
275
276// Clear all hypotheses about lines [start, end) and reset the margins to the
277// percentile (0..100) value of the left and right row edges for this run of
278// rows.
280 GenericVector<RowScratchRegisters> *rows, int start, int end,
281 int percentile);
282
283// Return the median inter-word space in rows[row_start, row_end).
285 int row_start, int row_end);
286
287// Return whether the first word on the after line can fit in the space at
288// the end of the before line (knowing which way the text is aligned and read).
290 const RowScratchRegisters &after,
292
293// Return whether the first word on the after line can fit in the space at
294// the end of the before line (not knowing the text alignment).
296 const RowScratchRegisters &after);
297
298// Do rows[start, end) form a single instance of the given paragraph model?
300 int start, int end, const ParagraphModel *model);
301
302// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
303// normalize each row_owner to point to an actual PARA, and output the
304// paragraphs in order onto paragraphs.
306 GenericVector<PARA *> *row_owners,
307 PARA_LIST *paragraphs);
308
309} // namespace tesseract
310
311#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:408
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool StrongModel(const ParagraphModel *model)
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:281
ParagraphJustification
Definition: publictypes.h:251
@ JUSTIFICATION_LEFT
Definition: publictypes.h:253
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:255
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:54
void CanonicalizeDetectionResults(GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:56
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:455
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:288
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
bool operator==(const LineHypothesis &other) const
const ParagraphModel * model
LineHypothesis(LineType line_type, const ParagraphModel *m)
LineHypothesis(const LineHypothesis &other)
LineHypothesis & operator=(const LineHypothesis &other)
static void AppendDebugHeaderFields(GenericVector< STRING > *header)
Definition: paragraphs.cpp:489
int AlignsideIndent(tesseract::ParagraphJustification just) const
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:610
void AppendDebugInfo(const ParagraphTheory &theory, GenericVector< STRING > *dbg) const
Definition: paragraphs.cpp:495
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:631
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:624
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:603
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:617
int OffsideIndent(tesseract::ParagraphJustification just) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:644
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:596
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:637
void Init(const RowInfo &row)
Definition: paragraphs.cpp:526
ParagraphTheory(GenericVector< ParagraphModel * > *models)
void NonCenteredModels(SetOfModels *models)
const ParagraphModel * Fits(const GenericVector< RowScratchRegisters > *rows, int start, int end) const
const GenericVector< ParagraphModel * > & models() const
GenericVector< ParagraphModel * > & models()
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
Definition: strngs.h:45