tesseract 4.1.1
Loading...
Searching...
No Matches
ocrpara.h
Go to the documentation of this file.
1
2// File: ocrpara.h
3// Description: OCR Paragraph Output Type
4// Author: David Eger
5// Created: 2010-11-15
6//
7// (C) Copyright 2010, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
21#define TESSERACT_CCSTRUCT_OCRPARA_H_
22
23#include "publictypes.h"
24#include "elst.h"
25#include "strngs.h"
26
27class ParagraphModel;
28
29struct PARA : public ELIST_LINK {
30 public:
31 PARA() : model(nullptr), is_list_item(false),
33
34 // model: we do not own the model, we just reference it.
35 // model may be nullptr if there is not a good model for this paragraph.
37
39
40 // is_very_first_or_continuation: the first paragraph on a page often lacks
41 // a first line indent, but should still be modeled by the same model as
42 // other body text paragraphs on the page.
44
45 // has_drop_cap: does this paragraph begin with a drop cap?
47};
48
50
51// A geometric model of paragraph indentation and alignment.
52//
53// Measurements are in pixels. The meaning of the integer arguments changes
54// depending upon the value of justification. Two distances that differ by
55// no more than tolerance are treated as "equivalent" for the purpose of model
56// matching; the examples below assume that tolerance is zero.
57//
58// justification = LEFT:
59//   margin        the "ignored" margin to the left block edge.
60//   first_indent  indent from the left margin to a typical first text line.
61//   body_indent   indent from the left margin to a typical body text line.
62//
63// justification = RIGHT:
64//   margin        the "ignored" margin to the right block edge.
65//   first_indent  indent from the right margin to a typical first text line.
66//   body_indent   indent from the right margin to a typical body text line.
67//
68// justification = CENTER:
69//   margin        ignored
70//   first_indent  ignored
71//   body_indent   ignored
72//
73// ====== Extended example, assuming each letter is ten pixels wide: =======
74//
75//   +--------------------------------+
76//   |            Awesome             |   ParagraphModel(CENTER, 0, 0, 0)
77//   |         Centered Title         |
78//   |      Paragraph Detection       |
79//   |            OCR TEAM            |
80//   |        10 November 2010        |
81//   |                                |
82//   |  Look here, I have a paragraph.|   ParagraphModel(LEFT, 0, 20, 0)
83//   |This paragraph starts at the top|
84//   |of the page and takes 3 lines.  |
85//   |  Here I have a second paragraph|   ParagraphModel(LEFT, 0, 20, 0)
86//   |which indicates that the first  |
87//   |paragraph is not a continuation |
88//   |from a previous page, as it is  |
89//   |indented just like this second  |
90//   |paragraph.                      |
91//   |   Here is a block quote.  It   |   ParagraphModel(LEFT, 30, 0, 0)
92//   |   looks  like the prior text   |
93//   |   but  it  is  indented more   |
94//   |   and is fully justified.      |
95//   |  So how does one deal with     |   ParagraphModel(LEFT, 0, 20, 0)
96//   |centered text, block quotes,    |
97//   |normal paragraphs, and lists    |
98//   |like what follows?              |
99//   |1. Make a plan.                 |   ParagraphModel(LEFT, 0, 0, 30)
100//   |2. Use a heuristic, for example,|   ParagraphModel(LEFT, 0, 0, 30)
101//   |   looking for lines where the  |
102//   |   first word of the next line  |
103//   |   would fit on the previous    |
104//   |   line.                        |
105//   |8. Try to implement the plan in |   ParagraphModel(LEFT, 0, 0, 30)
106//   |   Python and try it out.       |
107//   |4. Determine how to fix the     |   ParagraphModel(LEFT, 0, 0, 30)
108//   |   mistakes.                    |
109//   |5. Repeat.                      |   ParagraphModel(LEFT, 0, 0, 30)
110//   |  For extra painful penalty work|   ParagraphModel(LEFT, 0, 20, 0)
111//   |you can try to identify source  |
112//   |code. Ouch!                     |
113//   +--------------------------------+
115 public:
117 int margin,
118 int first_indent,
119 int body_indent,
120 int tolerance)
121 : justification_(justification),
122 margin_(margin),
123 first_indent_(first_indent),
124 body_indent_(body_indent),
125 tolerance_(tolerance) {
126 // Make sure that at least one of {first_indent, body_indent} is 0:
// the smaller of the two indents is moved into the margin, so that margin_
// absorbs the indentation common to first and body lines.
127 int added_margin = first_indent;
128 if (body_indent < added_margin)
129 added_margin = body_indent;
130 margin_ += added_margin;
131 first_indent_ -= added_margin;
132 body_indent_ -= added_margin;
133 }
134
136 : justification_(tesseract::JUSTIFICATION_UNKNOWN),  // Justification not known yet.
137 margin_(0),  // All distances and the tolerance start at zero.
138 first_indent_(0),
139 body_indent_(0),
140 tolerance_(0) { }
141
142 // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
143 // in a block of text which we are trying to model:
144 // lmargin, lindent: these add up to the distance from the leftmost ink
145 // in the text line to the surrounding text block's left
146 // edge.
147 // rmargin, rindent: these add up to the distance from the rightmost ink
148 // in the text line to the surrounding text block's right
149 // edge.
150 // The caller determines the division between "margin" and "indent"; that
151 // division only affects whether we think the line may be centered.
152 //
153 // If the amount of whitespace matches the amount of whitespace expected on
154 // the relevant side of the line (within tolerance_) we say it matches.
155
156 // Return whether a given text line could be the first line of a paragraph
157 // according to this paragraph model.
158 bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
159
160 // Return whether a given text line could be a body paragraph line according
161 // to this paragraph model.
162 bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
163
165 return justification_;
166 }
// margin(), first_indent() and body_indent() return the stored values, i.e.
// after the constructor has moved the smaller of the two indents into the
// margin; they can differ from the arguments the model was constructed with.
// tolerance() returns the tolerance exactly as it was given.
167 int margin() const { return margin_; }
168 int first_indent() const { return first_indent_; }
169 int body_indent() const { return body_indent_; }
170 int tolerance() const { return tolerance_; }
// Return whether this is a left- or right-justified model whose first-line
// and body indents differ by no more than tolerance_.
// NOTE(review): abs() is not declared by any header included directly in this
// file; presumably it arrives through one of the project includes -- confirm,
// or include <cstdlib> explicitly.
171 bool is_flush() const {
172 return (justification_ == tesseract::JUSTIFICATION_LEFT ||
173 justification_ == tesseract::JUSTIFICATION_RIGHT) &&
174 abs(first_indent_ - body_indent_) <= tolerance_;
175 }
176
177 // Return whether this model is likely to agree with the other model on most
178 // of the paragraphs that are marked with them.
179 bool Comparable(const ParagraphModel &other) const;
180
// Return a STRING for this model. Presumably a printable description; the
// function is defined out of line, so confirm the exact format there.
181 STRING ToString() const;
182
183 private:
// All distances are in pixels. After construction,
// min(first_indent_, body_indent_) == 0: the indentation common to first and
// body lines has been moved into margin_.
185 int margin_;
186 int first_indent_;
187 int body_indent_;
188 int tolerance_;  // Distances at most this far apart count as equivalent.
189};
190
191#endif // TESSERACT_CCSTRUCT_OCRPARA_H_
@ JUSTIFICATION_UNKNOWN
Definition: capi.h:153
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:918
ParagraphJustification
Definition: publictypes.h:251
@ JUSTIFICATION_LEFT
Definition: publictypes.h:253
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:255
Definition: ocrpara.h:29
PARA()
Definition: ocrpara.h:31
const ParagraphModel * model
Definition: ocrpara.h:36
bool is_very_first_or_continuation
Definition: ocrpara.h:43
bool is_list_item
Definition: ocrpara.h:38
bool has_drop_cap
Definition: ocrpara.h:46
int first_indent() const
Definition: ocrpara.h:168
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
int body_indent() const
Definition: ocrpara.h:169
int margin() const
Definition: ocrpara.h:167
ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, int body_indent, int tolerance)
Definition: ocrpara.h:116
int tolerance() const
Definition: ocrpara.h:170
bool is_flush() const
Definition: ocrpara.h:171
Definition: strngs.h:45