tesseract 4.1.1
Loading...
Searching...
No Matches
paragraphs.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: paragraphs.h
3 * Description: Paragraph Detection data structures.
4 * Author: David Eger
5 * Created: 25 February 2011
6 *
7 * (C) Copyright 2011, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21#define TESSERACT_CCMAIN_PARAGRAPHS_H_
22
23#include "rect.h" // for TBOX
24#include "strngs.h" // for STRING
25
26class PARA_LIST;
27class ParagraphModel;
28
29struct PARA;
30
31template <typename T> class GenericVector;
32
33namespace tesseract {
34
35class MutableIterator;
36
37// This structure captures all information needed about a text line for the
38// purposes of paragraph detection. It is meant to be exceedingly light-weight
39// so that we can easily test paragraph detection independent of the rest of
40// Tesseract.
41class RowInfo {
42 public:
43 // Constant data derived from Tesseract output.
44 STRING text; // the full UTF-8 text of the line.
45 bool ltr; // whether the majority of the text is left-to-right
46 // TODO(eger) make this more fine-grained.
47
48 bool has_leaders; // does the line contain leader dots (.....)?
49 bool has_drop_cap; // does the line have a drop cap?
50 int pix_ldistance; // distance to the left pblock boundary in pixels
51 int pix_rdistance; // distance to the right pblock boundary in pixels
52 float pix_xheight; // guessed xheight for the line
53 int average_interword_space; // average space between words in pixels.
54
56 TBOX lword_box; // in normalized (horiz text rows) space
57 TBOX rword_box; // in normalized (horiz text rows) space
58
59 STRING lword_text; // the UTF-8 text of the leftmost werd
60 STRING rword_text; // the UTF-8 text of the rightmost werd
61
62 // The text of a paragraph typically starts with the start of an idea and
63 // ends with the end of an idea. Here we define paragraph as something that
64 // may have a first line indent and a body indent which may be different.
65 // Typical words that start an idea are:
66 // 1. Words in western scripts that start with
67 // a capital letter, for example "The"
68 // 2. Bulleted or numbered list items, for
69 // example "2."
70 // Typical words which end an idea are words ending in punctuation marks. In
71 // this vocabulary, each list item is represented as a paragraph.
75
79};
80
81// Main entry point for Paragraph Detection Algorithm.
82//
83// Given a set of equally spaced textlines (described by row_infos),
84// Split them into paragraphs. See http://goto/paragraphstalk
85//
86// Output:
87// row_owners - one pointer for each row, to the paragraph it belongs to.
88// paragraphs - this is the actual list of PARA objects.
89// models - the list of paragraph models referenced by the PARA objects.
90// caller is responsible for deleting the models.
91void DetectParagraphs(int debug_level,
92 GenericVector<RowInfo> *row_infos,
93 GenericVector<PARA *> *row_owners,
94 PARA_LIST *paragraphs,
96
97// Given a MutableIterator to the start of a block, run DetectParagraphs on
98// that block and commit the results to the underlying ROW and BLOCK structs,
99// saving the ParagraphModels in models. Caller owns the models.
100// We use unicharset during the function to answer questions such as "is the
101// first letter of this word upper case?"
102void DetectParagraphs(int debug_level,
103 bool after_text_recognition,
104 const MutableIterator *block_start,
106
107} // namespace
108
109#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
bool lword_likely_ends_idea
Definition: paragraphs.h:74
bool rword_likely_ends_idea
Definition: paragraphs.h:78
int average_interword_space
Definition: paragraphs.h:53
bool rword_likely_starts_idea
Definition: paragraphs.h:77
bool lword_indicates_list_item
Definition: paragraphs.h:72
bool rword_indicates_list_item
Definition: paragraphs.h:76
bool lword_likely_starts_idea
Definition: paragraphs.h:73
Definition: ocrpara.h:29
TBOX
Definition: rect.h:34
STRING
Definition: strngs.h:45