tesseract 4.1.1
Loading...
Searching...
No Matches
gap_map.cpp
Go to the documentation of this file.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
10
11#include "statistc.h"
12#include "gap_map.h"
13
14BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
15BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
17"Ensure gaps not less than 2quanta wide");
18double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
19
/*************************************************************************
 * A block gap map is a quantised histogram of whitespace regions in the
 * block. It is a vertical projection of wide gaps WITHIN lines.
 *
 * The map is held as an array of counts of rows which have a wide gap
 * covering that region of the row. Each bucket in the map represents a width
 * of about half an xheight (the median of the xheights of the rows is used).
 *
 * The block is considered RECTANGULAR - delimited by the left and right
 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
 * counted.
 *
 *************************************************************************/
33
34GAPMAP::GAPMAP( //Constructor
35 TO_BLOCK *block //block
36 ) {
37 TO_ROW *row; //current row
38 BLOBNBOX_IT blob_it; //iterator
39 TBOX blob_box;
40 TBOX prev_blob_box;
41 int16_t gap_width;
42 int16_t start_of_row;
43 int16_t end_of_row;
44 STATS xht_stats (0, 128);
45 int16_t min_quantum;
46 int16_t max_quantum;
47 int16_t i;
48
49 /*
50 Find left and right extremes and bucket size
51 */
52 map = nullptr;
53 min_left = INT16_MAX;
54 max_right = -INT16_MAX;
55 total_rows = 0;
56 any_tabs = false;
57
58 // row iterator
59 TO_ROW_IT row_it(block->get_rows());
60 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
61 row = row_it.data ();
62 if (!row->blob_list ()->empty ()) {
63 total_rows++;
64 xht_stats.add (static_cast<int16_t>(floor (row->xheight + 0.5)), 1);
65 blob_it.set_to_list (row->blob_list ());
66 start_of_row = blob_it.data ()->bounding_box ().left ();
67 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
68 if (min_left > start_of_row)
69 min_left = start_of_row;
70 if (max_right < end_of_row)
71 max_right = end_of_row;
72 }
73 }
74 if ((total_rows < 3) || (min_left >= max_right)) {
75 bucket_size = 0;
76 map_max = 0;
77 total_rows = 0;
78 min_left = max_right = 0;
79 return;
80 }
81 bucket_size = static_cast<int16_t>(floor (xht_stats.median () + 0.5)) / 2;
82 map_max = (max_right - min_left) / bucket_size;
83 map = new int16_t[map_max + 1];
84 for (i = 0; i <= map_max; i++)
85 map[i] = 0;
86
87 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
88 row = row_it.data ();
89 if (!row->blob_list ()->empty ()) {
90 blob_it.set_to_list (row->blob_list ());
91 blob_it.mark_cycle_pt ();
92 blob_box = box_next (&blob_it);
93 prev_blob_box = blob_box;
94 if (gapmap_use_ends) {
95 /* Leading space */
96 gap_width = blob_box.left () - min_left;
97 if ((gap_width > gapmap_big_gaps * row->xheight)
98 && gap_width > 2) {
99 max_quantum = (blob_box.left () - min_left) / bucket_size;
100 if (max_quantum > map_max) max_quantum = map_max;
101 for (i = 0; i <= max_quantum; i++)
102 map[i]++;
103 }
104 }
105 while (!blob_it.cycled_list ()) {
106 blob_box = box_next (&blob_it);
107 gap_width = blob_box.left () - prev_blob_box.right ();
108 if ((gap_width > gapmap_big_gaps * row->xheight)
109 && gap_width > 2) {
110 min_quantum =
111 (prev_blob_box.right () - min_left) / bucket_size;
112 max_quantum = (blob_box.left () - min_left) / bucket_size;
113 if (max_quantum > map_max) max_quantum = map_max;
114 for (i = min_quantum; i <= max_quantum; i++)
115 map[i]++;
116 }
117 prev_blob_box = blob_box;
118 }
119 if (gapmap_use_ends) {
120 /* Trailing space */
121 gap_width = max_right - prev_blob_box.right ();
122 if ((gap_width > gapmap_big_gaps * row->xheight)
123 && gap_width > 2) {
124 min_quantum =
125 (prev_blob_box.right () - min_left) / bucket_size;
126 if (min_quantum < 0) min_quantum = 0;
127 for (i = min_quantum; i <= map_max; i++)
128 map[i]++;
129 }
130 }
131 }
132 }
133 for (i = 0; i <= map_max; i++) {
134 if (map[i] > total_rows / 2) {
136 (((i == 0) &&
137 (map[i + 1] <= total_rows / 2)) ||
138 ((i == map_max) &&
139 (map[i - 1] <= total_rows / 2)) ||
140 ((i > 0) &&
141 (i < map_max) &&
142 (map[i - 1] <= total_rows / 2) &&
143 (map[i + 1] <= total_rows / 2)))) {
144 map[i] = 0; //prevent isolated quantum
145 }
146 else
147 any_tabs = true;
148 }
149 }
150 if (gapmap_debug && any_tabs)
151 tprintf ("Table found\n");
152}
153
154
/*************************************************************************
 * GAPMAP::table_gap()
 * Is there a bucket in the specified range where more than half the rows in
 * the block have a wide gap?
 *************************************************************************/
160
161bool GAPMAP::table_gap( //Is gap a table?
162 int16_t left, //From here
163 int16_t right //To here
164) {
165 int16_t min_quantum;
166 int16_t max_quantum;
167 int16_t i;
168 bool tab_found = false;
169
170 if (!any_tabs)
171 return false;
172
173 min_quantum = (left - min_left) / bucket_size;
174 max_quantum = (right - min_left) / bucket_size;
175 // Clip to the bounds of the array. In some circumstances (big blob followed
176 // by small blob) max_quantum can exceed the map_max bounds, but we clip
177 // here instead, as it provides better long-term safety.
178 if (min_quantum < 0) min_quantum = 0;
179 if (max_quantum > map_max) max_quantum = map_max;
180 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
181 if (map[i] > total_rows / 2)
182 tab_found = true;
183 return tab_found;
184}
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:636
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
#define double_VAR(name, val, comment)
Definition: params.h:312
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:17
double gapmap_big_gaps
Definition: gap_map.cpp:18
bool gapmap_use_ends
Definition: gap_map.cpp:15
bool gapmap_debug
Definition: gap_map.cpp:14
bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:17
double gapmap_big_gaps
Definition: gap_map.cpp:18
bool gapmap_use_ends
Definition: gap_map.cpp:15
bool gapmap_debug
Definition: gap_map.cpp:14
float xheight
Definition: blobbox.h:657
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:600
TO_ROW_LIST * get_rows()
Definition: blobbox.h:704
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:161
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:34