tesseract 4.1.1
Loading...
Searching...
No Matches
tablefind.cpp
Go to the documentation of this file.
1
2// File: tablefind.cpp
3// Description: Helper classes to find tables from ColPartitions.
4// Author: Faisal Shafait (faisal.shafait@dfki.de)
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20#include "config_auto.h"
21#endif
22
23#include "tablefind.h"
24#include <algorithm>
25#include <cmath>
26
27#include "allheaders.h"
28
29#include "colpartitionset.h"
30#include "tablerecog.h"
31
32namespace tesseract {
33
34// These numbers are used to calculate the global median stats.
35// They just set an upper bound on the stats objects.
36// Maximum vertical spacing between neighbor partitions.
37const int kMaxVerticalSpacing = 500;
38// Maximum width of a blob in a partition.
39const int kMaxBlobWidth = 500;
40
41// Minimum whitespace size to split a partition (measured as a multiple
42// of a partition's median width).
43const double kSplitPartitionSize = 2.0;
44// To insert text, the partition must satisfy these size constraints
45// in AllowTextPartition(). The idea is to filter noise partitions
46// determined by the size compared to the global medians.
47// TODO(nbeato): Need to find good numbers again.
48const double kAllowTextHeight = 0.5;
49const double kAllowTextWidth = 0.6;
50const double kAllowTextArea = 0.8;
51// The same thing applies to blobs (to filter noise).
52// TODO(nbeato): These numbers are a shot in the dark...
53// height and width are 0.5 * gridsize() in colfind.cpp
54// area is a rough guess for the size of a period.
55const double kAllowBlobHeight = 0.3;
56const double kAllowBlobWidth = 0.4;
57const double kAllowBlobArea = 0.05;
58
59// Minimum number of components in a text partition. A partition having fewer
60// components than that is more likely a data partition and is a candidate
61// table cell.
63
64// Maximum number of components that a data partition can have
66
67// Maximum allowed gap in a text partitions as a multiple of its median size.
68const double kMaxGapInTextPartition = 4.0;
69
70// Minimum value that the maximum gap in a text partition should have as a
71// factor of its median size.
72const double kMinMaxGapInTextPartition = 0.5;
73
74// The amount of overlap that is "normal" for adjacent blobs in a text
75// partition. This is used to calculate gap between overlapping blobs.
76const double kMaxBlobOverlapFactor = 4.0;
77
78// Maximum x-height a table partition can have as a multiple of global
79// median x-height
80const double kMaxTableCellXheight = 2.0;
81
82// Maximum line spacing between a table column header and column contents
83// for merging the two (as a multiple of the partition's median_height).
85
86// Minimum ratio of num_table_partitions to num_text_partitions in a column
87// block to be called it a table column
88const double kTableColumnThreshold = 3.0;
89
90// Search for horizontal ruling lines within the vertical margin as a
91// multiple of grid size
92// const int kRulingVerticalMargin = 3;
93
94// Minimum overlap that a colpartition must have with a table region
95// to become part of that table
96const double kMinOverlapWithTable = 0.6;
97
98// Maximum side space (distance from column boundary) that a typical
99// text-line in flowing text should have as a multiple of its x-height
100// (Median size).
101const int kSideSpaceMargin = 10;
102
103// Fraction of the peak of x-projection of a table region to set the
104// threshold for the x-projection histogram
107// Minimum number of rows required to look for more rows in the projection.
109
110// Minimum number of rows in a table
111const int kMinRowsInTable = 3;
112
113// The amount of padding (multiplied by global_median_xheight_ during use)
114// that is vertically added to the search adjacent leader search during
115// ColPartition marking.
117
118// Used when filtering false positives. When finding the last line
119// of a paragraph (typically left-aligned), the previous line should have
120// its center to the right of the last line by this scaled amount.
122
123// The maximum amount of whitespace allowed left of a paragraph ending.
124// Do not filter a ColPartition with more than this space left of it.
126
127// Used when filtering false positives. The last line of a paragraph
128// should be preceded by a line that is predominantly text. This is the
129// ratio of text to whitespace (to the right of the text) that is required
130// for the previous line to be a text.
132
133// When counting table columns, this is the required gap between two columns
134// (it is multiplied by global_median_xheight_).
135const double kMaxXProjectionGapFactor = 2.0;
136
137// Used for similarity in partitions using stroke width. Values copied
138// from ColFind.cpp in Ray's CL.
141
142static BOOL_VAR(textord_show_tables, false, "Show table regions");
143static BOOL_VAR(textord_tablefind_show_mark, false,
144 "Debug table marking steps in detail");
145static BOOL_VAR(textord_tablefind_show_stats, false,
146 "Show page stats used in table finding");
147static BOOL_VAR(textord_tablefind_recognize_tables, false,
148 "Enables the table recognizer for table layout and filtering.");
149
152
153// Templated helper function used to create destructor callbacks for the
154// BBGrid::ClearGridData() method.
155template <typename T> void DeleteObject(T *object) {
156 delete object;
157}
158
160 : resolution_(0),
161 global_median_xheight_(0),
162 global_median_blob_width_(0),
163 global_median_ledding_(0),
164 left_to_right_language_(true) {
165}
166
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175}
176
179}
180
181void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
182 const ICOORD& top_right) {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189}
190
191// Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192// insert leaders and rulers into the leader_and_ruling_grid_
194 TO_BLOCK* block) {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition* part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
207 continue;
208 ColPartition* clean_part = part->ShallowCopy();
209 ColPartition* leader_part = nullptr;
210 if (part->IsLineType()) {
211 InsertRulingPartition(clean_part);
212 continue;
213 }
214 // Insert all non-text partitions to clean_parts
215 if (!part->IsTextType()) {
216 InsertImagePartition(clean_part);
217 continue;
218 }
219 // Insert text colpartitions after removing noisy components from them
220 // The leaders are split into a separate grid.
221 BLOBNBOX_CLIST* part_boxes = part->boxes();
222 BLOBNBOX_C_IT pit(part_boxes);
223 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
224 BLOBNBOX *pblob = pit.data();
225 // Bad blobs... happens in UNLV set.
226 // news.3G1, page 17 (around x=6)
227 if (!AllowBlob(*pblob))
228 continue;
229 if (pblob->flow() == BTFT_LEADER) {
230 if (leader_part == nullptr) {
231 leader_part = part->ShallowCopy();
232 leader_part->set_flow(BTFT_LEADER);
233 }
234 leader_part->AddBox(pblob);
235 } else if (pblob->region_type() != BRT_NOISE) {
236 clean_part->AddBox(pblob);
237 }
238 }
239 clean_part->ComputeLimits();
240 ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
241 InsertTextPartition(clean_part);
243 if (leader_part != nullptr) {
244 // TODO(nbeato): Note that ComputeLimits does not update the column
245 // information. So the leader may appear to span more columns than it
246 // really does later on when IsInSameColumnAs gets called to test
247 // for adjacent leaders.
248 leader_part->ComputeLimits();
249 InsertLeaderPartition(leader_part);
250 }
251 }
252
253 // Make the partition partners better for upper and lower neighbors.
256}
257
258// High level function to perform table detection
260 ColPartitionSet** all_columns,
261 WidthCallback* width_cb,
262 const FCOORD& reskew) {
263 // initialize spacing, neighbors, and columns
264 InitializePartitions(all_columns);
265
266#ifndef GRAPHICS_DISABLED
267 if (textord_show_tables) {
268 ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
274
275 table_win = MakeWindow(100, 300, "Fragmented Text");
277 }
278#endif // GRAPHICS_DISABLED
279
280 // mark, filter, and smooth candidate table partitions
282
283 // Make single-column blocks from good_columns_ partitions. col_segments are
284 // moved to a grid later which takes the ownership
285 ColSegment_LIST column_blocks;
286 GetColumnBlocks(all_columns, &column_blocks);
287 // Set the ratio of candidate table partitions in each column
288 SetColumnsType(&column_blocks);
289
290 // Move column segments to col_seg_grid_
291 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
292
293 // Detect split in column layout that might have occurred due to the
294 // presence of a table. In such a case, merge the corresponding columns.
296
297 // Group horizontally overlapping table partitions into table columns.
298 // table_columns created here get deleted at the end of this method.
299 ColSegment_LIST table_columns;
300 GetTableColumns(&table_columns);
301
302 // Within each column, mark the range table regions occupy based on the
303 // table columns detected. table_regions are moved to a grid later which
304 // takes the ownership
305 ColSegment_LIST table_regions;
306 GetTableRegions(&table_columns, &table_regions);
307
308#ifndef GRAPHICS_DISABLED
309 if (textord_tablefind_show_mark) {
310 ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
311 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
312 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
313 }
314#endif // GRAPHICS_DISABLED
315
316 // Merge table regions across columns for tables spanning multiple
317 // columns
318 MoveColSegmentsToGrid(&table_regions, &table_grid_);
320
321 // Adjust table boundaries by including nearby horizontal lines and left
322 // out column headers
325
326 if (textord_tablefind_recognize_tables) {
327 // Remove false alarms consiting of a single column
329
330#ifndef GRAPHICS_DISABLED
331 if (textord_show_tables) {
332 ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
334 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
335 table_grid_.DisplayBoxes(table_win);
336 }
337#endif // GRAPHICS_DISABLED
338
339 // Find table grid structure and reject tables that are malformed.
343
344#ifndef GRAPHICS_DISABLED
345 if (textord_show_tables) {
346 ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
349 table_grid_.DisplayBoxes(table_win);
350 }
351#endif // GRAPHICS_DISABLED
352 } else {
353 // Remove false alarms consiting of a single column
354 // TODO(nbeato): verify this is a NOP after structured table rejection.
355 // Right now it isn't. If the recognize function is doing what it is
356 // supposed to do, this function is obsolete.
358
359#ifndef GRAPHICS_DISABLED
360 if (textord_show_tables) {
361 ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
364 table_grid_.DisplayBoxes(table_win);
365 }
366#endif // GRAPHICS_DISABLED
367 }
368
369 // Merge all colpartitions in table regions to make them a single
370 // colpartition and revert types of isolated table cells not
371 // assigned to any table to their original types.
372 MakeTableBlocks(grid, all_columns, width_cb);
373}
374// All grids have the same dimensions. The clean_part_grid_ sizes are set from
375// the part_grid_ that is passed to InsertCleanPartitions, which was the same as
376// the grid that is the base of ColumnFinder. Just return the clean_part_grid_
377// dimensions instead of duplicated memory.
379 return clean_part_grid_.gridsize();
380}
383}
386}
388 return clean_part_grid_.bleft();
389}
391 return clean_part_grid_.tright();
392}
393
395 ASSERT_HOST(part != nullptr);
396 if (AllowTextPartition(*part)) {
397 clean_part_grid_.InsertBBox(true, true, part);
398 } else {
399 delete part;
400 }
401}
403 ASSERT_HOST(part != nullptr);
404 if (AllowTextPartition(*part)) {
405 fragmented_text_grid_.InsertBBox(true, true, part);
406 } else {
407 delete part;
408 }
409}
411 ASSERT_HOST(part != nullptr);
412 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
413 leader_and_ruling_grid_.InsertBBox(true, true, part);
414 } else {
415 delete part;
416 }
417}
419 leader_and_ruling_grid_.InsertBBox(true, true, part);
420}
422 // NOTE: If images are placed into a different grid in the future,
423 // the function SetPartitionSpacings needs to be updated. It should
424 // be the only thing that cares about image partitions.
425 clean_part_grid_.InsertBBox(true, true, part);
426}
427
428// Splits a partition into its "words". The splits happen
429// at locations with wide inter-blob spacing. This is useful
430// because it allows the table recognize to "cut through" the
431// text lines on the page. The assumption is that a table
432// will have several lines with similar overlapping whitespace
433// whereas text will not have this type of property.
434// Note: The code Assumes that blobs are sorted by the left side x!
435// This will not work (as well) if the blobs are sorted by center/right.
437 ASSERT_HOST(part != nullptr);
438 // Bye bye empty partitions!
439 if (part->boxes()->empty()) {
440 delete part;
441 return;
442 }
443
444 // The AllowBlob function prevents this.
445 ASSERT_HOST(part->median_width() > 0);
446 const double kThreshold = part->median_width() * kSplitPartitionSize;
447
448 ColPartition* right_part = part;
449 bool found_split = true;
450 while (found_split) {
451 found_split = false;
452 BLOBNBOX_C_IT box_it(right_part->boxes());
453 // Blobs are sorted left side first. If blobs overlap,
454 // the previous blob may have a "more right" right side.
455 // Account for this by always keeping the largest "right"
456 // so far.
457 int previous_right = INT32_MIN;
458
459 // Look for the next split in the partition.
460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461 const TBOX& box = box_it.data()->bounding_box();
462 if (previous_right != INT32_MIN &&
463 box.left() - previous_right > kThreshold) {
464 // We have a split position. Split the partition in two pieces.
465 // Insert the left piece in the grid and keep processing the right.
466 int mid_x = (box.left() + previous_right) / 2;
467 ColPartition* left_part = right_part;
468 right_part = left_part->SplitAt(mid_x);
469
471 found_split = true;
472 break;
473 }
474
475 // The right side of the previous blobs.
476 previous_right = std::max(previous_right, static_cast<int>(box.right()));
477 }
478 }
479 // When a split is not found, the right part is minimized
480 // as much as possible, so process it.
482}
483
484// Some simple criteria to filter out now. We want to make sure the
485// average blob size in the partition is consistent with the
486// global page stats.
487// The area metric will almost always pass for multi-blob partitions.
488// It is useful when filtering out noise caused by an isolated blob.
490 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
491 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
492 const int median_area = global_median_xheight_ * global_median_blob_width_;
493 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
494 // Keep comparisons strictly greater to disallow 0!
495 return part.median_height() > kHeightRequired &&
496 part.median_width() > kWidthRequired &&
497 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
498}
499
500// Same as above, applied to blobs. Keep in mind that
501// leaders, commas, and periods are important in tables.
502bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
503 const TBOX& box = blob.bounding_box();
504 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
505 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
506 const int median_area = global_median_xheight_ * global_median_blob_width_;
507 const double kAreaRequired = median_area * kAllowBlobArea;
508 // Keep comparisons strictly greater to disallow 0!
509 return box.height() > kHeightRequired &&
510 box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512}
513
514// TODO(nbeato): The grid that makes the window doesn't seem to matter.
515// The only downside is that window messages will be caught by
516// clean_part_grid_ instead of a useful object. This is a temporary solution
517// for the debug windows created by the TableFinder.
518ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
519 return clean_part_grid_.MakeWindow(x, y, window_name);
520}
521
522// Make single-column blocks from good_columns_ partitions.
524 ColSegment_LIST* column_blocks) {
525 for (int i = 0; i < gridheight(); ++i) {
526 ColPartitionSet* columns = all_columns[i];
527 if (columns != nullptr) {
528 ColSegment_LIST new_blocks;
529 // Get boxes from the current vertical position on the grid
530 columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
531 // Merge the new_blocks boxes into column_blocks if they are well-aligned
532 GroupColumnBlocks(&new_blocks, column_blocks);
533 }
534 }
535}
536
537// Merge column segments into the current list if they are well aligned.
538void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
539 ColSegment_LIST* column_blocks) {
540 ColSegment_IT src_it(new_blocks);
541 ColSegment_IT dest_it(column_blocks);
542 // iterate through the source list
543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
544 ColSegment* src_seg = src_it.data();
545 const TBOX& src_box = src_seg->bounding_box();
546 bool match_found = false;
547 // iterate through the destination list to find a matching column block
548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
549 ColSegment* dest_seg = dest_it.data();
550 TBOX dest_box = dest_seg->bounding_box();
551 if (ConsecutiveBoxes(src_box, dest_box)) {
552 // If matching block is found, insert the current block into it
553 // and delete the source block.
554 dest_seg->InsertBox(src_box);
555 match_found = true;
556 delete src_it.extract();
557 break;
558 }
559 }
560 // If no match is found, just append the source block to column_blocks
561 if (!match_found) {
562 dest_it.add_after_then_move(src_it.extract());
563 }
564 }
565}
566
567// are the two boxes immediate neighbors along the vertical direction
568bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
569 int x_margin = 20;
570 int y_margin = 5;
571 return (abs(b1.left() - b2.left()) < x_margin) &&
572 (abs(b1.right() - b2.right()) < x_margin) &&
573 (abs(b1.top()-b2.bottom()) < y_margin ||
574 abs(b2.top()-b1.bottom()) < y_margin);
575}
576
577// Set up info for clean_part_grid_ partitions to be valid during detection
578// code.
583}
584
585// Set left, right and top, bottom spacings of each colpartition.
587 ColPartitionSet** all_columns) {
588 // Iterate the ColPartitions in the grid.
589 ColPartitionGridSearch gsearch(grid);
590 gsearch.StartFullSearch();
591 ColPartition* part = nullptr;
592 while ((part = gsearch.NextFullSearch()) != nullptr) {
593 ColPartitionSet* columns = all_columns[gsearch.GridY()];
594 TBOX box = part->bounding_box();
595 int y = part->MidY();
596 ColPartition* left_column = columns->ColumnContaining(box.left(), y);
597 ColPartition* right_column = columns->ColumnContaining(box.right(), y);
598 // set distance from left column as space to the left
599 if (left_column) {
600 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
601 part->set_space_to_left(left_space);
602 }
603 // set distance from right column as space to the right
604 if (right_column) {
605 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
606 part->set_space_to_right(right_space);
607 }
608
609 // Look for images that may be closer.
610 // NOTE: used to be part_grid_, might cause issues now
611 ColPartitionGridSearch hsearch(grid);
612 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
613 ColPartition* neighbor = nullptr;
614 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
615 if (neighbor->type() == PT_PULLOUT_IMAGE ||
616 neighbor->type() == PT_FLOWING_IMAGE ||
617 neighbor->type() == PT_HEADING_IMAGE) {
618 int right = neighbor->bounding_box().right();
619 if (right < box.left()) {
620 int space = std::min(box.left() - right, part->space_to_left());
621 part->set_space_to_left(space);
622 }
623 }
624 }
625 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626 neighbor = nullptr;
627 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
628 if (neighbor->type() == PT_PULLOUT_IMAGE ||
629 neighbor->type() == PT_FLOWING_IMAGE ||
630 neighbor->type() == PT_HEADING_IMAGE) {
631 int left = neighbor->bounding_box().left();
632 if (left > box.right()) {
633 int space = std::min(left - box.right(), part->space_to_right());
634 part->set_space_to_right(space);
635 }
636 }
637 }
638
639 ColPartition* upper_part = part->SingletonPartner(true);
640 if (upper_part) {
641 int space = std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
642 part->bounding_box().bottom()));
643 part->set_space_above(space);
644 } else {
645 // TODO(nbeato): What constitutes a good value?
646 // 0 is the default value when not set, explicitly noting it needs to
647 // be something else.
648 part->set_space_above(INT32_MAX);
649 }
650
651 ColPartition* lower_part = part->SingletonPartner(false);
652 if (lower_part) {
653 int space = std::max(0, static_cast<int>(part->bounding_box().bottom() -
654 lower_part->bounding_box().bottom()));
655 part->set_space_below(space);
656 } else {
657 // TODO(nbeato): What constitutes a good value?
658 // 0 is the default value when not set, explicitly noting it needs to
659 // be something else.
660 part->set_space_below(INT32_MAX);
661 }
662 }
663}
664
665// Set spacing and closest neighbors above and below a given colpartition.
667 TBOX box = part->bounding_box();
668 int top_range = std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
669 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, static_cast<int>(bleft().y()));
670 box.set_top(top_range);
671 box.set_bottom(bottom_range);
672
673 TBOX part_box = part->bounding_box();
674 // Start a rect search
676 rectsearch(&clean_part_grid_);
677 rectsearch.StartRectSearch(box);
678 ColPartition* neighbor;
679 int min_space_above = kMaxVerticalSpacing;
680 int min_space_below = kMaxVerticalSpacing;
681 ColPartition* above_neighbor = nullptr;
682 ColPartition* below_neighbor = nullptr;
683 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
684 if (neighbor == part)
685 continue;
686 TBOX neighbor_box = neighbor->bounding_box();
687 if (neighbor_box.major_x_overlap(part_box)) {
688 int gap = abs(part->median_bottom() - neighbor->median_bottom());
689 // If neighbor is below current partition
690 if (neighbor_box.top() < part_box.bottom() &&
691 gap < min_space_below) {
692 min_space_below = gap;
693 below_neighbor = neighbor;
694 } // If neighbor is above current partition
695 else if (part_box.top() < neighbor_box.bottom() &&
696 gap < min_space_above) {
697 min_space_above = gap;
698 above_neighbor = neighbor;
699 }
700 }
701 }
702 part->set_space_above(min_space_above);
703 part->set_space_below(min_space_below);
704 part->set_nearest_neighbor_above(above_neighbor);
705 part->set_nearest_neighbor_below(below_neighbor);
706}
707
708// Set global spacing and x-height estimates
710 STATS xheight_stats(0, kMaxVerticalSpacing + 1);
711 STATS width_stats(0, kMaxBlobWidth + 1);
712 STATS ledding_stats(0, kMaxVerticalSpacing + 1);
713 // Iterate the ColPartitions in the grid.
714 ColPartitionGridSearch gsearch(grid);
715 gsearch.SetUniqueMode(true);
716 gsearch.StartFullSearch();
717 ColPartition* part = nullptr;
718 while ((part = gsearch.NextFullSearch()) != nullptr) {
719 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
720 // ComputeLimits needs to get called somewhere outside of TableFinder
721 // to make sure the partitions are properly initialized.
722 // When this is called, SmoothPartitionPartners dies in an assert after
723 // table find runs. Alternative solution.
724 // part->ComputeLimits();
725 if (part->IsTextType()) {
726 // xheight_stats.add(part->median_height(), part->boxes_count());
727 // width_stats.add(part->median_width(), part->boxes_count());
728
729 // This loop can be removed when above issues are fixed.
730 // Replace it with the 2 lines commented out above.
731 BLOBNBOX_C_IT it(part->boxes());
732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733 xheight_stats.add(it.data()->bounding_box().height(), 1);
734 width_stats.add(it.data()->bounding_box().width(), 1);
735 }
736
737 ledding_stats.add(part->space_above(), 1);
738 ledding_stats.add(part->space_below(), 1);
739 }
740 }
741 // Set estimates based on median of statistics obtained
742 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
743 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
744 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
745 #ifndef GRAPHICS_DISABLED
746 if (textord_tablefind_show_stats) {
747 const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
748 ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
749 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
750 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
751 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
752 }
753 #endif // GRAPHICS_DISABLED
754}
755
757 global_median_xheight_ = xheight;
758}
761}
763 global_median_ledding_ = ledding;
764}
765
768 gsearch.StartFullSearch();
769 ColPartition* part = nullptr;
770 while ((part = gsearch.NextFullSearch()) != nullptr) {
771 // TODO(nbeato): Rename this function, meaning is different now.
772 // IT is finding nearest neighbors its own way
773 //SetVerticalSpacing(part);
774
775 ColPartition* upper = part->SingletonPartner(true);
776 if (upper)
777 part->set_nearest_neighbor_above(upper);
778
779 ColPartition* lower = part->SingletonPartner(false);
780 if (lower)
781 part->set_nearest_neighbor_below(lower);
782 }
783}
784
785// High level interface. Input is an unmarked ColPartitionGrid
786// (namely, clean_part_grid_). Partitions are identified using local
787// information and filter/smoothed. The function exit should contain
788// a good sampling of the table partitions.
791 if (textord_tablefind_show_mark) {
792 ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
796 }
798 if (textord_tablefind_show_mark) {
799 ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
803 }
805 if (textord_tablefind_show_mark) {
806 ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
810 }
812 if (textord_tablefind_show_mark || textord_show_tables) {
813 ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
817 }
818}
819
820// These types of partitions are marked as table partitions:
821// 1- Partitions that have at lease one large gap between words
822// 2- Partitions that consist of only one word (no significant gap
823// between components)
824// 3- Partitions that vertically overlap with other partitions within the
825// same column.
826// 4- Partitions with leaders before/after them.
828 // Iterate the ColPartitions in the grid.
830 gsearch(&clean_part_grid_);
831 gsearch.StartFullSearch();
832 ColPartition* part = nullptr;
833 while ((part = gsearch.NextFullSearch()) != nullptr) {
834 if (!part->IsTextType()) // Only consider text partitions
835 continue;
836 // Only consider partitions in dominant font size or smaller
838 continue;
839 // Mark partitions with a large gap, or no significant gap as
840 // table partitions.
841 // Comments: It produces several false alarms at:
842 // - last line of a paragraph (fixed)
843 // - single word section headings
844 // - page headers and footers
845 // - numbered equations
846 // - line drawing regions
847 // TODO(faisal): detect and fix above-mentioned cases
848 if (HasWideOrNoInterWordGap(part) ||
849 HasLeaderAdjacent(*part)) {
850 part->set_table_type();
851 }
852 }
853}
854
855// Check if the partition has at least one large gap between words or no
856// significant gap at all
858 // Should only get text partitions.
859 ASSERT_HOST(part->IsTextType());
860 // Blob access
861 BLOBNBOX_CLIST* part_boxes = part->boxes();
862 BLOBNBOX_C_IT it(part_boxes);
863 // Check if this is a relatively small partition (such as a single word)
864 if (part->bounding_box().width() <
866 part_boxes->length() < kMinBoxesInTextPartition)
867 return true;
868
869 // Variables used to compute inter-blob spacing.
870 int current_x0 = -1;
871 int current_x1 = -1;
872 int previous_x1 = -1;
873 // Stores the maximum gap detected.
874 int largest_partition_gap_found = -1;
875 // Text partition gap limits. If this is text (and not a table),
876 // there should be at least one gap larger than min_gap and no gap
877 // larger than max_gap.
878 const double max_gap = kMaxGapInTextPartition * part->median_height();
879 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
880
881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
882 BLOBNBOX* blob = it.data();
883 current_x0 = blob->bounding_box().left();
884 current_x1 = blob->bounding_box().right();
885 if (previous_x1 != -1) {
886 int gap = current_x0 - previous_x1;
887
888 // TODO(nbeato): Boxes may overlap? Huh?
889 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
890 // on the top right of the page are filtered out with this line.
891 // Note 2: Iterating over blobs in a partition, so we are looking for
892 // spacing between the words.
893 if (gap < 0) {
894 // More likely case, the blobs slightly overlap. This can happen
895 // with diacritics (accents) or broken alphabet symbols (characters).
896 // Merge boxes together by taking max of right sides.
897 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
898 previous_x1 = std::max(previous_x1, current_x1);
899 continue;
900 }
901 // Extreme case, blobs overlap significantly in the same partition...
902 // This should not happen often (if at all), but it does.
903 // TODO(nbeato): investigate cases when this happens.
904 else {
905 // The behavior before was to completely ignore this case.
906 }
907 }
908
909 // If a large enough gap is found, mark it as a table cell (return true)
910 if (gap > max_gap)
911 return true;
912 if (gap > largest_partition_gap_found)
913 largest_partition_gap_found = gap;
914 }
915 previous_x1 = current_x1;
916 }
917 // Since no large gap was found, return false if the partition is too
918 // long to be a data cell
919 if (part->bounding_box().width() >
921 part_boxes->length() > kMaxBoxesInDataPartition)
922 return false;
923
924 // A partition may be a single blob. In this case, it's an isolated symbol
925 // or non-text (such as a ruling or image).
926 // Detect these as table partitions? Shouldn't this be case by case?
927 // The behavior before was to ignore this, making max_partition_gap < 0
928 // and implicitly return true. Just making it explicit.
929 if (largest_partition_gap_found == -1)
930 return true;
931
932 // return true if the maximum gap found is smaller than the minimum allowed
933 // max_gap in a text partition. This indicates that there is no significant
934 // space in the partition, hence it is likely a single word.
935 return largest_partition_gap_found < min_gap;
936}
937
938// A criteria for possible tables is that a table may have leaders
939// between data cells. An aggressive solution to find such tables is to
940// explicitly mark partitions that have adjacent leaders.
941// Note that this includes overlapping leaders. However, it does not
942// include leaders in different columns on the page.
943// Possible false-positive will include lists, such as a table of contents.
944// As these arise, the aggressive nature of this search may need to be
945// trimmed down.
947 if (part.flow() == BTFT_LEADER)
948 return true;
949 // Search range is left and right bounded by an offset of the
950 // median xheight. This offset is to allow some tolerance to the
951 // the leaders on the page in the event that the alignment is still
952 // a bit off.
953 const TBOX& box = part.bounding_box();
955 const int top = box.top() + search_size;
956 const int bottom = box.bottom() - search_size;
958 for (int direction = 0; direction < 2; ++direction) {
959 bool right_to_left = (direction == 0);
960 int x = right_to_left ? box.right() : box.left();
961 hsearch.StartSideSearch(x, bottom, top);
962 ColPartition* leader = nullptr;
963 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
964 // The leader could be a horizontal ruling in the grid.
965 // Make sure it is actually a leader.
966 if (leader->flow() != BTFT_LEADER)
967 continue;
968 // This should not happen, they are in different grids.
969 ASSERT_HOST(&part != leader);
970 // Make sure the leader shares a page column with the partition,
971 // otherwise we are spreading across columns.
972 if (!part.IsInSameColumnAs(*leader))
973 break;
974 // There should be a significant vertical overlap
975 if (!leader->VSignificantCoreOverlap(part))
976 continue;
977 // Leader passed all tests, so it is adjacent.
978 return true;
979 }
980 }
981 // No leaders are adjacent to the given partition.
982 return false;
983}
984
985// Filter individual text partitions marked as table partitions
986// consisting of paragraph endings, small section headings, and
987// headers and footers.
991 // TODO(nbeato): Fully justified text as non-table?
992}
993
995 // Detect last line of paragraph
996 // Iterate the ColPartitions in the grid.
998 gsearch.StartFullSearch();
999 ColPartition* part = nullptr;
1000 while ((part = gsearch.NextFullSearch()) != nullptr) {
1001 if (part->type() != PT_TABLE)
1002 continue; // Consider only table partitions
1003
1004 // Paragraph ending should have flowing text above it.
1005 ColPartition* upper_part = part->nearest_neighbor_above();
1006 if (!upper_part)
1007 continue;
1008 if (upper_part->type() != PT_FLOWING_TEXT)
1009 continue;
1010 if (upper_part->bounding_box().width() <
1011 2 * part->bounding_box().width())
1012 continue;
1013 // Check if its the last line of a paragraph.
1014 // In most cases, a paragraph ending should be left-aligned to text line
1015 // above it. Sometimes, it could be a 2 line paragraph, in which case
1016 // the line above it is indented.
1017 // To account for that, check if the partition center is to
1018 // the left of the one above it.
1019 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1020 int upper_mid = (upper_part->bounding_box().left() +
1021 upper_part->bounding_box().right()) / 2;
1022 int current_spacing = 0; // spacing of the current line to margin
1023 int upper_spacing = 0; // spacing of the previous line to the margin
1025 // Left to right languages, use mid - left to figure out the distance
1026 // the middle is from the left margin.
1027 int left = std::min(part->bounding_box().left(),
1028 upper_part->bounding_box().left());
1029 current_spacing = mid - left;
1030 upper_spacing = upper_mid - left;
1031 } else {
1032 // Right to left languages, use right - mid to figure out the distance
1033 // the middle is from the right margin.
1034 int right = std::max(part->bounding_box().right(),
1035 upper_part->bounding_box().right());
1036 current_spacing = right - mid;
1037 upper_spacing = right - upper_mid;
1038 }
1039 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1040 continue;
1041
1042 // Paragraphs should have similar fonts.
1043 if (!part->MatchingSizes(*upper_part) ||
1046 continue;
1047 }
1048
1049 // The last line of a paragraph should be left aligned.
1050 // TODO(nbeato): This would be untrue if the text was right aligned.
1051 // How often is that?
1052 if (part->space_to_left() >
1054 continue;
1055 // The line above it should be right aligned (assuming justified format).
1056 // Since we can't assume justified text, we compare whitespace to text.
1057 // The above line should have majority spanning text (or the current
1058 // line could have fit on the previous line). So compare
1059 // whitespace to text.
1060 if (upper_part->bounding_box().width() <
1062 continue;
1063
1064 // Ledding above the line should be less than ledding below
1065 if (part->space_above() >= part->space_below() ||
1066 part->space_above() > 2 * global_median_ledding_)
1067 continue;
1068
1069 // If all checks failed, it is probably text.
1070 part->clear_table_type();
1071 }
1072}
1073
1075 // Consider top-most text colpartition as header and bottom most as footer
1076 ColPartition* header = nullptr;
1077 ColPartition* footer = nullptr;
1078 int max_top = INT32_MIN;
1079 int min_bottom = INT32_MAX;
1081 gsearch.StartFullSearch();
1082 ColPartition* part = nullptr;
1083 while ((part = gsearch.NextFullSearch()) != nullptr) {
1084 if (!part->IsTextType())
1085 continue; // Consider only text partitions
1086 int top = part->bounding_box().top();
1087 int bottom = part->bounding_box().bottom();
1088 if (top > max_top) {
1089 max_top = top;
1090 header = part;
1091 }
1092 if (bottom < min_bottom) {
1093 min_bottom = bottom;
1094 footer = part;
1095 }
1096 }
1097 if (header)
1098 header->clear_table_type();
1099 if (footer)
1100 footer->clear_table_type();
1101}
1102
1103// Mark all ColPartitions as table cells that have a table cell above
1104// and below them
1105// TODO(faisal): This is too aggressive at the moment. The method needs to
1106// consider spacing and alignment as well. Detection of false alarm table cells
1107// should also be done as part of it.
1109 // Iterate the ColPartitions in the grid.
1111 gsearch.StartFullSearch();
1112 ColPartition* part = nullptr;
1113 while ((part = gsearch.NextFullSearch()) != nullptr) {
1114 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1115 continue; // Consider only text partitions
1116 ColPartition* upper_part = part->nearest_neighbor_above();
1117 ColPartition* lower_part = part->nearest_neighbor_below();
1118 if (!upper_part || !lower_part)
1119 continue;
1120 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1121 part->set_table_type();
1122 }
1123
1124 // Pass 2, do the opposite. If both the upper and lower neighbors
1125 // exist and are not tables, this probably shouldn't be a table.
1126 gsearch.StartFullSearch();
1127 part = nullptr;
1128 while ((part = gsearch.NextFullSearch()) != nullptr) {
1129 if (part->type() != PT_TABLE)
1130 continue; // Consider only text partitions
1131 ColPartition* upper_part = part->nearest_neighbor_above();
1132 ColPartition* lower_part = part->nearest_neighbor_below();
1133
1134 // table can't be by itself
1135 if ((upper_part && upper_part->type() != PT_TABLE) &&
1136 (lower_part && lower_part->type() != PT_TABLE)) {
1137 part->clear_table_type();
1138 }
1139 }
1140}
1141
1142// Set the type of a column segment based on the ratio of table to text cells
1143void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
1144 ColSegment_IT it(column_blocks);
1145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1146 ColSegment* seg = it.data();
1147 TBOX box = seg->bounding_box();
1148 int num_table_cells = 0;
1149 int num_text_cells = 0;
1151 rsearch(&clean_part_grid_);
1152 rsearch.SetUniqueMode(true);
1153 rsearch.StartRectSearch(box);
1154 ColPartition* part = nullptr;
1155 while ((part = rsearch.NextRectSearch()) != nullptr) {
1156 if (part->type() == PT_TABLE) {
1157 num_table_cells++;
1158 } else if (part->type() == PT_FLOWING_TEXT) {
1159 num_text_cells++;
1160 }
1161 }
1162 // If a column block has no text or table partition in it, it is not needed
1163 // for table detection.
1164 if (!num_table_cells && !num_text_cells) {
1165 delete it.extract();
1166 } else {
1167 seg->set_num_table_cells(num_table_cells);
1168 seg->set_num_text_cells(num_text_cells);
1169 // set column type based on the ratio of table to text cells
1170 seg->set_type();
1171 }
1172 }
1173}
1174
1175// Move column blocks to grid
1176void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1177 ColSegmentGrid *col_seg_grid) {
1178 ColSegment_IT it(segments);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180 ColSegment* seg = it.extract();
1181 col_seg_grid->InsertBBox(true, true, seg);
1182 }
1183}
1184
1185// Merge column blocks if a split is detected due to the presence of a
1186// table. A text block is considered split if it has multiple
1187// neighboring blocks above/below it, and at least one of the
1188// neighboring blocks is of table type (has a high density of table
1189// partitions). In this case neighboring blocks in the direction
1190// (above/below) of the table block are merged with the text block.
1191
1192// Comment: This method does not handle split due to a full page table
1193// since table columns in this case do not have a text column on which
1194// split decision can be based.
1196 int margin = gridsize();
1197
1198 // Iterate the Column Blocks in the grid.
1200 gsearch(&col_seg_grid_);
1201 gsearch.StartFullSearch();
1202 ColSegment* seg;
1203 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1204 if (seg->type() != COL_TEXT)
1205 continue; // only consider text blocks for split detection
1206 bool neighbor_found = false;
1207 bool modified = false; // Modified at least once
1208 // keep expanding current box as long as neighboring table columns
1209 // are found above or below it.
1210 do {
1211 TBOX box = seg->bounding_box();
1212 // slightly expand the search region vertically
1213 int top_range = std::min(box.top() + margin, static_cast<int>(tright().y()));
1214 int bottom_range = std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1215 box.set_top(top_range);
1216 box.set_bottom(bottom_range);
1217 neighbor_found = false;
1219 rectsearch(&col_seg_grid_);
1220 rectsearch.StartRectSearch(box);
1221 ColSegment* neighbor = nullptr;
1222 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1223 if (neighbor == seg)
1224 continue;
1225 const TBOX& neighbor_box = neighbor->bounding_box();
1226 // If the neighbor box significantly overlaps with the current
1227 // box (due to the expansion of the current box in the
1228 // previous iteration of this loop), remove the neighbor box
1229 // and expand the current box to include it.
1230 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1231 seg->InsertBox(neighbor_box);
1232 modified = true;
1233 rectsearch.RemoveBBox();
1234 gsearch.RepositionIterator();
1235 delete neighbor;
1236 continue;
1237 }
1238 // Only expand if the neighbor box is of table type
1239 if (neighbor->type() != COL_TABLE)
1240 continue;
1241 // Insert the neighbor box into the current column block
1242 if (neighbor_box.major_x_overlap(box) &&
1243 !box.contains(neighbor_box)) {
1244 seg->InsertBox(neighbor_box);
1245 neighbor_found = true;
1246 modified = true;
1247 rectsearch.RemoveBBox();
1248 gsearch.RepositionIterator();
1249 delete neighbor;
1250 }
1251 }
1252 } while (neighbor_found);
1253 if (modified) {
1254 // Because the box has changed, it has to be removed first.
1255 gsearch.RemoveBBox();
1256 col_seg_grid_.InsertBBox(true, true, seg);
1257 gsearch.RepositionIterator();
1258 }
1259 }
1260}
1261
1262// Group horizontally overlapping table partitions into table columns.
1263// TODO(faisal): This is too aggressive at the moment. The method should
1264// consider more attributes to group table partitions together. Some common
1265// errors are:
1266// 1- page number is merged with a table column above it even
1267// if there is a large vertical gap between them.
1268// 2- column headers go on to catch one of the columns arbitrarily
1269// 3- an isolated noise blob near page top or bottom merges with the table
1270// column below/above it
1271// 4- cells from two vertically adjacent tables merge together to make a
1272// single column resulting in merging of the two tables
1273void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1274 ColSegment_IT it(table_columns);
1275 // Iterate the ColPartitions in the grid.
1277 gsearch(&clean_part_grid_);
1278 gsearch.StartFullSearch();
1279 ColPartition* part;
1280 while ((part = gsearch.NextFullSearch()) != nullptr) {
1281 if (part->inside_table_column() || part->type() != PT_TABLE)
1282 continue; // prevent a partition to be assigned to multiple columns
1283 const TBOX& box = part->bounding_box();
1284 auto* col = new ColSegment();
1285 col->InsertBox(box);
1286 part->set_inside_table_column(true);
1287 // Start a search below the current cell to find bottom neighbours
1288 // Note: a full search will always process things above it first, so
1289 // this should be starting at the highest cell and working its way down.
1291 vsearch(&clean_part_grid_);
1292 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1293 ColPartition* neighbor = nullptr;
1294 bool found_neighbours = false;
1295 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1296 // only consider neighbors not assigned to any column yet
1297 if (neighbor->inside_table_column())
1298 continue;
1299 // Horizontal lines should not break the flow
1300 if (neighbor->IsHorizontalLine())
1301 continue;
1302 // presence of a non-table neighbor marks the end of current
1303 // table column
1304 if (neighbor->type() != PT_TABLE)
1305 break;
1306 // add the neighbor partition to the table column
1307 const TBOX& neighbor_box = neighbor->bounding_box();
1308 col->InsertBox(neighbor_box);
1309 neighbor->set_inside_table_column(true);
1310 found_neighbours = true;
1311 }
1312 if (found_neighbours) {
1313 it.add_after_then_move(col);
1314 } else {
1315 part->set_inside_table_column(false);
1316 delete col;
1317 }
1318 }
1319}
1320
1321// Mark regions in a column that are x-bounded by the column boundaries and
1322// y-bounded by the table columns' projection on the y-axis as table regions
1323void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
1324 ColSegment_LIST* table_regions) {
1325 ColSegment_IT cit(table_columns);
1326 ColSegment_IT rit(table_regions);
1327 // Iterate through column blocks
1329 gsearch(&col_seg_grid_);
1330 gsearch.StartFullSearch();
1331 ColSegment* part;
1332 int page_height = tright().y() - bleft().y();
1333 ASSERT_HOST(page_height > 0);
1334 // create a bool array to hold projection on y-axis
1335 bool* table_region = new bool[page_height];
1336 while ((part = gsearch.NextFullSearch()) != nullptr) {
1337 const TBOX& part_box = part->bounding_box();
1338 // reset the projection array
1339 for (int i = 0; i < page_height; i++) {
1340 table_region[i] = false;
1341 }
1342 // iterate through all table columns to find regions in the current
1343 // page column block
1344 cit.move_to_first();
1345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346 TBOX col_box = cit.data()->bounding_box();
1347 // find intersection region of table column and page column
1348 TBOX intersection_box = col_box.intersection(part_box);
1349 // project table column on the y-axis
1350 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1351 table_region[i - bleft().y()] = true;
1352 }
1353 }
1354 // set x-limits of table regions to page column width
1355 TBOX current_table_box;
1356 current_table_box.set_left(part_box.left());
1357 current_table_box.set_right(part_box.right());
1358 // go through the y-axis projection to find runs of table
1359 // regions. Each run makes one table region.
1360 for (int i = 1; i < page_height; i++) {
1361 // detect start of a table region
1362 if (!table_region[i - 1] && table_region[i]) {
1363 current_table_box.set_bottom(i + bleft().y());
1364 }
1365 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1366 // detect end of a table region
1367 if (table_region[i - 1] && !table_region[i]) {
1368 current_table_box.set_top(i + bleft().y());
1369 if (!current_table_box.null_box()) {
1370 auto* seg = new ColSegment();
1371 seg->InsertBox(current_table_box);
1372 rit.add_after_then_move(seg);
1373 }
1374 }
1375 }
1376 }
1377 delete[] table_region;
1378}
1379
1380// Merge table regions corresponding to tables spanning multiple columns if
1381// there is a colpartition (horizontal ruling line or normal text) that
1382// touches both regions.
1383// TODO(faisal): A rare error occurs if there are two horizontally adjacent
1384// tables with aligned ruling lines. In this case, line finder returns a
1385// single line and hence the tables get merged together
1387 // Iterate the table regions in the grid.
1389 gsearch(&table_grid_);
1390 gsearch.StartFullSearch();
1391 ColSegment* seg = nullptr;
1392 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1393 bool neighbor_found = false;
1394 bool modified = false; // Modified at least once
1395 do {
1396 // Start a rectangle search x-bounded by the image and y by the table
1397 const TBOX& box = seg->bounding_box();
1398 TBOX search_region(box);
1399 search_region.set_left(bleft().x());
1400 search_region.set_right(tright().x());
1401 neighbor_found = false;
1403 rectsearch(&table_grid_);
1404 rectsearch.StartRectSearch(search_region);
1405 ColSegment* neighbor = nullptr;
1406 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1407 if (neighbor == seg)
1408 continue;
1409 const TBOX& neighbor_box = neighbor->bounding_box();
1410 // Check if a neighbor box has a large overlap with the table
1411 // region. This may happen as a result of merging two table
1412 // regions in the previous iteration.
1413 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1414 seg->InsertBox(neighbor_box);
1415 rectsearch.RemoveBBox();
1416 gsearch.RepositionIterator();
1417 delete neighbor;
1418 modified = true;
1419 continue;
1420 }
1421 // Check if two table regions belong together based on a common
1422 // horizontal ruling line
1423 if (BelongToOneTable(box, neighbor_box)) {
1424 seg->InsertBox(neighbor_box);
1425 neighbor_found = true;
1426 modified = true;
1427 rectsearch.RemoveBBox();
1428 gsearch.RepositionIterator();
1429 delete neighbor;
1430 }
1431 }
1432 } while (neighbor_found);
1433 if (modified) {
1434 // Because the box has changed, it has to be removed first.
1435 gsearch.RemoveBBox();
1436 table_grid_.InsertBBox(true, true, seg);
1437 gsearch.RepositionIterator();
1438 }
1439 }
1440}
1441
1442// Decide if two table regions belong to one table based on a common
1443// horizontal ruling line or another colpartition
1444bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1445 // Check the obvious case. Most likely not true because overlapping boxes
1446 // should already be merged, but seems like a good thing to do in case things
1447 // change.
1448 if (box1.overlap(box2))
1449 return true;
1450 // Check for ColPartitions spanning both table regions
1451 TBOX bbox = box1.bounding_union(box2);
1452 // Start a rect search on bbox
1454 rectsearch(&clean_part_grid_);
1455 rectsearch.StartRectSearch(bbox);
1456 ColPartition* part = nullptr;
1457 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1458 const TBOX& part_box = part->bounding_box();
1459 // return true if a colpartition spanning both table regions is found
1460 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1461 !part->IsImageType())
1462 return true;
1463 }
1464 return false;
1465}
1466
1467// Adjust table boundaries by:
1468// - building a tight bounding box around all ColPartitions contained in it.
1469// - expanding table boundaries to include all colpartitions that overlap the
1470// table by more than half of their area
1471// - expanding table boundaries to include nearby horizontal rule lines
1472// - expanding table vertically to include left out column headers
1473// TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1474// makes following errors:
1475// 1- horizontal lines consisting of underlines are included in the table if
1476// they are close enough
1477// 2- horizontal lines originating from noise tend to get merged with a table
1478// near the top of the page
1479// 3- the criteria for including horizontal lines is very generous. Many times
1480// horizontal lines separating headers and footers get merged with a
1481// single-column table in a multi-column page thereby including text
1482// from the neighboring column inside the table
1483// 4- the criteria for including left out column headers also tends to
1484// occasionally include text-lines above the tables, typically from
1485// table caption
1487 // Iterate the table regions in the grid
1488 ColSegment_CLIST adjusted_tables;
1489 ColSegment_C_IT it(&adjusted_tables);
1491 gsearch.StartFullSearch();
1492 ColSegment* table = nullptr;
1493 while ((table = gsearch.NextFullSearch()) != nullptr) {
1494 const TBOX& table_box = table->bounding_box();
1495 TBOX grown_box = table_box;
1496 GrowTableBox(table_box, &grown_box);
1497 // To prevent a table from expanding again, do not insert the
1498 // modified box back to the grid. Instead move it to a list and
1499 // and remove it from the grid. The list is moved later back to the grid.
1500 if (!grown_box.null_box()) {
1501 auto* col = new ColSegment();
1502 col->InsertBox(grown_box);
1503 it.add_after_then_move(col);
1504 }
1505 gsearch.RemoveBBox();
1506 delete table;
1507 }
1508 // clear table grid to move final tables in it
1509 // TODO(nbeato): table_grid_ should already be empty. The above loop
1510 // removed everything. Maybe just assert it is empty?
1512 it.move_to_first();
1513 // move back final tables to table_grid_
1514 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1515 ColSegment* seg = it.extract();
1516 table_grid_.InsertBBox(true, true, seg);
1517 }
1518}
1519
1520void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
1521 // TODO(nbeato): The growing code is a bit excessive right now.
1522 // By removing these lines, the partitions considered need
1523 // to have some overlap or be special cases. These lines could
1524 // be added again once a check is put in place to make sure that
1525 // growing tables don't stomp on a lot of non-table partitions.
1526
1527 // search for horizontal ruling lines within the vertical margin
1528 // int vertical_margin = kRulingVerticalMargin * gridsize();
1529 TBOX search_box = table_box;
1530 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1531 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1532 // search_box.set_top(top);
1533 // search_box.set_bottom(bottom);
1534
1535 GrowTableToIncludePartials(table_box, search_box, result_box);
1536 GrowTableToIncludeLines(table_box, search_box, result_box);
1537 IncludeLeftOutColumnHeaders(result_box);
1538}
1539
1540// Grow a table by increasing the size of the box to include
1541// partitions with significant overlap with the table.
1543 const TBOX& search_range,
1544 TBOX* result_box) {
1545 // Rulings are in a different grid, so search 2 grids for rulings, text,
1546 // and table partitions that are not entirely within the new box.
1547 for (int i = 0; i < 2; ++i) {
1548 ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1550 ColPartitionGridSearch rectsearch(grid);
1551 rectsearch.StartRectSearch(search_range);
1552 ColPartition* part = nullptr;
1553 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1554 // Only include text and table types.
1555 if (part->IsImageType())
1556 continue;
1557 const TBOX& part_box = part->bounding_box();
1558 // Include partition in the table if more than half of it
1559 // is covered by the table
1560 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1561 *result_box = result_box->bounding_union(part_box);
1562 continue;
1563 }
1564 }
1565 }
1566}
1567
1568// Grow a table by expanding to the extents of significantly
1569// overlapping lines.
1571 const TBOX& search_range,
1572 TBOX* result_box) {
1574 rsearch.SetUniqueMode(true);
1575 rsearch.StartRectSearch(search_range);
1576 ColPartition* part = nullptr;
1577 while ((part = rsearch.NextRectSearch()) != nullptr) {
1578 // TODO(nbeato) This should also do vertical, but column
1579 // boundaries are breaking things. This function needs to be
1580 // updated to allow vertical lines as well.
1581 if (!part->IsLineType())
1582 continue;
1583 // Avoid the following function call if the result of the
1584 // function is irrelevant.
1585 const TBOX& part_box = part->bounding_box();
1586 if (result_box->contains(part_box))
1587 continue;
1588 // Include a partially overlapping horizontal line only if the
1589 // extra ColPartitions that will be included due to expansion
1590 // have large side spacing w.r.t. columns containing them.
1591 if (HLineBelongsToTable(*part, table_box))
1592 *result_box = result_box->bounding_union(part_box);
1593 // TODO(nbeato): Vertical
1594 }
1595}
1596
1597// Checks whether the horizontal line belong to the table by looking at the
1598// side spacing of extra ColParitions that will be included in the table
1599// due to expansion
1601 const TBOX& table_box) {
1602 if (!part.IsHorizontalLine())
1603 return false;
1604 const TBOX& part_box = part.bounding_box();
1605 if (!part_box.major_x_overlap(table_box))
1606 return false;
1607 // Do not consider top-most horizontal line since it usually
1608 // originates from noise.
1609 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1610 // have neighbors solved.
1611 // if (!part.nearest_neighbor_above())
1612 // return false;
1613 const TBOX bbox = part_box.bounding_union(table_box);
1614 // In the "unioned table" box (the table extents expanded by the line),
1615 // keep track of how many partitions have significant padding to the left
1616 // and right. If more than half of the partitions covered by the new table
1617 // have significant spacing, the line belongs to the table and the table
1618 // grows to include all of the partitions.
1619 int num_extra_partitions = 0;
1620 int extra_space_to_right = 0;
1621 int extra_space_to_left = 0;
1622 // Rulings are in a different grid, so search 2 grids for rulings, text,
1623 // and table partitions that are introduced by the new box.
1624 for (int i = 0; i < 2; ++i) {
1625 ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1627 // Start a rect search on bbox
1628 ColPartitionGridSearch rectsearch(grid);
1629 rectsearch.SetUniqueMode(true);
1630 rectsearch.StartRectSearch(bbox);
1631 ColPartition* extra_part = nullptr;
1632 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1633 // ColPartition already in table
1634 const TBOX& extra_part_box = extra_part->bounding_box();
1635 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1636 continue;
1637 // Non-text ColPartitions do not contribute
1638 if (extra_part->IsImageType())
1639 continue;
1640 // Consider this partition.
1641 num_extra_partitions++;
1642 // presence of a table cell is a strong hint, so just increment the scores
1643 // without looking at the spacing.
1644 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1645 extra_space_to_right++;
1646 extra_space_to_left++;
1647 continue;
1648 }
1649 int space_threshold = kSideSpaceMargin * part.median_height();
1650 if (extra_part->space_to_right() > space_threshold)
1651 extra_space_to_right++;
1652 if (extra_part->space_to_left() > space_threshold)
1653 extra_space_to_left++;
1654 }
1655 }
1656 // tprintf("%d %d %d\n",
1657 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1658 return (extra_space_to_right > num_extra_partitions / 2) ||
1659 (extra_space_to_left > num_extra_partitions / 2);
1660}
1661
1662// Look for isolated column headers above the given table box and
1663// include them in the table
1665 // Start a search above the current table to look for column headers
1667 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1668 table_box->top());
1669 ColPartition* neighbor = nullptr;
1670 ColPartition* previous_neighbor = nullptr;
1671 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1672 // Max distance to find a table heading.
1673 const int max_distance = kMaxColumnHeaderDistance *
1674 neighbor->median_height();
1675 int table_top = table_box->top();
1676 const TBOX& box = neighbor->bounding_box();
1677 // Do not continue if the next box is way above
1678 if (box.bottom() - table_top > max_distance)
1679 break;
1680 // Unconditionally include partitions of type TABLE or LINE
1681 // TODO(faisal): add some reasonable conditions here
1682 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1683 table_box->set_top(box.top());
1684 previous_neighbor = nullptr;
1685 continue;
1686 }
1687 // If there are two text partitions, one above the other, without a table
1688 // cell on their left or right side, consider them a barrier and quit
1689 if (previous_neighbor == nullptr) {
1690 previous_neighbor = neighbor;
1691 } else {
1692 const TBOX& previous_box = previous_neighbor->bounding_box();
1693 if (!box.major_y_overlap(previous_box))
1694 break;
1695 }
1696 }
1697}
1698
1699// Remove false alarms consiting of a single column based on their
1700// projection on the x-axis. Projection of a real table on the x-axis
1701// should have at least one zero-valley larger than the global median
1702// x-height of the page.
1704 int page_width = tright().x() - bleft().x();
1705 ASSERT_HOST(page_width > 0);
1706 // create an integer array to hold projection on x-axis
1707 int* table_xprojection = new int[page_width];
1708 // Iterate through all tables in the table grid
1710 table_search(&table_grid_);
1711 table_search.StartFullSearch();
1712 ColSegment* table;
1713 while ((table = table_search.NextFullSearch()) != nullptr) {
1714 TBOX table_box = table->bounding_box();
1715 // reset the projection array
1716 for (int i = 0; i < page_width; i++) {
1717 table_xprojection[i] = 0;
1718 }
1719 // Start a rect search on table_box
1721 rectsearch(&clean_part_grid_);
1722 rectsearch.SetUniqueMode(true);
1723 rectsearch.StartRectSearch(table_box);
1724 ColPartition* part;
1725 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1726 if (!part->IsTextType())
1727 continue; // Do not consider non-text partitions
1728 if (part->flow() == BTFT_LEADER)
1729 continue; // Assume leaders are in tables
1730 TBOX part_box = part->bounding_box();
1731 // Do not consider partitions partially covered by the table
1732 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1733 continue;
1734 BLOBNBOX_CLIST* part_boxes = part->boxes();
1735 BLOBNBOX_C_IT pit(part_boxes);
1736
1737 // Make sure overlapping blobs don't artificially inflate the number
1738 // of rows in the table. This happens frequently with things such as
1739 // decimals and split characters. Do this by assuming the column
1740 // partition is sorted mostly left to right and just clip
1741 // bounding boxes by the previous box's extent.
1742 int next_position_to_write = 0;
1743
1744 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1745 BLOBNBOX *pblob = pit.data();
1746 // ignore blob height for the purpose of projection since we
1747 // are only interested in finding valleys
1748 int xstart = pblob->bounding_box().left();
1749 int xend = pblob->bounding_box().right();
1750
1751 xstart = std::max(xstart, next_position_to_write);
1752 for (int i = xstart; i < xend; i++)
1753 table_xprojection[i - bleft().x()]++;
1754 next_position_to_write = xend;
1755 }
1756 }
1757 // Find largest valley between two reasonable peaks in the table
1758 if (!GapInXProjection(table_xprojection, page_width)) {
1759 table_search.RemoveBBox();
1760 delete table;
1761 }
1762 }
1763 delete[] table_xprojection;
1764}
1765
1766// Return true if at least one gap larger than the global x-height
1767// exists in the horizontal projection
1768bool TableFinder::GapInXProjection(int* xprojection, int length) {
1769 // Find peak value of the histogram
1770 int peak_value = 0;
1771 for (int i = 0; i < length; i++) {
1772 if (xprojection[i] > peak_value) {
1773 peak_value = xprojection[i];
1774 }
1775 }
1776 // Peak value represents the maximum number of horizontally
1777 // overlapping colpartitions, so this can be considered as the
1778 // number of rows in the table
1779 if (peak_value < kMinRowsInTable)
1780 return false;
1781 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1782 if (peak_value >= kLargeTableRowCount)
1783 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1784 // Threshold the histogram
1785 for (int i = 0; i < length; i++) {
1786 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1787 }
1788 // Find the largest run of zeros between two ones
1789 int largest_gap = 0;
1790 int run_start = -1;
1791 for (int i = 1; i < length; i++) {
1792 // detect start of a run of zeros
1793 if (xprojection[i - 1] && !xprojection[i]) {
1794 run_start = i;
1795 }
1796 // detect end of a run of zeros and update the value of largest gap
1797 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1798 int gap = i - run_start;
1799 if (gap > largest_gap)
1800 largest_gap = gap;
1801 run_start = -1;
1802 }
1803 }
1804 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1805}
1806
1807// Given the location of a table "guess", try to overlay a cellular
1808// grid in the location, adjusting the boundaries.
1809// TODO(nbeato): Falsely introduces:
1810// -headers/footers (not any worse, too much overlap destroys cells)
1811// -page numbers (not worse, included because maximize margins)
1812// -equations (nicely fit into a celluar grid, but more sparsely)
1813// -figures (random text box, also sparse)
1814// -small left-aligned text areas with overlapping positioned whitespace
1815// (rejected before)
1816// Overall, this just needs some more work.
1818 ScrollView* table_win = nullptr;
1819 if (textord_show_tables) {
1820 table_win = MakeWindow(0, 0, "Table Structure");
1823 // table_grid_.DisplayBoxes(table_win);
1824 }
1825
1826
1827 TableRecognizer recognizer;
1828 recognizer.Init();
1832 recognizer.set_min_height(1.5 * gridheight());
1833 // Loop over all of the tables and try to fit them.
1834 // Store the good tables here.
1835 ColSegment_CLIST good_tables;
1836 ColSegment_C_IT good_it(&good_tables);
1837
1839 gsearch.StartFullSearch();
1840 ColSegment* found_table = nullptr;
1841 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1842 gsearch.RemoveBBox();
1843
1844 // The goal is to make the tables persistent in a list.
1845 // When that happens, this will move into the search loop.
1846 const TBOX& found_box = found_table->bounding_box();
1847 StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1848
1849 // Process a table. Good tables are inserted into the grid again later on
1850 // We can't change boxes in the grid while it is running a search.
1851 if (table_structure != nullptr) {
1852 if (textord_show_tables) {
1853 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1854 }
1855 found_table->set_bounding_box(table_structure->bounding_box());
1856 delete table_structure;
1857 good_it.add_after_then_move(found_table);
1858 } else {
1859 delete found_table;
1860 }
1861 }
1862 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1863
1864 // At this point, the grid is empty. We can safely insert the good tables
1865 // back into grid.
1866 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1867 table_grid_.InsertBBox(true, true, good_it.extract());
1868}
1869
1870// Displays the column segments in some window.
1872 ColSegment_LIST *segments,
1873 ScrollView::Color color) {
1874#ifndef GRAPHICS_DISABLED
1875 win->Pen(color);
1876 win->Brush(ScrollView::NONE);
1877 ColSegment_IT it(segments);
1878 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1879 ColSegment* col = it.data();
1880 const TBOX& box = col->bounding_box();
1881 int left_x = box.left();
1882 int right_x = box.right();
1883 int top_y = box.top();
1884 int bottom_y = box.bottom();
1885 win->Rectangle(left_x, bottom_y, right_x, top_y);
1886 }
1887 win->UpdateWindow();
1888#endif
1889}
1890
1892 ScrollView::Color color) {
1893#ifndef GRAPHICS_DISABLED
1894 // Iterate the ColPartitions in the grid.
1896 gsearch(grid);
1897 gsearch.StartFullSearch();
1898 ColSegment* seg = nullptr;
1899 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1900 const TBOX& box = seg->bounding_box();
1901 int left_x = box.left();
1902 int right_x = box.right();
1903 int top_y = box.top();
1904 int bottom_y = box.bottom();
1905 win->Brush(ScrollView::NONE);
1906 win->Pen(color);
1907 win->Rectangle(left_x, bottom_y, right_x, top_y);
1908 }
1909 win->UpdateWindow();
1910#endif
1911}
1912
1913// Displays the colpartitions using a new coloring on an existing window.
1914// Note: This method is only for debug purpose during development and
1915// would not be part of checked in code
1917 ColPartitionGrid* grid,
1918 ScrollView::Color default_color,
1919 ScrollView::Color table_color) {
1920#ifndef GRAPHICS_DISABLED
1921 ScrollView::Color color = default_color;
1922 // Iterate the ColPartitions in the grid.
1924 gsearch(grid);
1925 gsearch.StartFullSearch();
1926 ColPartition* part = nullptr;
1927 while ((part = gsearch.NextFullSearch()) != nullptr) {
1928 color = default_color;
1929 if (part->type() == PT_TABLE)
1930 color = table_color;
1931
1932 const TBOX& box = part->bounding_box();
1933 int left_x = box.left();
1934 int right_x = box.right();
1935 int top_y = box.top();
1936 int bottom_y = box.bottom();
1937 win->Brush(ScrollView::NONE);
1938 win->Pen(color);
1939 win->Rectangle(left_x, bottom_y, right_x, top_y);
1940 }
1941 win->UpdateWindow();
1942#endif
1943}
1945 ColPartitionGrid* grid,
1946 ScrollView::Color default_color) {
1947 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1948}
1949
1951 ScrollView* win,
1952 ColPartitionGrid* grid,
1953 ScrollView::Color color) {
1954#ifndef GRAPHICS_DISABLED
1955 // Iterate the ColPartitions in the grid.
1957 gsearch(grid);
1958 gsearch.StartFullSearch();
1959 ColPartition* part = nullptr;
1960 while ((part = gsearch.NextFullSearch()) != nullptr) {
1961 const TBOX& box = part->bounding_box();
1962 int left_x = box.left();
1963 int right_x = box.right();
1964 int top_y = box.top();
1965 int bottom_y = box.bottom();
1966
1967 ColPartition* upper_part = part->nearest_neighbor_above();
1968 if (upper_part) {
1969 const TBOX& upper_box = upper_part->bounding_box();
1970 int mid_x = (left_x + right_x) / 2;
1971 int mid_y = (top_y + bottom_y) / 2;
1972 int other_x = (upper_box.left() + upper_box.right()) / 2;
1973 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1974 win->Brush(ScrollView::NONE);
1975 win->Pen(color);
1976 win->Line(mid_x, mid_y, other_x, other_y);
1977 }
1978 ColPartition* lower_part = part->nearest_neighbor_below();
1979 if (lower_part) {
1980 const TBOX& lower_box = lower_part->bounding_box();
1981 int mid_x = (left_x + right_x) / 2;
1982 int mid_y = (top_y + bottom_y) / 2;
1983 int other_x = (lower_box.left() + lower_box.right()) / 2;
1984 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1985 win->Brush(ScrollView::NONE);
1986 win->Pen(color);
1987 win->Line(mid_x, mid_y, other_x, other_y);
1988 }
1989 }
1990 win->UpdateWindow();
1991#endif
1992}
1993
1994// Merge all colpartitions in table regions to make them a single
1995// colpartition and revert types of isolated table cells not
1996// assigned to any table to their original types.
1998 ColPartitionSet** all_columns,
1999 WidthCallback* width_cb) {
2000 // Since we have table blocks already, remove table tags from all
2001 // colpartitions
2003 gsearch(grid);
2004 gsearch.StartFullSearch();
2005 ColPartition* part = nullptr;
2006
2007 while ((part = gsearch.NextFullSearch()) != nullptr) {
2008 if (part->type() == PT_TABLE) {
2009 part->clear_table_type();
2010 }
2011 }
2012 // Now make a single colpartition out of each table block and remove
2013 // all colpartitions contained within a table
2015 table_search(&table_grid_);
2016 table_search.StartFullSearch();
2017 ColSegment* table;
2018 while ((table = table_search.NextFullSearch()) != nullptr) {
2019 const TBOX& table_box = table->bounding_box();
2020 // Start a rect search on table_box
2022 rectsearch(grid);
2023 rectsearch.StartRectSearch(table_box);
2024 ColPartition* part;
2025 ColPartition* table_partition = nullptr;
2026 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2027 // Do not consider image partitions
2028 if (!part->IsTextType())
2029 continue;
2030 TBOX part_box = part->bounding_box();
2031 // Include partition in the table if more than half of it
2032 // is covered by the table
2033 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2034 rectsearch.RemoveBBox();
2035 if (table_partition) {
2036 table_partition->Absorb(part, width_cb);
2037 } else {
2038 table_partition = part;
2039 }
2040 }
2041 }
2042 // Insert table colpartition back to part_grid_
2043 if (table_partition) {
2044 // To match the columns used when transforming to blocks, the new table
2045 // partition must have its first and last column set at the grid y that
2046 // corresponds to its bottom.
2047 const TBOX& table_box = table_partition->bounding_box();
2048 int grid_x, grid_y;
2049 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2050 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2051 table_partition->set_table_type();
2052 table_partition->set_blob_type(BRT_TEXT);
2053 table_partition->set_flow(BTFT_CHAIN);
2054 table_partition->SetBlobTypes();
2055 grid->InsertBBox(true, true, table_partition);
2056 }
2057 }
2058}
2059
2063 : ELIST_LINK(),
2064 num_table_cells_(0),
2065 num_text_cells_(0),
2066 type_(COL_UNKNOWN) {
2067}
2068
2069// Provides a color for BBGrid to draw the rectangle.
2071 const ScrollView::Color kBoxColors[PT_COUNT] = {
2076 };
2077 return kBoxColors[type_];
2078}
2079
2080// Insert a box into this column segment
2081void ColSegment::InsertBox(const TBOX& other) {
2082 bounding_box_ = bounding_box_.bounding_union(other);
2083}
2084
2085// Set column segment type based on the ratio of text and table partitions
2086// in it.
2088 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
2089 type_ = COL_TABLE;
2090 else if (num_text_cells_ > num_table_cells_)
2091 type_ = COL_TEXT;
2092 else
2093 type_ = COL_MIXED;
2094}
2095
2096} // namespace tesseract.
@ PT_COUNT
Definition: capi.h:144
@ PT_TABLE
Definition: capi.h:135
@ PT_PULLOUT_IMAGE
Definition: capi.h:140
@ PT_HEADING_IMAGE
Definition: capi.h:139
@ PT_FLOWING_TEXT
Definition: capi.h:130
@ PT_UNKNOWN
Definition: capi.h:129
@ PT_FLOWING_IMAGE
Definition: capi.h:138
@ BTFT_LEADER
Definition: blobbox.h:121
@ BTFT_CHAIN
Definition: blobbox.h:118
@ BRT_TEXT
Definition: blobbox.h:80
@ BRT_NOISE
Definition: blobbox.h:73
#define CLISTIZE(CLASSNAME)
Definition: clst.h:891
#define ELISTIZE(CLASSNAME)
Definition: elst.h:931
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
const int kMaxVerticalSpacing
Definition: tablefind.cpp:37
const double kAllowBlobHeight
Definition: tablefind.cpp:55
const double kMinOverlapWithTable
Definition: tablefind.cpp:96
const double kMaxTableCellXheight
Definition: tablefind.cpp:80
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:125
@ COL_TEXT
Definition: tablefind.h:32
@ COL_MIXED
Definition: tablefind.h:34
@ COL_TABLE
Definition: tablefind.h:33
@ COL_UNKNOWN
Definition: tablefind.h:31
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:72
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:106
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:62
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:139
const int kMinRowsInTable
Definition: tablefind.cpp:111
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:131
const double kMaxGapInTextPartition
Definition: tablefind.cpp:68
void DeleteObject(T *object)
Definition: tablefind.cpp:155
const double kTableColumnThreshold
Definition: tablefind.cpp:88
const double kAllowBlobArea
Definition: tablefind.cpp:57
const double kAllowTextArea
Definition: tablefind.cpp:50
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:116
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:140
const int kMaxBlobWidth
Definition: tablefind.cpp:39
const double kAllowTextWidth
Definition: tablefind.cpp:49
const double kAllowTextHeight
Definition: tablefind.cpp:48
const double kAllowBlobWidth
Definition: tablefind.cpp:56
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:84
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:65
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:121
const int kLargeTableRowCount
Definition: tablefind.cpp:108
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:105
const int kSideSpaceMargin
Definition: tablefind.cpp:101
const double kSplitPartitionSize
Definition: tablefind.cpp:43
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:76
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:135
BlobRegionType region_type() const
Definition: blobbox.h:283
const TBOX & bounding_box() const
Definition: blobbox.h:230
BlobTextFlowType flow() const
Definition: blobbox.h:295
integer coordinate
Definition: points.h:32
int16_t y() const
access_function
Definition: points.h:56
int16_t x() const
access function
Definition: points.h:52
Definition: points.h:189
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
int16_t top() const
Definition: rect.h:58
void set_bottom(int y)
Definition: rect.h:68
int16_t width() const
Definition: rect.h:115
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
bool overlap(const TBOX &box) const
Definition: rect.h:355
void set_top(int y)
Definition: rect.h:61
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool null_box() const
Definition: rect.h:50
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
void set_left(int x)
Definition: rect.h:75
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:577
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:788
void SetUniqueMode(bool mode)
Definition: bbgrid.h:253
int GridY() const
Definition: bbgrid.h:245
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:761
BBC * NextRectSearch()
Definition: bbgrid.h:842
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:746
void StartFullSearch()
Definition: bbgrid.h:665
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:802
void RepositionIterator()
Definition: bbgrid.h:892
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:830
BBC * NextFullSearch()
Definition: bbgrid.h:675
int gridsize() const
Definition: bbgrid.h:63
int gridheight() const
Definition: bbgrid.h:69
const ICOORD & bleft() const
Definition: bbgrid.h:72
int gridwidth() const
Definition: bbgrid.h:66
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:52
const ICOORD & tright() const
Definition: bbgrid.h:75
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:613
void Clear()
Definition: bbgrid.h:455
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:445
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:464
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
bool IsImageType() const
Definition: colpartition.h:430
void set_space_to_left(int space)
Definition: colpartition.h:277
BlobTextFlowType flow() const
Definition: colpartition.h:155
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:188
bool IsHorizontalLine() const
Definition: colpartition.h:460
PolyBlockType type() const
Definition: colpartition.h:182
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:391
void set_nearest_neighbor_above(ColPartition *part)
Definition: colpartition.h:253
BlobRegionType blob_type() const
Definition: colpartition.h:149
void AddBox(BLOBNBOX *box)
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:152
const TBOX & bounding_box() const
Definition: colpartition.h:110
void set_nearest_neighbor_below(ColPartition *part)
Definition: colpartition.h:259
ColPartition * nearest_neighbor_above() const
Definition: colpartition.h:250
void set_space_above(int space)
Definition: colpartition.h:265
void set_inside_table_column(bool val)
Definition: colpartition.h:247
bool MatchingSizes(const ColPartition &other) const
int LeftAtY(int y) const
Definition: colpartition.h:341
void set_space_to_right(int space)
Definition: colpartition.h:283
ColPartition * ShallowCopy() const
bool IsInSameColumnAs(const ColPartition &part) const
int space_to_right() const
Definition: colpartition.h:280
ColPartition * SingletonPartner(bool upper)
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_space_below(int space)
Definition: colpartition.h:271
int RightAtY(int y) const
Definition: colpartition.h:345
void Absorb(ColPartition *other, WidthCallback *cb)
ColPartition * nearest_neighbor_below() const
Definition: colpartition.h:256
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:158
void RefinePartitionPartners(bool get_desperate)
ColPartition * ColumnContaining(int x, int y)
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void InsertBox(const TBOX &other)
Definition: tablefind.cpp:2081
void set_bounding_box(const TBOX &other)
Definition: tablefind.h:72
void set_num_table_cells(int n)
Definition: tablefind.h:81
void set_num_text_cells(int n)
Definition: tablefind.h:90
ColSegType type() const
Definition: tablefind.h:94
ScrollView::Color BoxColor() const
Definition: tablefind.cpp:2070
const TBOX & bounding_box() const
Definition: tablefind.h:52
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1871
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1444
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1520
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:1997
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:402
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1664
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
Definition: tablefind.cpp:181
const ICOORD & bleft() const
Definition: tablefind.cpp:387
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1570
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1273
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
Definition: tablefind.cpp:259
const ICOORD & tright() const
Definition: tablefind.cpp:390
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:436
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:538
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:568
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:709
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1323
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:857
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:579
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:946
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1600
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:523
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:759
void DisplayColSegmentGrid(ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
Definition: tablefind.cpp:1891
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1176
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:410
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1768
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:756
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
int gridheight() const
Definition: tablefind.cpp:384
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
Definition: tablefind.cpp:193
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:586
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:762
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:418
void set_left_to_right_language(bool order)
Definition: tablefind.cpp:177
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:502
ColSegmentGrid table_grid_
Definition: tablefind.h:423
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1143
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1542
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:394
void SetVerticalSpacing(ColPartition *part)
Definition: tablefind.cpp:666
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1950
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1916
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:827
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:421
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:489
const TBOX & bounding_box() const
Definition: tablerecog.cpp:109
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:289
void set_max_text_height(int height)
Definition: tablerecog.cpp:731
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:722
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:719
void set_min_height(int height)
Definition: tablerecog.cpp:725
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:735
@ DARK_TURQUOISE
Definition: scrollview.h:147
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void UpdateWindow()
Definition: scrollview.cpp:704
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600
void Brush(Color color)
Definition: scrollview.cpp:725