tesseract 4.1.1
Loading...
Searching...
No Matches
tablerecog.cpp
Go to the documentation of this file.
1
2// File: tablerecog.cpp
3// Description: Helper class to help structure table areas. Given an bounding
4// box from TableFinder, the TableRecognizer should give a
5// StructuredTable (maybe a list in the future) of "good" tables
6// in that area.
7// Author: Nicholas Beato
8// Created: Friday, Aug. 20, 2010
9//
10// (C) Copyright 2009, Google Inc.
11// Licensed under the Apache License, Version 2.0 (the "License");
12// you may not use this file except in compliance with the License.
13// You may obtain a copy of the License at
14// http://www.apache.org/licenses/LICENSE-2.0
15// Unless required by applicable law or agreed to in writing, software
16// distributed under the License is distributed on an "AS IS" BASIS,
17// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18// See the License for the specific language governing permissions and
19// limitations under the License.
20//
22
23#ifdef HAVE_CONFIG_H
24#include "config_auto.h"
25#endif
26
27#include "tablerecog.h"
28
29#include <algorithm>
30
31namespace tesseract {
32
33// The amount of space required between the ColPartitions in 2 columns
34// of a non-lined table as a multiple of the median width.
35const double kHorizontalSpacing = 0.30;
36// The amount of space required between the ColPartitions in 2 rows
37// of a non-lined table as multiples of the median height.
38const double kVerticalSpacing = -0.2;
39// The number of cells that the grid lines may intersect.
40// See FindCellSplitLocations for explanation.
43// For "lined tables", the number of required lines. Currently a guess.
46// Number of columns required, as a fraction of the most columns found.
47// None of these are tweaked at all.
48const double kRequiredColumns = 0.7;
49// The tolerance for comparing margins of potential tables.
50const double kMarginFactor = 1.1;
51// The first and last row should be consistent cell height.
52// This factor is the first and last row cell height max.
53const double kMaxRowSize = 2.5;
54// Number of filled columns required to form a strong table row.
55// For small tables, this is an absolute number.
56const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
58 sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
59// For large tables, it is a relative number
61// The amount of area that must be covered in a cell by ColPartitions to
62// be considered "filled"
63const double kMinFilledArea = 0.35;
64
68
70 : text_grid_(nullptr),
71 line_grid_(nullptr),
72 is_lined_(false),
73 space_above_(0),
74 space_below_(0),
75 space_left_(0),
76 space_right_(0),
77 median_cell_height_(0),
78 median_cell_width_(0),
79 max_text_height_(INT32_MAX) {
80}
81
83}
84
86 text_grid_ = text_grid;
87}
89 line_grid_ = line_grid;
90}
92 max_text_height_ = height;
93}
95 return is_lined_;
96}
98 return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
99}
101 return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
102}
104 return row_count() * column_count();
105}
107 bounding_box_ = box;
108}
110 return bounding_box_;
111}
113 return median_cell_height_;
114}
116 return median_cell_width_;
117}
118int StructuredTable::row_height(int row) const {
119 ASSERT_HOST(0 <= row && row < row_count());
120 return cell_y_[row + 1] - cell_y_[row];
121}
122int StructuredTable::column_width(int column) const {
123 ASSERT_HOST(0 <= column && column < column_count());
124 return cell_x_[column + 1] - cell_x_[column];
125}
127 return space_above_;
128}
130 return space_below_;
131}
132
133// At this point, we know that the lines are contained
134// by the box (by FindLinesBoundingBox).
135// So try to find the cell structure and make sure it works out.
136// The assumption is that all lines span the table. If this
137// assumption fails, the VerifyLinedTable method will
138// abort the lined table. The TableRecognizer will fall
139// back on FindWhitespacedStructure.
142
143 // Search for all of the lines in the current box.
144 // Update the cellular structure with the exact lines.
146 box_search.SetUniqueMode(true);
147 box_search.StartRectSearch(bounding_box_);
148 ColPartition* line = nullptr;
149
150 while ((line = box_search.NextRectSearch()) != nullptr) {
151 if (line->IsHorizontalLine())
152 cell_y_.push_back(line->MidY());
153 if (line->IsVerticalLine())
154 cell_x_.push_back(line->MidX());
155 }
156
157 // HasSignificantLines should guarantee cells.
158 // Because that code is a different class, just gracefully
159 // return false. This could be an assert.
160 if (cell_x_.length() < 3 || cell_y_.length() < 3)
161 return false;
162
163 cell_x_.sort();
164 cell_y_.sort();
165
166 // Remove duplicates that may have occurred due to split lines.
169
170 // The border should be the extents of line boxes, not middle.
175
176 // Remove duplicates that may have occurred due to moving the borders.
179
183 return is_lined_;
184}
185
186// Finds the cellular structure given a particular box.
191
192 if (!VerifyWhitespacedTable()) {
193 return false;
194 } else {
202 return true;
203 }
204}
205
206// Tests if a partition fits inside the table structure.
207// Partitions must fully span a grid line in order to intersect it.
208// This means that a partition does not intersect a line
209// that it "just" touches. This is mainly because the assumption
210// throughout the code is that "0" distance is a very very small space.
212 const TBOX& box = part.bounding_box();
213 for (int i = 0; i < cell_x_.length(); ++i)
214 if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
215 return false;
216 for (int i = 0; i < cell_y_.length(); ++i)
217 if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
218 return false;
219 return true;
220}
221
222// Checks if a sub-table has multiple data cells filled.
224 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
225}
227 return CountFilledCells(row, row, 0, column_count() - 1);
228}
230 return CountFilledCells(0, row_count() - 1, column, column);
231}
232int StructuredTable::CountFilledCells(int row_start, int row_end,
233 int column_start, int column_end) {
234 ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
235 ASSERT_HOST(0 <= column_start && column_start <= column_end &&
236 column_end < column_count());
237 int cell_count = 0;
238 TBOX cell_box;
239 for (int row = row_start; row <= row_end; ++row) {
240 cell_box.set_bottom(cell_y_[row]);
241 cell_box.set_top(cell_y_[row + 1]);
242 for (int col = column_start; col <= column_end; ++col) {
243 cell_box.set_left(cell_x_[col]);
244 cell_box.set_right(cell_x_[col + 1]);
245 if (CountPartitions(cell_box) > 0)
246 ++cell_count;
247 }
248 }
249 return cell_count;
250}
251
252// Makes sure that at least one cell in a row has substantial area filled.
253// This can filter out large whitespace caused by growing tables too far
254// and page numbers.
256 for (int i = 0; i < column_count(); ++i) {
257 double area_filled = CalculateCellFilledPercentage(row, i);
258 if (area_filled >= kMinFilledArea)
259 return true;
260 }
261 return false;
262}
263
264// Finds the filled area in a cell.
265// Assume ColPartitions do not overlap for simplicity (even though they do).
267 ASSERT_HOST(0 <= row && row <= row_count());
268 ASSERT_HOST(0 <= column && column <= column_count());
269 const TBOX kCellBox(cell_x_[column], cell_y_[row],
270 cell_x_[column + 1], cell_y_[row + 1]);
271 ASSERT_HOST(!kCellBox.null_box());
272
274 gsearch.SetUniqueMode(true);
275 gsearch.StartRectSearch(kCellBox);
276 double area_covered = 0;
277 ColPartition* text = nullptr;
278 while ((text = gsearch.NextRectSearch()) != nullptr) {
279 if (text->IsTextType())
280 area_covered += text->bounding_box().intersection(kCellBox).area();
281 }
282 const int32_t current_area = kCellBox.area();
283 if (current_area == 0) {
284 return 1.0;
285 }
286 return std::min(1.0, area_covered / current_area);
287}
288
290#ifndef GRAPHICS_DISABLED
291 window->Brush(ScrollView::NONE);
292 window->Pen(color);
295 for (int i = 0; i < cell_x_.length(); i++) {
296 window->Line(cell_x_[i], bounding_box_.bottom(),
298 }
299 for (int i = 0; i < cell_y_.length(); i++) {
300 window->Line(bounding_box_.left(), cell_y_[i],
302 }
303 window->UpdateWindow();
304#endif
305}
306
307// Clear structure information.
309 cell_x_.clear();
310 cell_y_.clear();
311 is_lined_ = false;
312 space_above_ = 0;
313 space_below_ = 0;
314 space_left_ = 0;
315 space_right_ = 0;
318}
319
320// When a table has lines, the lines should not intersect any partitions.
321// The following function makes sure the previous assumption is met.
323 // Function only called when lines exist.
324 ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
325 for (int i = 0; i < cell_y_.length(); ++i) {
327 return false;
328 }
329 for (int i = 0; i < cell_x_.length(); ++i) {
331 return false;
332 }
333 return true;
334}
335
336// TODO(nbeato): Could be much better than this.
337// Examples:
338// - Caclulate the percentage of filled cells.
339// - Calculate the average number of ColPartitions per cell.
340// - Calculate the number of cells per row with partitions.
341// - Check if ColPartitions in adjacent cells are similar.
342// - Check that all columns are at least a certain width.
343// - etc.
345 // criteria for a table, must be at least 2x3 or 3x2
346 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
347}
348
349// Finds vertical splits in the ColPartitions of text_grid_ by considering
350// all possible "good" guesses. A good guess is just the left/right sides of
351// the partitions, since these locations will uniquely define where the
352// extremal values where the splits can occur. The split happens
353// in the middle of the two nearest partitions.
355 // Set of the extents of all partitions on the page.
356 GenericVectorEqEq<int> left_sides;
357 GenericVectorEqEq<int> right_sides;
358
359 // Look at each text partition. We want to find the partitions
360 // that have extremal left/right sides. These will give us a basis
361 // for the table columns.
363 gsearch.SetUniqueMode(true);
365 ColPartition* text = nullptr;
366 while ((text = gsearch.NextRectSearch()) != nullptr) {
367 if (!text->IsTextType())
368 continue;
369
370 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
371 int spacing = static_cast<int>(text->median_width() *
372 kHorizontalSpacing / 2.0 + 0.5);
373 left_sides.push_back(text->bounding_box().left() - spacing);
374 right_sides.push_back(text->bounding_box().right() + spacing);
375 }
376 // It causes disaster below, so avoid it!
377 if (left_sides.length() == 0 || right_sides.length() == 0)
378 return;
379
380 // Since data may be inserted in grid order, we sort the left/right sides.
381 left_sides.sort();
382 right_sides.sort();
383
384 // At this point, in the "merged list", we expect to have a left side,
385 // followed by either more left sides or a right side. The last number
386 // should be a right side. We find places where the splits occur by looking
387 // for "valleys". If we want to force gap sizes or allow overlap, change
388 // the spacing above. If you want to let lines "slice" partitions as long
389 // as it is infrequent, change the following function.
390 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
391 &cell_x_);
392}
393
394// Finds horizontal splits in the ColPartitions of text_grid_ by considering
395// all possible "good" guesses. A good guess is just the bottom/top sides of
396// the partitions, since these locations will uniquely define where the
397// extremal values where the splits can occur. The split happens
398// in the middle of the two nearest partitions.
400 // Set of the extents of all partitions on the page.
401 GenericVectorEqEq<int> bottom_sides;
402 GenericVectorEqEq<int> top_sides;
403 // We will be "shrinking" partitions, so keep the min/max around to
404 // make sure the bottom/top lines do not intersect text.
405 int min_bottom = INT32_MAX;
406 int max_top = INT32_MIN;
407
408 // Look at each text partition. We want to find the partitions
409 // that have extremal bottom/top sides. These will give us a basis
410 // for the table rows. Because the textlines can be skewed and close due
411 // to warping, the height of the partitions is toned down a little bit.
413 gsearch.SetUniqueMode(true);
415 ColPartition* text = nullptr;
416 while ((text = gsearch.NextRectSearch()) != nullptr) {
417 if (!text->IsTextType())
418 continue;
419
420 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
421 min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
422 max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
423
424 // Ignore "tall" text partitions, as these are usually false positive
425 // vertical text or multiple lines pulled together.
426 if (text->bounding_box().height() > max_text_height_)
427 continue;
428
429 int spacing = static_cast<int>(text->bounding_box().height() *
430 kVerticalSpacing / 2.0 + 0.5);
431 int bottom = text->bounding_box().bottom() - spacing;
432 int top = text->bounding_box().top() + spacing;
433 // For horizontal text, the factor can be negative. This should
434 // probably cause a warning or failure. I haven't actually checked if
435 // it happens.
436 if (bottom >= top)
437 continue;
438
439 bottom_sides.push_back(bottom);
440 top_sides.push_back(top);
441 }
442 // It causes disaster below, so avoid it!
443 if (bottom_sides.length() == 0 || top_sides.length() == 0)
444 return;
445
446 // Since data may be inserted in grid order, we sort the bottom/top sides.
447 bottom_sides.sort();
448 top_sides.sort();
449
450 // At this point, in the "merged list", we expect to have a bottom side,
451 // followed by either more bottom sides or a top side. The last number
452 // should be a top side. We find places where the splits occur by looking
453 // for "valleys". If we want to force gap sizes or allow overlap, change
454 // the spacing above. If you want to let lines "slice" partitions as long
455 // as it is infrequent, change the following function.
456 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
457 &cell_y_);
458
459 // Recover the min/max correctly since it was shifted.
460 cell_y_[0] = min_bottom;
461 cell_y_[cell_y_.length() - 1] = max_top;
462}
463
465 space_above_ = INT32_MAX;
466 space_below_ = INT32_MAX;
467 space_right_ = INT32_MAX;
468 space_left_ = INT32_MAX;
471}
472// Finds the nearest partition in grid to the table
473// boundaries and updates the margin.
475 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
476 space_below_ = std::min(space_below_, below);
477 int above = FindVerticalMargin(grid, bounding_box_.top(), false);
478 space_above_ = std::min(space_above_, above);
479 int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
480 space_left_ = std::min(space_left_, left);
481 int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
482 space_right_ = std::min(space_right_, right);
483}
485 bool decrease) const {
486 ColPartitionGridSearch gsearch(grid);
487 gsearch.SetUniqueMode(true);
489 border);
490 ColPartition* part = nullptr;
491 while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
492 if (!part->IsTextType() && !part->IsHorizontalLine())
493 continue;
494 int distance = decrease ? border - part->bounding_box().top()
495 : part->bounding_box().bottom() - border;
496 if (distance >= 0)
497 return distance;
498 }
499 return INT32_MAX;
500}
502 bool decrease) const {
503 ColPartitionGridSearch gsearch(grid);
504 gsearch.SetUniqueMode(true);
506 ColPartition* part = nullptr;
507 while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
508 if (!part->IsTextType() && !part->IsVerticalLine())
509 continue;
510 int distance = decrease ? border - part->bounding_box().right()
511 : part->bounding_box().left() - border;
512 if (distance >= 0)
513 return distance;
514 }
515 return INT32_MAX;
516}
517
519 const int kMaxCellHeight = 1000;
520 const int kMaxCellWidth = 1000;
521 STATS height_stats(0, kMaxCellHeight + 1);
522 STATS width_stats(0, kMaxCellWidth + 1);
523
524 for (int i = 0; i < row_count(); ++i)
525 height_stats.add(row_height(i), column_count());
526 for (int i = 0; i < column_count(); ++i)
527 width_stats.add(column_width(i), row_count());
528
529 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
530 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
531}
532
533// Looks for grid lines near the current bounding box and
534// grows the bounding box to include them if no intersections
535// will occur as a result. This is necessary because the margins
536// are calculated relative to the closest line/text. If the
537// line isn't absorbed, the margin will be the distance to the line.
540 gsearch.SetUniqueMode(true);
541
542 // Is the closest line above good? Loop multiple times for tables with
543 // multi-line (sometimes 2) borders. Limit the number of lines by
544 // making sure they stay within a table cell or so.
545 ColPartition* line = nullptr;
548 while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
549 if (!line->IsHorizontalLine())
550 break;
551 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
552 bounding_box_.right(), line->MidY());
553 if (text_search.height() > median_cell_height_ * 2)
554 break;
555 if (CountPartitions(text_search) > 0)
556 break;
557 bounding_box_.set_top(line->MidY());
558 }
559 // As above, is the closest line below good?
560 line = nullptr;
563 while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
564 if (!line->IsHorizontalLine())
565 break;
566 TBOX text_search(bounding_box_.left(), line->MidY(),
568 if (text_search.height() > median_cell_height_ * 2)
569 break;
570 if (CountPartitions(text_search) > 0)
571 break;
573 }
574 // TODO(nbeato): vertical lines
575}
576
577
578// This function will find all "0 valleys" (of any length) given two
579// arrays. The arrays are the mins and maxes of partitions (either
580// left and right or bottom and top). Since the min/max lists are generated
581// with pairs of increasing integers, we can make some assumptions in
582// the function about ordering of the overall list, which are shown in the
583// asserts.
584// The algorithm works as follows:
585// While there are numbers to process, take the smallest number.
586// If it is from the min_list, increment the "hill" counter.
587// Otherwise, decrement the "hill" counter.
588// In the process of doing this, keep track of "crossing" the
589// desired height.
590// The first/last items are extremal values of the list and known.
591// NOTE: This function assumes the lists are sorted!
593 const GenericVector<int>& max_list,
594 int max_merged,
595 GenericVector<int>* locations) {
596 locations->clear();
597 ASSERT_HOST(min_list.length() == max_list.length());
598 if (min_list.length() == 0)
599 return;
600 ASSERT_HOST(min_list.get(0) < max_list.get(0));
601 ASSERT_HOST(min_list.get(min_list.length() - 1) <
602 max_list.get(max_list.length() - 1));
603
604 locations->push_back(min_list.get(0));
605 int min_index = 0;
606 int max_index = 0;
607 int stacked_partitions = 0;
608 int last_cross_position = INT32_MAX;
609 // max_index will expire after min_index.
610 // However, we can't "increase" the hill size if min_index expired.
611 // So finish processing when min_index expires.
612 while (min_index < min_list.length()) {
613 // Increase the hill count.
614 if (min_list[min_index] < max_list[max_index]) {
615 ++stacked_partitions;
616 if (last_cross_position != INT32_MAX &&
617 stacked_partitions > max_merged) {
618 int mid = (last_cross_position + min_list[min_index]) / 2;
619 locations->push_back(mid);
620 last_cross_position = INT32_MAX;
621 }
622 ++min_index;
623 } else {
624 // Decrease the hill count.
625 --stacked_partitions;
626 if (last_cross_position == INT32_MAX &&
627 stacked_partitions <= max_merged) {
628 last_cross_position = max_list[max_index];
629 }
630 ++max_index;
631 }
632 }
633 locations->push_back(max_list.get(max_list.length() - 1));
634}
635
636// Counts the number of partitions in the table
637// box that intersection the given x value.
639 int count = 0;
640 // Make a small box to keep the search time down.
641 const int kGridSize = text_grid_->gridsize();
642 TBOX vertical_box = bounding_box_;
643 vertical_box.set_left(x - kGridSize);
644 vertical_box.set_right(x + kGridSize);
645
647 gsearch.SetUniqueMode(true);
648 gsearch.StartRectSearch(vertical_box);
649 ColPartition* text = nullptr;
650 while ((text = gsearch.NextRectSearch()) != nullptr) {
651 if (!text->IsTextType())
652 continue;
653 const TBOX& box = text->bounding_box();
654 if (box.left() < x && x < box.right())
655 ++count;
656 }
657 return count;
658}
659
660// Counts the number of partitions in the table
661// box that intersection the given y value.
663 int count = 0;
664 // Make a small box to keep the search time down.
665 const int kGridSize = text_grid_->gridsize();
666 TBOX horizontal_box = bounding_box_;
667 horizontal_box.set_bottom(y - kGridSize);
668 horizontal_box.set_top(y + kGridSize);
669
671 gsearch.SetUniqueMode(true);
672 gsearch.StartRectSearch(horizontal_box);
673 ColPartition* text = nullptr;
674 while ((text = gsearch.NextRectSearch()) != nullptr) {
675 if (!text->IsTextType())
676 continue;
677
678 const TBOX& box = text->bounding_box();
679 if (box.bottom() < y && y < box.top())
680 ++count;
681 }
682 return count;
683}
684
685// Counts how many text partitions are in this box.
686// This is used to count partitons in cells, as that can indicate
687// how "strong" a potential table row/column (or even full table) actually is.
690 gsearch.SetUniqueMode(true);
691 gsearch.StartRectSearch(box);
692 int count = 0;
693 ColPartition* text = nullptr;
694 while ((text = gsearch.NextRectSearch()) != nullptr) {
695 if (text->IsTextType())
696 ++count;
697 }
698 return count;
699}
700
704
706 : text_grid_(nullptr),
707 line_grid_(nullptr),
708 min_height_(0),
709 min_width_(0),
710 max_text_height_(INT32_MAX) {
711}
712
714}
715
717}
718
720 text_grid_ = text_grid;
721}
723 line_grid_ = line_grid;
724}
726 min_height_ = height;
727}
729 min_width_ = width;
730}
732 max_text_height_ = height;
733}
734
736 auto* table = new StructuredTable();
737 table->Init();
738 table->set_text_grid(text_grid_);
739 table->set_line_grid(line_grid_);
740 table->set_max_text_height(max_text_height_);
741
742 // Try to solve this simple case, a table with *both*
743 // vertical and horizontal lines.
744 if (RecognizeLinedTable(guess, table))
745 return table;
746
747 // Fallback to whitespace if that failed.
748 // TODO(nbeato): Break this apart to take advantage of horizontal
749 // lines or vertical lines when present.
750 if (RecognizeWhitespacedTable(guess, table))
751 return table;
752
753 // No table found...
754 delete table;
755 return nullptr;
756}
757
759 StructuredTable* table) {
760 if (!HasSignificantLines(guess_box))
761 return false;
762 TBOX line_bound = guess_box;
763 if (!FindLinesBoundingBox(&line_bound))
764 return false;
765 table->set_bounding_box(line_bound);
766 return table->FindLinedStructure();
767}
768
769// Quick implementation. Just count the number of lines in the box.
770// A better implementation would counter intersections and look for connected
771// components. It could even go as far as finding similar length lines.
772// To account for these possible issues, the VerifyLinedTableCells function
773// will reject lined tables that cause intersections with text on the page.
774// TODO(nbeato): look for "better" lines
777 box_search.SetUniqueMode(true);
778 box_search.StartRectSearch(guess);
779 ColPartition* line = nullptr;
780 int vertical_count = 0;
781 int horizontal_count = 0;
782
783 while ((line = box_search.NextRectSearch()) != nullptr) {
784 if (line->IsHorizontalLine())
785 ++horizontal_count;
786 if (line->IsVerticalLine())
787 ++vertical_count;
788 }
789
790 return vertical_count >= kLinedTableMinVerticalLines &&
791 horizontal_count >= kLinedTableMinHorizontalLines;
792}
793
794// Given a bounding box with a bunch of horizontal / vertical lines,
795// we just find the extents of all of these lines iteratively.
796// The box will be at least as large as guess. This
797// could possibly be a bad assumption.
798// It is guaranteed to halt in at least O(n * gridarea) where n
799// is the number of lines.
800// The assumption is that growing the box iteratively will add lines
801// several times, but eventually we'll find the extents.
802//
803// For tables, the approach is a bit aggressive, a single line (which could be
804// noise or a column ruling) can destroy the table inside.
805//
806// TODO(nbeato): This is a quick first implementation.
807// A better implementation would actually look for consistency
808// in extents of the lines and find the extents using lines
809// that clearly describe the table. This would allow the
810// lines to "vote" for height/width. An approach like
811// this would solve issues with page layout rulings.
812// I haven't looked for these issues yet, so I can't even
813// say they happen confidently.
815 // The first iteration will tell us if there are lines
816 // present and shrink the box to a minimal iterative size.
817 if (!FindLinesBoundingBoxIteration(bounding_box))
818 return false;
819
820 // Keep growing until the area of the table stabilizes.
821 // The box can only get bigger, increasing area.
822 bool changed = true;
823 while (changed) {
824 changed = false;
825 int old_area = bounding_box->area();
826 bool check = FindLinesBoundingBoxIteration(bounding_box);
827 // At this point, the function will return true.
828 ASSERT_HOST(check);
829 ASSERT_HOST(bounding_box->area() >= old_area);
830 changed = (bounding_box->area() > old_area);
831 }
832
833 return true;
834}
835
837 // Search for all of the lines in the current box, keeping track of extents.
839 box_search.SetUniqueMode(true);
840 box_search.StartRectSearch(*bounding_box);
841 ColPartition* line = nullptr;
842 bool first_line = true;
843
844 while ((line = box_search.NextRectSearch()) != nullptr) {
845 if (line->IsLineType()) {
846 if (first_line) {
847 // The first iteration can shrink the box.
848 *bounding_box = line->bounding_box();
849 first_line = false;
850 } else {
851 *bounding_box += line->bounding_box();
852 }
853 }
854 }
855 return !first_line;
856}
857
858// The goal of this function is to move the table boundaries around and find
859// a table that maximizes the whitespace around the table while maximizing
860// the cellular structure. As a result, it gets confused by headers, footers,
861// and merged columns (text that crosses columns). There is a tolerance
862// that allows a few partitions to count towards potential cell merges.
863// It's the max_merged parameter to FindPartitionLocations.
864// It can work, but it needs some false positive remove on boundaries.
865// For now, the grid structure must not intersect any partitions.
866// Also, small tolerance is added to the horizontal lines for tightly packed
867// tables. The tolerance is added by adjusting the bounding boxes of the
868// partitions (in FindHorizontalPartitions). The current implementation
869// only adjusts the vertical extents of the table.
870//
871// Also note. This was hacked at a lot. It could probably use some
872// more hacking at to find a good set of border conditions and then a
873// nice clean up.
875 StructuredTable* table) {
876 TBOX best_box = guess_box; // Best borders known.
877 int best_below = 0; // Margin size above best table.
878 int best_above = 0; // Margin size below best table.
879 TBOX adjusted = guess_box; // The search box.
880
881 // We assume that the guess box is somewhat accurate, so we don't allow
882 // the adjusted border to pass half of the guessed area. This prevents
883 // "negative" tables from forming.
884 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
885 // Keeps track of the most columns in an accepted table. The resulting table
886 // may be less than the max, but we don't want to stray too far.
887 int best_cols = 0;
888 // Make sure we find a good border.
889 bool found_good_border = false;
890
891 // Find the bottom of the table by trying a few different locations. For
892 // each location, the top, left, and right are fixed. We start the search
893 // in a smaller table to favor best_cols getting a good estimate sooner.
894 int last_bottom = INT32_MAX;
895 int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
896 kMidGuessY - min_height_ / 2, true);
897 int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
898 kMidGuessY + min_height_ / 2, false);
899 adjusted.set_top(top);
900
901 // Headers/footers can be spaced far from everything.
902 // Make sure that the space below is greater than the space above
903 // the lowest row.
904 int previous_below = 0;
905 const int kMaxChances = 10;
906 int chances = kMaxChances;
907 while (bottom != last_bottom) {
908 adjusted.set_bottom(bottom);
909
910 if (adjusted.height() >= min_height_) {
911 // Try to fit the grid on the current box. We give it a chance
912 // if the number of columns didn't significantly drop.
913 table->set_bounding_box(adjusted);
914 if (table->FindWhitespacedStructure() &&
915 table->column_count() >= best_cols * kRequiredColumns) {
916 if (false && IsWeakTableRow(table, 0)) {
917 // Currently buggy, but was looking promising so disabled.
918 --chances;
919 } else {
920 // We favor 2 things,
921 // 1- Adding rows that have partitioned data.
922 // 2- Better margins (to find header/footer).
923 // For better tables, we just look for multiple cells in the
924 // bottom row with data in them.
925 // For margins, the space below the last row should
926 // be better than a table with the last row removed.
927 chances = kMaxChances;
928 double max_row_height = kMaxRowSize * table->median_cell_height();
929 if ((table->space_below() * kMarginFactor >= best_below &&
930 table->space_below() >= previous_below) ||
931 (table->CountFilledCellsInRow(0) > 1 &&
932 table->row_height(0) < max_row_height)) {
933 best_box.set_bottom(bottom);
934 best_below = table->space_below();
935 best_cols = std::max(table->column_count(), best_cols);
936 found_good_border = true;
937 }
938 }
939 previous_below = table->space_below();
940 } else {
941 --chances;
942 }
943 }
944 if (chances <= 0)
945 break;
946
947 last_bottom = bottom;
948 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
949 last_bottom, true);
950 }
951 if (!found_good_border)
952 return false;
953
954 // TODO(nbeato) comments: follow modified code above... put it in a function!
955 found_good_border = false;
956 int last_top = INT32_MIN;
957 top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
958 kMidGuessY + min_height_ / 2, false);
959 int previous_above = 0;
960 chances = kMaxChances;
961
962 adjusted.set_bottom(best_box.bottom());
963 while (last_top != top) {
964 adjusted.set_top(top);
965 if (adjusted.height() >= min_height_) {
966 table->set_bounding_box(adjusted);
967 if (table->FindWhitespacedStructure() &&
968 table->column_count() >= best_cols * kRequiredColumns) {
969 int last_row = table->row_count() - 1;
970 if (false && IsWeakTableRow(table, last_row)) {
971 // Currently buggy, but was looking promising so disabled.
972 --chances;
973 } else {
974 chances = kMaxChances;
975 double max_row_height = kMaxRowSize * table->median_cell_height();
976 if ((table->space_above() * kMarginFactor >= best_above &&
977 table->space_above() >= previous_above) ||
978 (table->CountFilledCellsInRow(last_row) > 1 &&
979 table->row_height(last_row) < max_row_height)) {
980 best_box.set_top(top);
981 best_above = table->space_above();
982 best_cols = std::max(table->column_count(), best_cols);
983 found_good_border = true;
984 }
985 }
986 previous_above = table->space_above();
987 } else {
988 --chances;
989 }
990 }
991 if (chances <= 0)
992 break;
993
994 last_top = top;
995 top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
996 last_top, false);
997 }
998
999 if (!found_good_border)
1000 return false;
1001
1002 // If we get here, this shouldn't happen. It can be an assert, but
1003 // I haven't tested it enough to make it crash things.
1004 if (best_box.null_box())
1005 return false;
1006
1007 // Given the best locations, fit the box to those locations.
1008 table->set_bounding_box(best_box);
1009 return table->FindWhitespacedStructure();
1010}
1011
1012// Finds the closest value to y that can safely cause a horizontal
1013// split in the partitions.
1014// This function has been buggy and not as reliable as I would've
1015// liked. I suggest finding all of the splits using the
1016// FindPartitionLocations once and then just keeping the results
1017// of that function cached somewhere.
1018int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
1019 bool top_to_bottom) {
1021 gsearch.SetUniqueMode(true);
1022 gsearch.StartVerticalSearch(left, right, y);
1023 ColPartition* text = nullptr;
1024 int last_y = y;
1025 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1026 if (!text->IsTextType() || !text->IsHorizontalType())
1027 continue;
1028 if (text->bounding_box().height() > max_text_height_)
1029 continue;
1030
1031 const TBOX& text_box = text->bounding_box();
1032 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1033 last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1034 continue;
1035 }
1036 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1037 last_y = std::max(last_y, static_cast<int>(text_box.top()));
1038 continue;
1039 }
1040
1041 return last_y;
1042 }
1043 // If none is found, we at least want to preserve the min/max,
1044 // which defines the overlap of y with the last partition in the grid.
1045 return last_y;
1046}
1047
1048// Code is buggy right now. It is disabled in the calling function.
1049// It seems like sometimes the row that is passed in is not correct
1050// sometimes (like a phantom row is introduced). There's something going
1051// on in the cell_y_ data member before this is called... not certain.
1053 if (!table->VerifyRowFilled(row))
1054 return false;
1055
1056 double threshold = 0.0;
1058 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
1059 else
1060 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
1061
1062 return table->CountFilledCellsInRow(row) < threshold;
1063}
1064
1065} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
int count(LIST var_list)
Definition: oldlist.cpp:95
const double kVerticalSpacing
Definition: tablerecog.cpp:38
const double kGoodRowNumberOfColumnsLarge
Definition: tablerecog.cpp:60
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:45
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:41
const double kHorizontalSpacing
Definition: tablerecog.cpp:35
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:42
const double kRequiredColumns
Definition: tablerecog.cpp:48
const double kGoodRowNumberOfColumnsSmall[]
Definition: tablerecog.cpp:56
const double kMinFilledArea
Definition: tablerecog.cpp:63
const int kGoodRowNumberOfColumnsSmallSize
Definition: tablerecog.cpp:57
const double kMaxRowSize
Definition: tablerecog.cpp:53
const double kMarginFactor
Definition: tablerecog.cpp:50
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:44
int push_back(T object)
int length() const
Definition: genericvector.h:86
void compact_sorted()
T & get(int index) const
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
int16_t top() const
Definition: rect.h:58
void set_bottom(int y)
Definition: rect.h:68
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
void set_top(int y)
Definition: rect.h:61
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool null_box() const
Definition: rect.h:50
void set_left(int x)
Definition: rect.h:75
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:788
void SetUniqueMode(bool mode)
Definition: bbgrid.h:253
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:761
BBC * NextRectSearch()
Definition: bbgrid.h:842
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:746
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:802
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:830
int gridsize() const
Definition: bbgrid.h:63
bool IsHorizontalLine() const
Definition: colpartition.h:460
bool IsVerticalLine() const
Definition: colpartition.h:455
const TBOX & bounding_box() const
Definition: colpartition.h:110
bool IsHorizontalType() const
Definition: colpartition.h:446
bool VerifyRowFilled(int row)
Definition: tablerecog.cpp:255
const TBOX & bounding_box() const
Definition: tablerecog.cpp:109
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int row_height(int row) const
Definition: tablerecog.cpp:118
void set_max_text_height(int height)
Definition: tablerecog.cpp:91
bool DoesPartitionFit(const ColPartition &part) const
Definition: tablerecog.cpp:211
int column_width(int column) const
Definition: tablerecog.cpp:122
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:484
void set_bounding_box(const TBOX &box)
Definition: tablerecog.cpp:106
int CountFilledCellsInColumn(int column)
Definition: tablerecog.cpp:229
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:592
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:266
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:289
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:88
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:662
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int CountFilledCellsInRow(int row)
Definition: tablerecog.cpp:226
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:501
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:688
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:638
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:85
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:474
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:758
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:836
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:814
ColPartitionGrid * text_grid_
Definition: tablerecog.h:367
void set_min_width(int width)
Definition: tablerecog.cpp:728
static bool IsWeakTableRow(StructuredTable *table, int row)
void set_max_text_height(int height)
Definition: tablerecog.cpp:731
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:722
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:775
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:719
ColPartitionGrid * line_grid_
Definition: tablerecog.h:368
void set_min_height(int height)
Definition: tablerecog.cpp:725
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:874
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:735
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void UpdateWindow()
Definition: scrollview.cpp:704
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600
void Brush(Color color)
Definition: scrollview.cpp:725