tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORDbleft () const
 
const ICOORDtright () const
 
ScrollViewMakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 167 of file tablefind.cpp.

167 {
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175}
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:464
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:415
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:421
ColSegmentGrid table_grid_
Definition: tablefind.h:423
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:419
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:413

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1486 of file tablefind.cpp.

1486 {
1487 // Iterate the table regions in the grid
1488 ColSegment_CLIST adjusted_tables;
1489 ColSegment_C_IT it(&adjusted_tables);
1491 gsearch.StartFullSearch();
1492 ColSegment* table = nullptr;
1493 while ((table = gsearch.NextFullSearch()) != nullptr) {
1494 const TBOX& table_box = table->bounding_box();
1495 TBOX grown_box = table_box;
1496 GrowTableBox(table_box, &grown_box);
1497 // To prevent a table from expanding again, do not insert the
1498 // modified box back to the grid. Instead move it to a list and
1499 // and remove it from the grid. The list is moved later back to the grid.
1500 if (!grown_box.null_box()) {
1501 auto* col = new ColSegment();
1502 col->InsertBox(grown_box);
1503 it.add_after_then_move(col);
1504 }
1505 gsearch.RemoveBBox();
1506 delete table;
1507 }
1508 // clear table grid to move final tables in it
1509 // TODO(nbeato): table_grid_ should already be empty. The above loop
1510 // removed everything. Maybe just assert it is empty?
1512 it.move_to_first();
1513 // move back final tables to table_grid_
1514 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1515 ColSegment* seg = it.extract();
1516 table_grid_.InsertBBox(true, true, seg);
1517 }
1518}
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
Definition: rect.h:34
bool null_box() const
Definition: rect.h:50
void Clear()
Definition: bbgrid.h:455
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1520

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX blob) const
protected

Definition at line 502 of file tablefind.cpp.

502 {
503 const TBOX& box = blob.bounding_box();
504 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
505 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
506 const int median_area = global_median_xheight_ * global_median_blob_width_;
507 const double kAreaRequired = median_area * kAllowBlobArea;
508 // Keep comparisons strictly greater to disallow 0!
509 return box.height() > kHeightRequired &&
510 box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512}
const double kAllowBlobHeight
Definition: tablefind.cpp:55
const double kAllowBlobArea
Definition: tablefind.cpp:57
const double kAllowBlobWidth
Definition: tablefind.cpp:56
const TBOX & bounding_box() const
Definition: blobbox.h:230
int16_t width() const
Definition: rect.h:115
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition part) const
protected

Definition at line 489 of file tablefind.cpp.

489 {
490 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
491 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
492 const int median_area = global_median_xheight_ * global_median_blob_width_;
493 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
494 // Keep comparisons strictly greater to disallow 0!
495 return part.median_height() > kHeightRequired &&
496 part.median_width() > kWidthRequired &&
497 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
498}
const double kAllowTextArea
Definition: tablefind.cpp:50
const double kAllowTextWidth
Definition: tablefind.cpp:49
const double kAllowTextHeight
Definition: tablefind.cpp:48

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX box1,
const TBOX box2 
)
protected

Definition at line 1444 of file tablefind.cpp.

1444 {
1445 // Check the obvious case. Most likely not true because overlapping boxes
1446 // should already be merged, but seems like a good thing to do in case things
1447 // change.
1448 if (box1.overlap(box2))
1449 return true;
1450 // Check for ColPartitions spanning both table regions
1451 TBOX bbox = box1.bounding_union(box2);
1452 // Start a rect search on bbox
1453 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1454 rectsearch(&clean_part_grid_);
1455 rectsearch.StartRectSearch(bbox);
1456 ColPartition* part = nullptr;
1457 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1458 const TBOX& part_box = part->bounding_box();
1459 // return true if a colpartition spanning both table regions is found
1460 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1461 !part->IsImageType())
1462 return true;
1463 }
1464 return false;
1465}
bool overlap(const TBOX &box) const
Definition: rect.h:355
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:129

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 387 of file tablefind.cpp.

387 {
388 return clean_part_grid_.bleft();
389}
const ICOORD & bleft() const
Definition: bbgrid.h:72

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX b1,
const TBOX b2 
)
protected

Definition at line 568 of file tablefind.cpp.

568 {
569 int x_margin = 20;
570 int y_margin = 5;
571 return (abs(b1.left() - b2.left()) < x_margin) &&
572 (abs(b1.right() - b2.right()) < x_margin) &&
573 (abs(b1.top()-b2.bottom()) < y_margin ||
574 abs(b2.top()-b1.bottom()) < y_margin);
575}
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1703 of file tablefind.cpp.

1703 {
1704 int page_width = tright().x() - bleft().x();
1705 ASSERT_HOST(page_width > 0);
1706 // create an integer array to hold projection on x-axis
1707 int* table_xprojection = new int[page_width];
1708 // Iterate through all tables in the table grid
1709 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1710 table_search(&table_grid_);
1711 table_search.StartFullSearch();
1712 ColSegment* table;
1713 while ((table = table_search.NextFullSearch()) != nullptr) {
1714 TBOX table_box = table->bounding_box();
1715 // reset the projection array
1716 for (int i = 0; i < page_width; i++) {
1717 table_xprojection[i] = 0;
1718 }
1719 // Start a rect search on table_box
1720 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1721 rectsearch(&clean_part_grid_);
1722 rectsearch.SetUniqueMode(true);
1723 rectsearch.StartRectSearch(table_box);
1724 ColPartition* part;
1725 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1726 if (!part->IsTextType())
1727 continue; // Do not consider non-text partitions
1728 if (part->flow() == BTFT_LEADER)
1729 continue; // Assume leaders are in tables
1730 TBOX part_box = part->bounding_box();
1731 // Do not consider partitions partially covered by the table
1732 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1733 continue;
1734 BLOBNBOX_CLIST* part_boxes = part->boxes();
1735 BLOBNBOX_C_IT pit(part_boxes);
1736
1737 // Make sure overlapping blobs don't artificially inflate the number
1738 // of rows in the table. This happens frequently with things such as
1739 // decimals and split characters. Do this by assuming the column
1740 // partition is sorted mostly left to right and just clip
1741 // bounding boxes by the previous box's extent.
1742 int next_position_to_write = 0;
1743
1744 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1745 BLOBNBOX *pblob = pit.data();
1746 // ignore blob height for the purpose of projection since we
1747 // are only interested in finding valleys
1748 int xstart = pblob->bounding_box().left();
1749 int xend = pblob->bounding_box().right();
1750
1751 xstart = std::max(xstart, next_position_to_write);
1752 for (int i = xstart; i < xend; i++)
1753 table_xprojection[i - bleft().x()]++;
1754 next_position_to_write = xend;
1755 }
1756 }
1757 // Find largest valley between two reasonable peaks in the table
1758 if (!GapInXProjection(table_xprojection, page_width)) {
1759 table_search.RemoveBBox();
1760 delete table;
1761 }
1762 }
1763 delete[] table_xprojection;
1764}
@ BTFT_LEADER
Definition: blobbox.h:121
#define ASSERT_HOST(x)
Definition: errcode.h:88
const double kMinOverlapWithTable
Definition: tablefind.cpp:96
int16_t x() const
access function
Definition: points.h:52
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
const ICOORD & bleft() const
Definition: tablefind.cpp:387
const ICOORD & tright() const
Definition: tablefind.cpp:390
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1768

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1950 of file tablefind.cpp.

1953 {
1954#ifndef GRAPHICS_DISABLED
1955 // Iterate the ColPartitions in the grid.
1956 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1957 gsearch(grid);
1958 gsearch.StartFullSearch();
1959 ColPartition* part = nullptr;
1960 while ((part = gsearch.NextFullSearch()) != nullptr) {
1961 const TBOX& box = part->bounding_box();
1962 int left_x = box.left();
1963 int right_x = box.right();
1964 int top_y = box.top();
1965 int bottom_y = box.bottom();
1966
1967 ColPartition* upper_part = part->nearest_neighbor_above();
1968 if (upper_part) {
1969 const TBOX& upper_box = upper_part->bounding_box();
1970 int mid_x = (left_x + right_x) / 2;
1971 int mid_y = (top_y + bottom_y) / 2;
1972 int other_x = (upper_box.left() + upper_box.right()) / 2;
1973 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1974 win->Brush(ScrollView::NONE);
1975 win->Pen(color);
1976 win->Line(mid_x, mid_y, other_x, other_y);
1977 }
1978 ColPartition* lower_part = part->nearest_neighbor_below();
1979 if (lower_part) {
1980 const TBOX& lower_box = lower_part->bounding_box();
1981 int mid_x = (left_x + right_x) / 2;
1982 int mid_y = (top_y + bottom_y) / 2;
1983 int other_x = (lower_box.left() + lower_box.right()) / 2;
1984 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1985 win->Brush(ScrollView::NONE);
1986 win->Pen(color);
1987 win->Line(mid_x, mid_y, other_x, other_y);
1988 }
1989 }
1990 win->UpdateWindow();
1991#endif
1992}
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void UpdateWindow()
Definition: scrollview.cpp:704
void Pen(Color color)
Definition: scrollview.cpp:719
void Brush(Color color)
Definition: scrollview.cpp:725

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1944 of file tablefind.cpp.

1946 {
1947 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1948}
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1916

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1916 of file tablefind.cpp.

1919 {
1920#ifndef GRAPHICS_DISABLED
1921 ScrollView::Color color = default_color;
1922 // Iterate the ColPartitions in the grid.
1923 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1924 gsearch(grid);
1925 gsearch.StartFullSearch();
1926 ColPartition* part = nullptr;
1927 while ((part = gsearch.NextFullSearch()) != nullptr) {
1928 color = default_color;
1929 if (part->type() == PT_TABLE)
1930 color = table_color;
1931
1932 const TBOX& box = part->bounding_box();
1933 int left_x = box.left();
1934 int right_x = box.right();
1935 int top_y = box.top();
1936 int bottom_y = box.bottom();
1937 win->Brush(ScrollView::NONE);
1938 win->Pen(color);
1939 win->Rectangle(left_x, bottom_y, right_x, top_y);
1940 }
1941 win->UpdateWindow();
1942#endif
1943}
@ PT_TABLE
Definition: capi.h:135
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600

◆ DisplayColSegmentGrid()

void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView win,
ColSegmentGrid grid,
ScrollView::Color  color 
)
protected

Definition at line 1891 of file tablefind.cpp.

1892 {
1893#ifndef GRAPHICS_DISABLED
1894 // Iterate the ColPartitions in the grid.
1895 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1896 gsearch(grid);
1897 gsearch.StartFullSearch();
1898 ColSegment* seg = nullptr;
1899 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1900 const TBOX& box = seg->bounding_box();
1901 int left_x = box.left();
1902 int right_x = box.right();
1903 int top_y = box.top();
1904 int bottom_y = box.bottom();
1905 win->Brush(ScrollView::NONE);
1906 win->Pen(color);
1907 win->Rectangle(left_x, bottom_y, right_x, top_y);
1908 }
1909 win->UpdateWindow();
1910#endif
1911}

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1871 of file tablefind.cpp.

1873 {
1874#ifndef GRAPHICS_DISABLED
1875 win->Pen(color);
1876 win->Brush(ScrollView::NONE);
1877 ColSegment_IT it(segments);
1878 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1879 ColSegment* col = it.data();
1880 const TBOX& box = col->bounding_box();
1881 int left_x = box.left();
1882 int right_x = box.right();
1883 int top_y = box.top();
1884 int bottom_y = box.bottom();
1885 win->Rectangle(left_x, bottom_y, right_x, top_y);
1886 }
1887 win->UpdateWindow();
1888#endif
1889}

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 988 of file tablefind.cpp.

988 {
991 // TODO(nbeato): Fully justified text as non-table?
992}

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1074 of file tablefind.cpp.

1074 {
1075 // Consider top-most text colpartition as header and bottom most as footer
1076 ColPartition* header = nullptr;
1077 ColPartition* footer = nullptr;
1078 int max_top = INT32_MIN;
1079 int min_bottom = INT32_MAX;
1081 gsearch.StartFullSearch();
1082 ColPartition* part = nullptr;
1083 while ((part = gsearch.NextFullSearch()) != nullptr) {
1084 if (!part->IsTextType())
1085 continue; // Consider only text partitions
1086 int top = part->bounding_box().top();
1087 int bottom = part->bounding_box().bottom();
1088 if (top > max_top) {
1089 max_top = top;
1090 header = part;
1091 }
1092 if (bottom < min_bottom) {
1093 min_bottom = bottom;
1094 footer = part;
1095 }
1096 }
1097 if (header)
1098 header->clear_table_type();
1099 if (footer)
1100 footer->clear_table_type();
1101}
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 994 of file tablefind.cpp.

994 {
995 // Detect last line of paragraph
996 // Iterate the ColPartitions in the grid.
998 gsearch.StartFullSearch();
999 ColPartition* part = nullptr;
1000 while ((part = gsearch.NextFullSearch()) != nullptr) {
1001 if (part->type() != PT_TABLE)
1002 continue; // Consider only table partitions
1003
1004 // Paragraph ending should have flowing text above it.
1005 ColPartition* upper_part = part->nearest_neighbor_above();
1006 if (!upper_part)
1007 continue;
1008 if (upper_part->type() != PT_FLOWING_TEXT)
1009 continue;
1010 if (upper_part->bounding_box().width() <
1011 2 * part->bounding_box().width())
1012 continue;
1013 // Check if its the last line of a paragraph.
1014 // In most cases, a paragraph ending should be left-aligned to text line
1015 // above it. Sometimes, it could be a 2 line paragraph, in which case
1016 // the line above it is indented.
1017 // To account for that, check if the partition center is to
1018 // the left of the one above it.
1019 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1020 int upper_mid = (upper_part->bounding_box().left() +
1021 upper_part->bounding_box().right()) / 2;
1022 int current_spacing = 0; // spacing of the current line to margin
1023 int upper_spacing = 0; // spacing of the previous line to the margin
1025 // Left to right languages, use mid - left to figure out the distance
1026 // the middle is from the left margin.
1027 int left = std::min(part->bounding_box().left(),
1028 upper_part->bounding_box().left());
1029 current_spacing = mid - left;
1030 upper_spacing = upper_mid - left;
1031 } else {
1032 // Right to left languages, use right - mid to figure out the distance
1033 // the middle is from the right margin.
1034 int right = std::max(part->bounding_box().right(),
1035 upper_part->bounding_box().right());
1036 current_spacing = right - mid;
1037 upper_spacing = right - upper_mid;
1038 }
1039 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1040 continue;
1041
1042 // Paragraphs should have similar fonts.
1043 if (!part->MatchingSizes(*upper_part) ||
1044 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1046 continue;
1047 }
1048
1049 // The last line of a paragraph should be left aligned.
1050 // TODO(nbeato): This would be untrue if the text was right aligned.
1051 // How often is that?
1052 if (part->space_to_left() >
1053 kMaxParagraphEndingLeftSpaceMultiple * part->median_height())
1054 continue;
1055 // The line above it should be right aligned (assuming justified format).
1056 // Since we can't assume justified text, we compare whitespace to text.
1057 // The above line should have majority spanning text (or the current
1058 // line could have fit on the previous line). So compare
1059 // whitespace to text.
1060 if (upper_part->bounding_box().width() <
1061 kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1062 continue;
1063
1064 // Ledding above the line should be less than ledding below
1065 if (part->space_above() >= part->space_below() ||
1066 part->space_above() > 2 * global_median_ledding_)
1067 continue;
1068
1069 // If all checks failed, it is probably text.
1070 part->clear_table_type();
1071 }
1072}
@ PT_FLOWING_TEXT
Definition: capi.h:130
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:125
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:139
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:131
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:140
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:121

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 766 of file tablefind.cpp.

766 {
768 gsearch.StartFullSearch();
769 ColPartition* part = nullptr;
770 while ((part = gsearch.NextFullSearch()) != nullptr) {
771 // TODO(nbeato): Rename this function, meaning is different now.
772 // IT is finding nearest neighbors its own way
773 //SetVerticalSpacing(part);
774
775 ColPartition* upper = part->SingletonPartner(true);
776 if (upper)
777 part->set_nearest_neighbor_above(upper);
778
779 ColPartition* lower = part->SingletonPartner(false);
780 if (lower)
781 part->set_nearest_neighbor_below(lower);
782 }
783}

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1768 of file tablefind.cpp.

1768 {
1769 // Find peak value of the histogram
1770 int peak_value = 0;
1771 for (int i = 0; i < length; i++) {
1772 if (xprojection[i] > peak_value) {
1773 peak_value = xprojection[i];
1774 }
1775 }
1776 // Peak value represents the maximum number of horizontally
1777 // overlapping colpartitions, so this can be considered as the
1778 // number of rows in the table
1779 if (peak_value < kMinRowsInTable)
1780 return false;
1781 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1782 if (peak_value >= kLargeTableRowCount)
1783 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1784 // Threshold the histogram
1785 for (int i = 0; i < length; i++) {
1786 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1787 }
1788 // Find the largest run of zeros between two ones
1789 int largest_gap = 0;
1790 int run_start = -1;
1791 for (int i = 1; i < length; i++) {
1792 // detect start of a run of zeros
1793 if (xprojection[i - 1] && !xprojection[i]) {
1794 run_start = i;
1795 }
1796 // detect end of a run of zeros and update the value of largest gap
1797 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1798 int gap = i - run_start;
1799 if (gap > largest_gap)
1800 largest_gap = gap;
1801 run_start = -1;
1802 }
1803 }
1804 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1805}
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:106
const int kMinRowsInTable
Definition: tablefind.cpp:111
const int kLargeTableRowCount
Definition: tablefind.cpp:108
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:105
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:135

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 523 of file tablefind.cpp.

524 {
525 for (int i = 0; i < gridheight(); ++i) {
526 ColPartitionSet* columns = all_columns[i];
527 if (columns != nullptr) {
528 ColSegment_LIST new_blocks;
529 // Get boxes from the current vertical position on the grid
530 columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
531 // Merge the new_blocks boxes into column_blocks if they are well-aligned
532 GroupColumnBlocks(&new_blocks, column_blocks);
533 }
534 }
535}
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:538
int gridheight() const
Definition: tablefind.cpp:384

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1273 of file tablefind.cpp.

1273 {
1274 ColSegment_IT it(table_columns);
1275 // Iterate the ColPartitions in the grid.
1276 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1277 gsearch(&clean_part_grid_);
1278 gsearch.StartFullSearch();
1279 ColPartition* part;
1280 while ((part = gsearch.NextFullSearch()) != nullptr) {
1281 if (part->inside_table_column() || part->type() != PT_TABLE)
1282 continue; // prevent a partition to be assigned to multiple columns
1283 const TBOX& box = part->bounding_box();
1284 auto* col = new ColSegment();
1285 col->InsertBox(box);
1286 part->set_inside_table_column(true);
1287 // Start a search below the current cell to find bottom neighbours
1288 // Note: a full search will always process things above it first, so
1289 // this should be starting at the highest cell and working its way down.
1290 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1291 vsearch(&clean_part_grid_);
1292 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1293 ColPartition* neighbor = nullptr;
1294 bool found_neighbours = false;
1295 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1296 // only consider neighbors not assigned to any column yet
1297 if (neighbor->inside_table_column())
1298 continue;
1299 // Horizontal lines should not break the flow
1300 if (neighbor->IsHorizontalLine())
1301 continue;
1302 // presence of a non-table neighbor marks the end of current
1303 // table column
1304 if (neighbor->type() != PT_TABLE)
1305 break;
1306 // add the neighbor partition to the table column
1307 const TBOX& neighbor_box = neighbor->bounding_box();
1308 col->InsertBox(neighbor_box);
1309 neighbor->set_inside_table_column(true);
1310 found_neighbours = true;
1311 }
1312 if (found_neighbours) {
1313 it.add_after_then_move(col);
1314 } else {
1315 part->set_inside_table_column(false);
1316 delete col;
1317 }
1318 }
1319}

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1323 of file tablefind.cpp.

1324 {
1325 ColSegment_IT cit(table_columns);
1326 ColSegment_IT rit(table_regions);
1327 // Iterate through column blocks
1328 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1329 gsearch(&col_seg_grid_);
1330 gsearch.StartFullSearch();
1331 ColSegment* part;
1332 int page_height = tright().y() - bleft().y();
1333 ASSERT_HOST(page_height > 0);
1334 // create a bool array to hold projection on y-axis
1335 bool* table_region = new bool[page_height];
1336 while ((part = gsearch.NextFullSearch()) != nullptr) {
1337 const TBOX& part_box = part->bounding_box();
1338 // reset the projection array
1339 for (int i = 0; i < page_height; i++) {
1340 table_region[i] = false;
1341 }
1342 // iterate through all table columns to find regions in the current
1343 // page column block
1344 cit.move_to_first();
1345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346 TBOX col_box = cit.data()->bounding_box();
1347 // find intersection region of table column and page column
1348 TBOX intersection_box = col_box.intersection(part_box);
1349 // project table column on the y-axis
1350 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1351 table_region[i - bleft().y()] = true;
1352 }
1353 }
1354 // set x-limits of table regions to page column width
1355 TBOX current_table_box;
1356 current_table_box.set_left(part_box.left());
1357 current_table_box.set_right(part_box.right());
1358 // go through the y-axis projection to find runs of table
1359 // regions. Each run makes one table region.
1360 for (int i = 1; i < page_height; i++) {
1361 // detect start of a table region
1362 if (!table_region[i - 1] && table_region[i]) {
1363 current_table_box.set_bottom(i + bleft().y());
1364 }
1365 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1366 // detect end of a table region
1367 if (table_region[i - 1] && !table_region[i]) {
1368 current_table_box.set_top(i + bleft().y());
1369 if (!current_table_box.null_box()) {
1370 auto* seg = new ColSegment();
1371 seg->InsertBox(current_table_box);
1372 rit.add_after_then_move(seg);
1373 }
1374 }
1375 }
1376 }
1377 delete[] table_region;
1378}
int16_t y() const
access_function
Definition: points.h:56
void set_right(int x)
Definition: rect.h:82
void set_bottom(int y)
Definition: rect.h:68
void set_top(int y)
Definition: rect.h:61
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void set_left(int x)
Definition: rect.h:75

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 384 of file tablefind.cpp.

384 {
386}
int gridheight() const
Definition: bbgrid.h:69

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1195 of file tablefind.cpp.

1195 {
1196 int margin = gridsize();
1197
1198 // Iterate the Column Blocks in the grid.
1199 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1200 gsearch(&col_seg_grid_);
1201 gsearch.StartFullSearch();
1202 ColSegment* seg;
1203 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1204 if (seg->type() != COL_TEXT)
1205 continue; // only consider text blocks for split detection
1206 bool neighbor_found = false;
1207 bool modified = false; // Modified at least once
1208 // keep expanding current box as long as neighboring table columns
1209 // are found above or below it.
1210 do {
1211 TBOX box = seg->bounding_box();
1212 // slightly expand the search region vertically
1213 int top_range = std::min(box.top() + margin, static_cast<int>(tright().y()));
1214 int bottom_range = std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1215 box.set_top(top_range);
1216 box.set_bottom(bottom_range);
1217 neighbor_found = false;
1218 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1219 rectsearch(&col_seg_grid_);
1220 rectsearch.StartRectSearch(box);
1221 ColSegment* neighbor = nullptr;
1222 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1223 if (neighbor == seg)
1224 continue;
1225 const TBOX& neighbor_box = neighbor->bounding_box();
1226 // If the neighbor box significantly overlaps with the current
1227 // box (due to the expansion of the current box in the
1228 // previous iteration of this loop), remove the neighbor box
1229 // and expand the current box to include it.
1230 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1231 seg->InsertBox(neighbor_box);
1232 modified = true;
1233 rectsearch.RemoveBBox();
1234 gsearch.RepositionIterator();
1235 delete neighbor;
1236 continue;
1237 }
1238 // Only expand if the neighbor box is of table type
1239 if (neighbor->type() != COL_TABLE)
1240 continue;
1241 // Insert the neighbor box into the current column block
1242 if (neighbor_box.major_x_overlap(box) &&
1243 !box.contains(neighbor_box)) {
1244 seg->InsertBox(neighbor_box);
1245 neighbor_found = true;
1246 modified = true;
1247 rectsearch.RemoveBBox();
1248 gsearch.RepositionIterator();
1249 delete neighbor;
1250 }
1251 }
1252 } while (neighbor_found);
1253 if (modified) {
1254 // Because the box has changed, it has to be removed first.
1255 gsearch.RemoveBBox();
1256 col_seg_grid_.InsertBBox(true, true, seg);
1257 gsearch.RepositionIterator();
1258 }
1259 }
1260}
@ COL_TEXT
Definition: tablefind.h:32
@ COL_TABLE
Definition: tablefind.h:33
bool contains(const FCOORD pt) const
Definition: rect.h:333
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1386 of file tablefind.cpp.

1386 {
1387 // Iterate the table regions in the grid.
1388 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1389 gsearch(&table_grid_);
1390 gsearch.StartFullSearch();
1391 ColSegment* seg = nullptr;
1392 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1393 bool neighbor_found = false;
1394 bool modified = false; // Modified at least once
1395 do {
1396 // Start a rectangle search x-bounded by the image and y by the table
1397 const TBOX& box = seg->bounding_box();
1398 TBOX search_region(box);
1399 search_region.set_left(bleft().x());
1400 search_region.set_right(tright().x());
1401 neighbor_found = false;
1402 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1403 rectsearch(&table_grid_);
1404 rectsearch.StartRectSearch(search_region);
1405 ColSegment* neighbor = nullptr;
1406 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1407 if (neighbor == seg)
1408 continue;
1409 const TBOX& neighbor_box = neighbor->bounding_box();
1410 // Check if a neighbor box has a large overlap with the table
1411 // region. This may happen as a result of merging two table
1412 // regions in the previous iteration.
1413 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1414 seg->InsertBox(neighbor_box);
1415 rectsearch.RemoveBBox();
1416 gsearch.RepositionIterator();
1417 delete neighbor;
1418 modified = true;
1419 continue;
1420 }
1421 // Check if two table regions belong together based on a common
1422 // horizontal ruling line
1423 if (BelongToOneTable(box, neighbor_box)) {
1424 seg->InsertBox(neighbor_box);
1425 neighbor_found = true;
1426 modified = true;
1427 rectsearch.RemoveBBox();
1428 gsearch.RepositionIterator();
1429 delete neighbor;
1430 }
1431 }
1432 } while (neighbor_found);
1433 if (modified) {
1434 // Because the box has changed, it has to be removed first.
1435 gsearch.RemoveBBox();
1436 table_grid_.InsertBBox(true, true, seg);
1437 gsearch.RepositionIterator();
1438 }
1439 }
1440}
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1444

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 378 of file tablefind.cpp.

378 {
379 return clean_part_grid_.gridsize();
380}
int gridsize() const
Definition: bbgrid.h:63

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 381 of file tablefind.cpp.

381 {
383}
int gridwidth() const
Definition: bbgrid.h:66

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 538 of file tablefind.cpp.

539 {
540 ColSegment_IT src_it(new_blocks);
541 ColSegment_IT dest_it(column_blocks);
542 // iterate through the source list
543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
544 ColSegment* src_seg = src_it.data();
545 const TBOX& src_box = src_seg->bounding_box();
546 bool match_found = false;
547 // iterate through the destination list to find a matching column block
548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
549 ColSegment* dest_seg = dest_it.data();
550 TBOX dest_box = dest_seg->bounding_box();
551 if (ConsecutiveBoxes(src_box, dest_box)) {
552 // If matching block is found, insert the current block into it
553 // and delete the source block.
554 dest_seg->InsertBox(src_box);
555 match_found = true;
556 delete src_it.extract();
557 break;
558 }
559 }
560 // If no match is found, just append the source block to column_blocks
561 if (!match_found) {
562 dest_it.add_after_then_move(src_it.extract());
563 }
564 }
565}
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:568

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX table_box,
TBOX result_box 
)
protected

Definition at line 1520 of file tablefind.cpp.

1520 {
1521 // TODO(nbeato): The growing code is a bit excessive right now.
1522 // By removing these lines, the partitions considered need
1523 // to have some overlap or be special cases. These lines could
1524 // be added again once a check is put in place to make sure that
1525 // growing tables don't stomp on a lot of non-table partitions.
1526
1527 // search for horizontal ruling lines within the vertical margin
1528 // int vertical_margin = kRulingVerticalMargin * gridsize();
1529 TBOX search_box = table_box;
1530 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1531 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1532 // search_box.set_top(top);
1533 // search_box.set_bottom(bottom);
1534
1535 GrowTableToIncludePartials(table_box, search_box, result_box);
1536 GrowTableToIncludeLines(table_box, search_box, result_box);
1537 IncludeLeftOutColumnHeaders(result_box);
1538}
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1664
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1570
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1542

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1570 of file tablefind.cpp.

1572 {
1574 rsearch.SetUniqueMode(true);
1575 rsearch.StartRectSearch(search_range);
1576 ColPartition* part = nullptr;
1577 while ((part = rsearch.NextRectSearch()) != nullptr) {
1578 // TODO(nbeato) This should also do vertical, but column
1579 // boundaries are breaking things. This function needs to be
1580 // updated to allow vertical lines as well.
1581 if (!part->IsLineType())
1582 continue;
1583 // Avoid the following function call if the result of the
1584 // function is irrelevant.
1585 const TBOX& part_box = part->bounding_box();
1586 if (result_box->contains(part_box))
1587 continue;
1588 // Include a partially overlapping horizontal line only if the
1589 // extra ColPartitions that will be included due to expansion
1590 // have large side spacing w.r.t. columns containing them.
1591 if (HLineBelongsToTable(*part, table_box))
1592 *result_box = result_box->bounding_union(part_box);
1593 // TODO(nbeato): Vertical
1594 }
1595}
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1600

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1542 of file tablefind.cpp.

1544 {
1545 // Rulings are in a different grid, so search 2 grids for rulings, text,
1546 // and table partitions that are not entirely within the new box.
1547 for (int i = 0; i < 2; ++i) {
1548 ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1550 ColPartitionGridSearch rectsearch(grid);
1551 rectsearch.StartRectSearch(search_range);
1552 ColPartition* part = nullptr;
1553 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1554 // Only include text and table types.
1555 if (part->IsImageType())
1556 continue;
1557 const TBOX& part_box = part->bounding_box();
1558 // Include partition in the table if more than half of it
1559 // is covered by the table
1560 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1561 *result_box = result_box->bounding_union(part_box);
1562 continue;
1563 }
1564 }
1565 }
1566}

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition part)
protected

Definition at line 946 of file tablefind.cpp.

946 {
947 if (part.flow() == BTFT_LEADER)
948 return true;
949 // Search range is left and right bounded by an offset of the
950 // median xheight. This offset is to allow some tolerance to the
951 // the leaders on the page in the event that the alignment is still
952 // a bit off.
953 const TBOX& box = part.bounding_box();
955 const int top = box.top() + search_size;
956 const int bottom = box.bottom() - search_size;
958 for (int direction = 0; direction < 2; ++direction) {
959 bool right_to_left = (direction == 0);
960 int x = right_to_left ? box.right() : box.left();
961 hsearch.StartSideSearch(x, bottom, top);
962 ColPartition* leader = nullptr;
963 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
964 // The leader could be a horizontal ruling in the grid.
965 // Make sure it is actually a leader.
966 if (leader->flow() != BTFT_LEADER)
967 continue;
968 // This should not happen, they are in different grids.
969 ASSERT_HOST(&part != leader);
970 // Make sure the leader shares a page column with the partition,
971 // otherwise we are spreading across columns.
972 if (!part.IsInSameColumnAs(*leader))
973 break;
974 // There should be a significant vertical overlap
975 if (!leader->VSignificantCoreOverlap(part))
976 continue;
977 // Leader passed all tests, so it is adjacent.
978 return true;
979 }
980 }
981 // No leaders are adjacent to the given partition.
982 return false;
983}
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:116

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition part) const
protected

Definition at line 857 of file tablefind.cpp.

857 {
858 // Should only get text partitions.
859 ASSERT_HOST(part->IsTextType());
860 // Blob access
861 BLOBNBOX_CLIST* part_boxes = part->boxes();
862 BLOBNBOX_C_IT it(part_boxes);
863 // Check if this is a relatively small partition (such as a single word)
864 if (part->bounding_box().width() <
865 kMinBoxesInTextPartition * part->median_height() &&
866 part_boxes->length() < kMinBoxesInTextPartition)
867 return true;
868
869 // Variables used to compute inter-blob spacing.
870 int current_x0 = -1;
871 int current_x1 = -1;
872 int previous_x1 = -1;
873 // Stores the maximum gap detected.
874 int largest_partition_gap_found = -1;
875 // Text partition gap limits. If this is text (and not a table),
876 // there should be at least one gap larger than min_gap and no gap
877 // larger than max_gap.
878 const double max_gap = kMaxGapInTextPartition * part->median_height();
879 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
880
881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
882 BLOBNBOX* blob = it.data();
883 current_x0 = blob->bounding_box().left();
884 current_x1 = blob->bounding_box().right();
885 if (previous_x1 != -1) {
886 int gap = current_x0 - previous_x1;
887
888 // TODO(nbeato): Boxes may overlap? Huh?
889 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
890 // on the top right of the page are filtered out with this line.
891 // Note 2: Iterating over blobs in a partition, so we are looking for
892 // spacing between the words.
893 if (gap < 0) {
894 // More likely case, the blobs slightly overlap. This can happen
895 // with diacritics (accents) or broken alphabet symbols (characters).
896 // Merge boxes together by taking max of right sides.
897 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
898 previous_x1 = std::max(previous_x1, current_x1);
899 continue;
900 }
901 // Extreme case, blobs overlap significantly in the same partition...
902 // This should not happen often (if at all), but it does.
903 // TODO(nbeato): investigate cases when this happens.
904 else {
905 // The behavior before was to completely ignore this case.
906 }
907 }
908
909 // If a large enough gap is found, mark it as a table cell (return true)
910 if (gap > max_gap)
911 return true;
912 if (gap > largest_partition_gap_found)
913 largest_partition_gap_found = gap;
914 }
915 previous_x1 = current_x1;
916 }
917 // Since no large gap was found, return false if the partition is too
918 // long to be a data cell
919 if (part->bounding_box().width() >
920 kMaxBoxesInDataPartition * part->median_height() ||
921 part_boxes->length() > kMaxBoxesInDataPartition)
922 return false;
923
924 // A partition may be a single blob. In this case, it's an isolated symbol
925 // or non-text (such as a ruling or image).
926 // Detect these as table partitions? Shouldn't this be case by case?
927 // The behavior before was to ignore this, making max_partition_gap < 0
928 // and implicitly return true. Just making it explicit.
929 if (largest_partition_gap_found == -1)
930 return true;
931
932 // return true if the maximum gap found is smaller than the minimum allowed
933 // max_gap in a text partition. This indicates that there is no significant
934 // space in the partition, hence it is likely a single word.
935 return largest_partition_gap_found < min_gap;
936}
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:72
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:62
const double kMaxGapInTextPartition
Definition: tablefind.cpp:68
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:65
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:76

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition part,
const TBOX table_box 
)
protected

Definition at line 1600 of file tablefind.cpp.

1601 {
1602 if (!part.IsHorizontalLine())
1603 return false;
1604 const TBOX& part_box = part.bounding_box();
1605 if (!part_box.major_x_overlap(table_box))
1606 return false;
1607 // Do not consider top-most horizontal line since it usually
1608 // originates from noise.
1609 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1610 // have neighbors solved.
1611 // if (!part.nearest_neighbor_above())
1612 // return false;
1613 const TBOX bbox = part_box.bounding_union(table_box);
1614 // In the "unioned table" box (the table extents expanded by the line),
1615 // keep track of how many partitions have significant padding to the left
1616 // and right. If more than half of the partitions covered by the new table
1617 // have significant spacing, the line belongs to the table and the table
1618 // grows to include all of the partitions.
1619 int num_extra_partitions = 0;
1620 int extra_space_to_right = 0;
1621 int extra_space_to_left = 0;
1622 // Rulings are in a different grid, so search 2 grids for rulings, text,
1623 // and table partitions that are introduced by the new box.
1624 for (int i = 0; i < 2; ++i) {
1625 ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1627 // Start a rect search on bbox
1628 ColPartitionGridSearch rectsearch(grid);
1629 rectsearch.SetUniqueMode(true);
1630 rectsearch.StartRectSearch(bbox);
1631 ColPartition* extra_part = nullptr;
1632 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1633 // ColPartition already in table
1634 const TBOX& extra_part_box = extra_part->bounding_box();
1635 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1636 continue;
1637 // Non-text ColPartitions do not contribute
1638 if (extra_part->IsImageType())
1639 continue;
1640 // Consider this partition.
1641 num_extra_partitions++;
1642 // presence of a table cell is a strong hint, so just increment the scores
1643 // without looking at the spacing.
1644 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1645 extra_space_to_right++;
1646 extra_space_to_left++;
1647 continue;
1648 }
1649 int space_threshold = kSideSpaceMargin * part.median_height();
1650 if (extra_part->space_to_right() > space_threshold)
1651 extra_space_to_right++;
1652 if (extra_part->space_to_left() > space_threshold)
1653 extra_space_to_left++;
1654 }
1655 }
1656 // tprintf("%d %d %d\n",
1657 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1658 return (extra_space_to_right > num_extra_partitions / 2) ||
1659 (extra_space_to_left > num_extra_partitions / 2);
1660}
const int kSideSpaceMargin
Definition: tablefind.cpp:101

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX table_box)
protected

Definition at line 1664 of file tablefind.cpp.

1664 {
1665 // Start a search above the current table to look for column headers
1667 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1668 table_box->top());
1669 ColPartition* neighbor = nullptr;
1670 ColPartition* previous_neighbor = nullptr;
1671 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1672 // Max distance to find a table heading.
1673 const int max_distance = kMaxColumnHeaderDistance *
1674 neighbor->median_height();
1675 int table_top = table_box->top();
1676 const TBOX& box = neighbor->bounding_box();
1677 // Do not continue if the next box is way above
1678 if (box.bottom() - table_top > max_distance)
1679 break;
1680 // Unconditionally include partitions of type TABLE or LINE
1681 // TODO(faisal): add some reasonable conditions here
1682 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1683 table_box->set_top(box.top());
1684 previous_neighbor = nullptr;
1685 continue;
1686 }
1687 // If there are two text partitions, one above the other, without a table
1688 // cell on their left or right side, consider them a barrier and quit
1689 if (previous_neighbor == nullptr) {
1690 previous_neighbor = neighbor;
1691 } else {
1692 const TBOX& previous_box = previous_neighbor->bounding_box();
1693 if (!box.major_y_overlap(previous_box))
1694 break;
1695 }
1696 }
1697}
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:84
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD bottom_left,
const ICOORD top_right 
)

Definition at line 181 of file tablefind.cpp.

182 {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189}
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:445

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 579 of file tablefind.cpp.

579 {
583}
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:709
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:586

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid grid,
TO_BLOCK block 
)

Definition at line 193 of file tablefind.cpp.

194 {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition* part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
207 continue;
208 ColPartition* clean_part = part->ShallowCopy();
209 ColPartition* leader_part = nullptr;
210 if (part->IsLineType()) {
211 InsertRulingPartition(clean_part);
212 continue;
213 }
214 // Insert all non-text partitions to clean_parts
215 if (!part->IsTextType()) {
216 InsertImagePartition(clean_part);
217 continue;
218 }
219 // Insert text colpartitions after removing noisy components from them
220 // The leaders are split into a separate grid.
221 BLOBNBOX_CLIST* part_boxes = part->boxes();
222 BLOBNBOX_C_IT pit(part_boxes);
223 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
224 BLOBNBOX *pblob = pit.data();
225 // Bad blobs... happens in UNLV set.
226 // news.3G1, page 17 (around x=6)
227 if (!AllowBlob(*pblob))
228 continue;
229 if (pblob->flow() == BTFT_LEADER) {
230 if (leader_part == nullptr) {
231 leader_part = part->ShallowCopy();
232 leader_part->set_flow(BTFT_LEADER);
233 }
234 leader_part->AddBox(pblob);
235 } else if (pblob->region_type() != BRT_NOISE) {
236 clean_part->AddBox(pblob);
237 }
238 }
239 clean_part->ComputeLimits();
240 ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
241 InsertTextPartition(clean_part);
243 if (leader_part != nullptr) {
244 // TODO(nbeato): Note that ComputeLimits does not update the column
245 // information. So the leader may appear to span more columns than it
246 // really does later on when IsInSameColumnAs gets called to test
247 // for adjacent leaders.
248 leader_part->ComputeLimits();
249 InsertLeaderPartition(leader_part);
250 }
251 }
252
253 // Make the partition partners better for upper and lower neighbors.
256}
@ BRT_NOISE
Definition: blobbox.h:73
BlobRegionType region_type() const
Definition: blobbox.h:283
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:298
BlobTextFlowType flow() const
Definition: blobbox.h:295
void RefinePartitionPartners(bool get_desperate)
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:436
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:410
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:418
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:502
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:394
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:421

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 402 of file tablefind.cpp.

402 {
403 ASSERT_HOST(part != nullptr);
404 if (AllowTextPartition(*part)) {
405 fragmented_text_grid_.InsertBBox(true, true, part);
406 } else {
407 delete part;
408 }
409}
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:489

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition part)
protected

Definition at line 421 of file tablefind.cpp.

421 {
422 // NOTE: If images are placed into a different grid in the future,
423 // the function SetPartitionSpacings needs to be updated. It should
424 // be the only thing that cares about image partitions.
425 clean_part_grid_.InsertBBox(true, true, part);
426}

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition part)
protected

Definition at line 410 of file tablefind.cpp.

410 {
411 ASSERT_HOST(part != nullptr);
412 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
413 leader_and_ruling_grid_.InsertBBox(true, true, part);
414 } else {
415 delete part;
416 }
417}

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition part)
protected

Definition at line 418 of file tablefind.cpp.

418 {
419 leader_and_ruling_grid_.InsertBBox(true, true, part);
420}

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition part)
protected

Definition at line 394 of file tablefind.cpp.

394 {
395 ASSERT_HOST(part != nullptr);
396 if (AllowTextPartition(*part)) {
397 clean_part_grid_.InsertBBox(true, true, part);
398 } else {
399 delete part;
400 }
401}

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb,
const FCOORD reskew 
)

Definition at line 259 of file tablefind.cpp.

262 {
263 // initialize spacing, neighbors, and columns
264 InitializePartitions(all_columns);
265
266#ifndef GRAPHICS_DISABLED
267 if (textord_show_tables) {
268 ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
274
275 table_win = MakeWindow(100, 300, "Fragmented Text");
277 }
278#endif // GRAPHICS_DISABLED
279
280 // mark, filter, and smooth candidate table partitions
282
283 // Make single-column blocks from good_columns_ partitions. col_segments are
284 // moved to a grid later which takes the ownership
285 ColSegment_LIST column_blocks;
286 GetColumnBlocks(all_columns, &column_blocks);
287 // Set the ratio of candidate table partitions in each column
288 SetColumnsType(&column_blocks);
289
290 // Move column segments to col_seg_grid_
291 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
292
293 // Detect split in column layout that might have occurred due to the
294 // presence of a table. In such a case, merge the corresponding columns.
296
297 // Group horizontally overlapping table partitions into table columns.
298 // table_columns created here get deleted at the end of this method.
299 ColSegment_LIST table_columns;
300 GetTableColumns(&table_columns);
301
302 // Within each column, mark the range table regions occupy based on the
303 // table columns detected. table_regions are moved to a grid later which
304 // takes the ownership
305 ColSegment_LIST table_regions;
306 GetTableRegions(&table_columns, &table_regions);
307
308#ifndef GRAPHICS_DISABLED
309 if (textord_tablefind_show_mark) {
310 ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
311 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
312 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
313 }
314#endif // GRAPHICS_DISABLED
315
316 // Merge table regions across columns for tables spanning multiple
317 // columns
318 MoveColSegmentsToGrid(&table_regions, &table_grid_);
320
321 // Adjust table boundaries by including nearby horizontal lines and left
322 // out column headers
325
326 if (textord_tablefind_recognize_tables) {
327 // Remove false alarms consiting of a single column
329
330#ifndef GRAPHICS_DISABLED
331 if (textord_show_tables) {
332 ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
334 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
335 table_grid_.DisplayBoxes(table_win);
336 }
337#endif // GRAPHICS_DISABLED
338
339 // Find table grid structure and reject tables that are malformed.
343
344#ifndef GRAPHICS_DISABLED
345 if (textord_show_tables) {
346 ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
349 table_grid_.DisplayBoxes(table_win);
350 }
351#endif // GRAPHICS_DISABLED
352 } else {
353 // Remove false alarms consiting of a single column
354 // TODO(nbeato): verify this is a NOP after structured table rejection.
355 // Right now it isn't. If the recognize function is doing what it is
356 // supposed to do, this function is obsolete.
358
359#ifndef GRAPHICS_DISABLED
360 if (textord_show_tables) {
361 ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
364 table_grid_.DisplayBoxes(table_win);
365 }
366#endif // GRAPHICS_DISABLED
367 }
368
369 // Merge all colpartitions in table regions to make them a single
370 // colpartition and revert types of isolated table cells not
371 // assigned to any table to their original types.
372 MakeTableBlocks(grid, all_columns, width_cb);
373}
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:613
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1871
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:1997
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1273
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1323
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:579
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:523
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1176
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1143
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1950
@ DARK_TURQUOISE
Definition: scrollview.h:147

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb 
)
protected

Definition at line 1997 of file tablefind.cpp.

1999 {
2000 // Since we have table blocks already, remove table tags from all
2001 // colpartitions
2002 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2003 gsearch(grid);
2004 gsearch.StartFullSearch();
2005 ColPartition* part = nullptr;
2006
2007 while ((part = gsearch.NextFullSearch()) != nullptr) {
2008 if (part->type() == PT_TABLE) {
2009 part->clear_table_type();
2010 }
2011 }
2012 // Now make a single colpartition out of each table block and remove
2013 // all colpartitions contained within a table
2014 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2015 table_search(&table_grid_);
2016 table_search.StartFullSearch();
2017 ColSegment* table;
2018 while ((table = table_search.NextFullSearch()) != nullptr) {
2019 const TBOX& table_box = table->bounding_box();
2020 // Start a rect search on table_box
2021 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2022 rectsearch(grid);
2023 rectsearch.StartRectSearch(table_box);
2024 ColPartition* part;
2025 ColPartition* table_partition = nullptr;
2026 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2027 // Do not consider image partitions
2028 if (!part->IsTextType())
2029 continue;
2030 TBOX part_box = part->bounding_box();
2031 // Include partition in the table if more than half of it
2032 // is covered by the table
2033 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2034 rectsearch.RemoveBBox();
2035 if (table_partition) {
2036 table_partition->Absorb(part, width_cb);
2037 } else {
2038 table_partition = part;
2039 }
2040 }
2041 }
2042 // Insert table colpartition back to part_grid_
2043 if (table_partition) {
2044 // To match the columns used when transforming to blocks, the new table
2045 // partition must have its first and last column set at the grid y that
2046 // corresponds to its bottom.
2047 const TBOX& table_box = table_partition->bounding_box();
2048 int grid_x, grid_y;
2049 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2050 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2051 table_partition->set_table_type();
2052 table_partition->set_blob_type(BRT_TEXT);
2053 table_partition->set_flow(BTFT_CHAIN);
2054 table_partition->SetBlobTypes();
2055 grid->InsertBBox(true, true, table_partition);
2056 }
2057 }
2058}
@ BTFT_CHAIN
Definition: blobbox.h:118
@ BRT_TEXT
Definition: blobbox.h:80

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 518 of file tablefind.cpp.

518 {
519 return clean_part_grid_.MakeWindow(x, y, window_name);
520}
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 827 of file tablefind.cpp.

827 {
828 // Iterate the ColPartitions in the grid.
829 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
830 gsearch(&clean_part_grid_);
831 gsearch.StartFullSearch();
832 ColPartition* part = nullptr;
833 while ((part = gsearch.NextFullSearch()) != nullptr) {
834 if (!part->IsTextType()) // Only consider text partitions
835 continue;
836 // Only consider partitions in dominant font size or smaller
837 if (part->median_height() > kMaxTableCellXheight * global_median_xheight_)
838 continue;
839 // Mark partitions with a large gap, or no significant gap as
840 // table partitions.
841 // Comments: It produces several false alarms at:
842 // - last line of a paragraph (fixed)
843 // - single word section headings
844 // - page headers and footers
845 // - numbered equations
846 // - line drawing regions
847 // TODO(faisal): detect and fix above-mentioned cases
848 if (HasWideOrNoInterWordGap(part) ||
849 HasLeaderAdjacent(*part)) {
850 part->set_table_type();
851 }
852 }
853}
const double kMaxTableCellXheight
Definition: tablefind.cpp:80
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:857
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:946

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 789 of file tablefind.cpp.

789 {
791 if (textord_tablefind_show_mark) {
792 ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
796 }
798 if (textord_tablefind_show_mark) {
799 ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
803 }
805 if (textord_tablefind_show_mark) {
806 ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
810 }
812 if (textord_tablefind_show_mark || textord_show_tables) {
813 ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
817 }
818}
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:827

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid col_seg_grid 
)
protected

Definition at line 1176 of file tablefind.cpp.

1177 {
1178 ColSegment_IT it(segments);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180 ColSegment* seg = it.extract();
1181 col_seg_grid->InsertBBox(true, true, seg);
1182 }
1183}

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1817 of file tablefind.cpp.

1817 {
1818 ScrollView* table_win = nullptr;
1819 if (textord_show_tables) {
1820 table_win = MakeWindow(0, 0, "Table Structure");
1823 // table_grid_.DisplayBoxes(table_win);
1824 }
1825
1826
1827 TableRecognizer recognizer;
1828 recognizer.Init();
1829 recognizer.set_line_grid(&leader_and_ruling_grid_);
1830 recognizer.set_text_grid(&fragmented_text_grid_);
1831 recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1832 recognizer.set_min_height(1.5 * gridheight());
1833 // Loop over all of the tables and try to fit them.
1834 // Store the good tables here.
1835 ColSegment_CLIST good_tables;
1836 ColSegment_C_IT good_it(&good_tables);
1837
1839 gsearch.StartFullSearch();
1840 ColSegment* found_table = nullptr;
1841 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1842 gsearch.RemoveBBox();
1843
1844 // The goal is to make the tables persistent in a list.
1845 // When that happens, this will move into the search loop.
1846 const TBOX& found_box = found_table->bounding_box();
1847 StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1848
1849 // Process a table. Good tables are inserted into the grid again later on
1850 // We can't change boxes in the grid while it is running a search.
1851 if (table_structure != nullptr) {
1852 if (textord_show_tables) {
1853 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1854 }
1855 found_table->set_bounding_box(table_structure->bounding_box());
1856 delete table_structure;
1857 good_it.add_after_then_move(found_table);
1858 } else {
1859 delete found_table;
1860 }
1861 }
1862 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1863
1864 // At this point, the grid is empty. We can safely insert the good tables
1865 // back into grid.
1866 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1867 table_grid_.InsertBBox(true, true, good_it.extract());
1868}

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 759 of file tablefind.cpp.

759 {
761}

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 762 of file tablefind.cpp.

762 {
763 global_median_ledding_ = ledding;
764}

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 756 of file tablefind.cpp.

756 {
757 global_median_xheight_ = xheight;
758}

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 177 of file tablefind.cpp.

177 {
179}

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

138 {
139 resolution_ = resolution;
140 }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1143 of file tablefind.cpp.

1143 {
1144 ColSegment_IT it(column_blocks);
1145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1146 ColSegment* seg = it.data();
1147 TBOX box = seg->bounding_box();
1148 int num_table_cells = 0;
1149 int num_text_cells = 0;
1150 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1151 rsearch(&clean_part_grid_);
1152 rsearch.SetUniqueMode(true);
1153 rsearch.StartRectSearch(box);
1154 ColPartition* part = nullptr;
1155 while ((part = rsearch.NextRectSearch()) != nullptr) {
1156 if (part->type() == PT_TABLE) {
1157 num_table_cells++;
1158 } else if (part->type() == PT_FLOWING_TEXT) {
1159 num_text_cells++;
1160 }
1161 }
1162 // If a column block has no text or table partition in it, it is not needed
1163 // for table detection.
1164 if (!num_table_cells && !num_text_cells) {
1165 delete it.extract();
1166 } else {
1167 seg->set_num_table_cells(num_table_cells);
1168 seg->set_num_text_cells(num_text_cells);
1169 // set column type based on the ratio of table to text cells
1170 seg->set_type();
1171 }
1172 }
1173}

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid grid)
protected

Definition at line 709 of file tablefind.cpp.

709 {
710 STATS xheight_stats(0, kMaxVerticalSpacing + 1);
711 STATS width_stats(0, kMaxBlobWidth + 1);
712 STATS ledding_stats(0, kMaxVerticalSpacing + 1);
713 // Iterate the ColPartitions in the grid.
714 ColPartitionGridSearch gsearch(grid);
715 gsearch.SetUniqueMode(true);
716 gsearch.StartFullSearch();
717 ColPartition* part = nullptr;
718 while ((part = gsearch.NextFullSearch()) != nullptr) {
719 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
720 // ComputeLimits needs to get called somewhere outside of TableFinder
721 // to make sure the partitions are properly initialized.
722 // When this is called, SmoothPartitionPartners dies in an assert after
723 // table find runs. Alternative solution.
724 // part->ComputeLimits();
725 if (part->IsTextType()) {
726 // xheight_stats.add(part->median_height(), part->boxes_count());
727 // width_stats.add(part->median_width(), part->boxes_count());
728
729 // This loop can be removed when above issues are fixed.
730 // Replace it with the 2 lines commented out above.
731 BLOBNBOX_C_IT it(part->boxes());
732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733 xheight_stats.add(it.data()->bounding_box().height(), 1);
734 width_stats.add(it.data()->bounding_box().width(), 1);
735 }
736
737 ledding_stats.add(part->space_above(), 1);
738 ledding_stats.add(part->space_below(), 1);
739 }
740 }
741 // Set estimates based on median of statistics obtained
742 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
743 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
744 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
745 #ifndef GRAPHICS_DISABLED
746 if (textord_tablefind_show_stats) {
747 const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
748 ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
749 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
750 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
751 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
752 }
753 #endif // GRAPHICS_DISABLED
754}
const int kMaxVerticalSpacing
Definition: tablefind.cpp:37
const int kMaxBlobWidth
Definition: tablefind.cpp:39
Definition: statistc.h:31
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:759
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:756
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:762

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid grid,
ColPartitionSet **  all_columns 
)
staticprotected

Definition at line 586 of file tablefind.cpp.

587 {
588 // Iterate the ColPartitions in the grid.
589 ColPartitionGridSearch gsearch(grid);
590 gsearch.StartFullSearch();
591 ColPartition* part = nullptr;
592 while ((part = gsearch.NextFullSearch()) != nullptr) {
593 ColPartitionSet* columns = all_columns[gsearch.GridY()];
594 TBOX box = part->bounding_box();
595 int y = part->MidY();
596 ColPartition* left_column = columns->ColumnContaining(box.left(), y);
597 ColPartition* right_column = columns->ColumnContaining(box.right(), y);
598 // set distance from left column as space to the left
599 if (left_column) {
600 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
601 part->set_space_to_left(left_space);
602 }
603 // set distance from right column as space to the right
604 if (right_column) {
605 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
606 part->set_space_to_right(right_space);
607 }
608
609 // Look for images that may be closer.
610 // NOTE: used to be part_grid_, might cause issues now
611 ColPartitionGridSearch hsearch(grid);
612 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
613 ColPartition* neighbor = nullptr;
614 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
615 if (neighbor->type() == PT_PULLOUT_IMAGE ||
616 neighbor->type() == PT_FLOWING_IMAGE ||
617 neighbor->type() == PT_HEADING_IMAGE) {
618 int right = neighbor->bounding_box().right();
619 if (right < box.left()) {
620 int space = std::min(box.left() - right, part->space_to_left());
621 part->set_space_to_left(space);
622 }
623 }
624 }
625 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626 neighbor = nullptr;
627 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
628 if (neighbor->type() == PT_PULLOUT_IMAGE ||
629 neighbor->type() == PT_FLOWING_IMAGE ||
630 neighbor->type() == PT_HEADING_IMAGE) {
631 int left = neighbor->bounding_box().left();
632 if (left > box.right()) {
633 int space = std::min(left - box.right(), part->space_to_right());
634 part->set_space_to_right(space);
635 }
636 }
637 }
638
639 ColPartition* upper_part = part->SingletonPartner(true);
640 if (upper_part) {
641 int space = std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
642 part->bounding_box().bottom()));
643 part->set_space_above(space);
644 } else {
645 // TODO(nbeato): What constitutes a good value?
646 // 0 is the default value when not set, explicitly noting it needs to
647 // be something else.
648 part->set_space_above(INT32_MAX);
649 }
650
651 ColPartition* lower_part = part->SingletonPartner(false);
652 if (lower_part) {
653 int space = std::max(0, static_cast<int>(part->bounding_box().bottom() -
654 lower_part->bounding_box().bottom()));
655 part->set_space_below(space);
656 } else {
657 // TODO(nbeato): What constitutes a good value?
658 // 0 is the default value when not set, explicitly noting it needs to
659 // be something else.
660 part->set_space_below(INT32_MAX);
661 }
662 }
663}
@ PT_PULLOUT_IMAGE
Definition: capi.h:140
@ PT_HEADING_IMAGE
Definition: capi.h:139
@ PT_FLOWING_IMAGE
Definition: capi.h:138

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition part)
protected

Definition at line 666 of file tablefind.cpp.

666 {
667 TBOX box = part->bounding_box();
668 int top_range = std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
669 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing, static_cast<int>(bleft().y()));
670 box.set_top(top_range);
671 box.set_bottom(bottom_range);
672
673 TBOX part_box = part->bounding_box();
674 // Start a rect search
675 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
676 rectsearch(&clean_part_grid_);
677 rectsearch.StartRectSearch(box);
678 ColPartition* neighbor;
679 int min_space_above = kMaxVerticalSpacing;
680 int min_space_below = kMaxVerticalSpacing;
681 ColPartition* above_neighbor = nullptr;
682 ColPartition* below_neighbor = nullptr;
683 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
684 if (neighbor == part)
685 continue;
686 TBOX neighbor_box = neighbor->bounding_box();
687 if (neighbor_box.major_x_overlap(part_box)) {
688 int gap = abs(part->median_bottom() - neighbor->median_bottom());
689 // If neighbor is below current partition
690 if (neighbor_box.top() < part_box.bottom() &&
691 gap < min_space_below) {
692 min_space_below = gap;
693 below_neighbor = neighbor;
694 } // If neighbor is above current partition
695 else if (part_box.top() < neighbor_box.bottom() &&
696 gap < min_space_above) {
697 min_space_above = gap;
698 above_neighbor = neighbor;
699 }
700 }
701 }
702 part->set_space_above(min_space_above);
703 part->set_space_below(min_space_below);
704 part->set_nearest_neighbor_above(above_neighbor);
705 part->set_nearest_neighbor_below(below_neighbor);
706}

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1108 of file tablefind.cpp.

1108 {
1109 // Iterate the ColPartitions in the grid.
1111 gsearch.StartFullSearch();
1112 ColPartition* part = nullptr;
1113 while ((part = gsearch.NextFullSearch()) != nullptr) {
1114 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1115 continue; // Consider only text partitions
1116 ColPartition* upper_part = part->nearest_neighbor_above();
1117 ColPartition* lower_part = part->nearest_neighbor_below();
1118 if (!upper_part || !lower_part)
1119 continue;
1120 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1121 part->set_table_type();
1122 }
1123
1124 // Pass 2, do the opposite. If both the upper and lower neighbors
1125 // exist and are not tables, this probably shouldn't be a table.
1126 gsearch.StartFullSearch();
1127 part = nullptr;
1128 while ((part = gsearch.NextFullSearch()) != nullptr) {
1129 if (part->type() != PT_TABLE)
1130 continue; // Consider only text partitions
1131 ColPartition* upper_part = part->nearest_neighbor_above();
1132 ColPartition* lower_part = part->nearest_neighbor_below();
1133
1134 // table can't be by itself
1135 if ((upper_part && upper_part->type() != PT_TABLE) &&
1136 (lower_part && lower_part->type() != PT_TABLE)) {
1137 part->clear_table_type();
1138 }
1139 }
1140}
@ PT_UNKNOWN
Definition: capi.h:129

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 436 of file tablefind.cpp.

436 {
437 ASSERT_HOST(part != nullptr);
438 // Bye bye empty partitions!
439 if (part->boxes()->empty()) {
440 delete part;
441 return;
442 }
443
444 // The AllowBlob function prevents this.
445 ASSERT_HOST(part->median_width() > 0);
446 const double kThreshold = part->median_width() * kSplitPartitionSize;
447
448 ColPartition* right_part = part;
449 bool found_split = true;
450 while (found_split) {
451 found_split = false;
452 BLOBNBOX_C_IT box_it(right_part->boxes());
453 // Blobs are sorted left side first. If blobs overlap,
454 // the previous blob may have a "more right" right side.
455 // Account for this by always keeping the largest "right"
456 // so far.
457 int previous_right = INT32_MIN;
458
459 // Look for the next split in the partition.
460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461 const TBOX& box = box_it.data()->bounding_box();
462 if (previous_right != INT32_MIN &&
463 box.left() - previous_right > kThreshold) {
464 // We have a split position. Split the partition in two pieces.
465 // Insert the left piece in the grid and keep processing the right.
466 int mid_x = (box.left() + previous_right) / 2;
467 ColPartition* left_part = right_part;
468 right_part = left_part->SplitAt(mid_x);
469
471 found_split = true;
472 break;
473 }
474
475 // The right side of the previous blobs.
476 previous_right = std::max(previous_right, static_cast<int>(box.right()));
477 }
478 }
479 // When a split is not found, the right part is minimized
480 // as much as possible, so process it.
482}
const double kSplitPartitionSize
Definition: tablefind.cpp:43
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:402

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 390 of file tablefind.cpp.

390 {
391 return clean_part_grid_.tright();
392}
const ICOORD & tright() const
Definition: bbgrid.h:75

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 413 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 421 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 419 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 407 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 409 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 405 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 415 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 425 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 403 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 423 of file tablefind.h.


The documentation for this class was generated from the following files: