tesseract 4.1.1
Loading...
Searching...
No Matches
tabvector.cpp
Go to the documentation of this file.
1
2// File: tabvector.cpp
3// Description: Class to hold a near-vertical vector representing a tab-stop.
4// Author: Ray Smith
5//
6// (C) Copyright 2008, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20#include "config_auto.h"
21#endif
22
23#include "tabvector.h"
24#include "blobbox.h"
25#include "colfind.h"
26#include "colpartitionset.h"
27#include "detlinefit.h"
28#include "statistc.h"
29
30#include <algorithm>
31
32namespace tesseract {
33
34// Multiple of height used as a gutter for evaluation search.
35const int kGutterMultiple = 4;
36// Multiple of neighbour gap that we expect the gutter gap to be at minimum.
38// Pixel distance for tab vectors to be considered the same.
39const int kSimilarVectorDist = 10;
40// Pixel distance for ragged tab vectors to be considered the same if there
41// is nothing in the overlap box
42const int kSimilarRaggedDist = 50;
43// Max multiple of height to allow filling in between blobs when evaluating.
44const int kMaxFillinMultiple = 11;
45// Min fraction of mean gutter size to allow a gutter on a good tab blob.
46const double kMinGutterFraction = 0.5;
47// Multiple of 1/n lines as a minimum gutter in evaluation.
48const double kLineCountReciprocal = 4.0;
49// Constant add-on for minimum gutter for aligned tabs.
50const double kMinAlignedGutter = 0.25;
51// Constant add-on for minimum gutter for ragged tabs.
52const double kMinRaggedGutter = 1.5;
53
55 "max fraction of mean blob width allowed for vertical gaps in vertical text");
56
58 "Fraction of box matches required to declare a line vertical");
59
61
62// Create a constraint for the top or bottom of this TabVector.
63void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
64 auto* constraint = new TabConstraint(vector, is_top);
65 auto* constraints = new TabConstraint_LIST;
66 TabConstraint_IT it(constraints);
67 it.add_to_end(constraint);
68 if (is_top)
69 vector->set_top_constraints(constraints);
70 else
71 vector->set_bottom_constraints(constraints);
72}
73
74// Test to see if the constraints are compatible enough to merge.
75bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
76 TabConstraint_LIST* list2) {
77 if (list1 == list2)
78 return false;
79 int y_min = -INT32_MAX;
80 int y_max = INT32_MAX;
82 tprintf("Testing constraint compatibility\n");
83 GetConstraints(list1, &y_min, &y_max);
84 GetConstraints(list2, &y_min, &y_max);
86 tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
87 return y_max >= y_min;
88}
89
90// Merge the lists of constraints and update the TabVector pointers.
91// The second list is deleted.
92void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
93 TabConstraint_LIST* list2) {
94 if (list1 == list2)
95 return;
96 TabConstraint_IT it(list2);
98 tprintf("Merging constraints\n");
99 // The vectors of all constraints on list2 are now going to be on list1.
100 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
101 TabConstraint* constraint = it.data();
103 constraint->vector_->Print("Merge");
104 if (constraint->is_top_)
105 constraint->vector_->set_top_constraints(list1);
106 else
107 constraint->vector_->set_bottom_constraints(list1);
108 }
109 it = list1;
110 it.add_list_before(list2);
111 delete list2;
112}
113
114// Set all the tops and bottoms as appropriate to a mean of the
115// constrained range. Delete all the constraints and list.
116void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
117 int y_min = -INT32_MAX;
118 int y_max = INT32_MAX;
119 GetConstraints(constraints, &y_min, &y_max);
120 int y = (y_min + y_max) / 2;
121 TabConstraint_IT it(constraints);
122 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
123 TabConstraint* constraint = it.data();
124 TabVector* v = constraint->vector_;
125 if (constraint->is_top_) {
126 v->SetYEnd(y);
127 v->set_top_constraints(nullptr);
128 } else {
129 v->SetYStart(y);
130 v->set_bottom_constraints(nullptr);
131 }
132 }
133 delete constraints;
134}
135
136TabConstraint::TabConstraint(TabVector* vector, bool is_top)
137 : vector_(vector), is_top_(is_top) {
138 if (is_top) {
139 y_min_ = vector->endpt().y();
140 y_max_ = vector->extended_ymax();
141 } else {
142 y_max_ = vector->startpt().y();
143 y_min_ = vector->extended_ymin();
144 }
145}
146
147// Get the max of the mins and the min of the maxes.
148void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
149 int* y_min, int* y_max) {
150 TabConstraint_IT it(constraints);
151 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
152 TabConstraint* constraint = it.data();
153 if (textord_debug_tabfind > 3) {
154 tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
155 constraint->vector_->Print(" for");
156 }
157 *y_min = std::max(*y_min, constraint->y_min_);
158 *y_max = std::min(*y_max, constraint->y_max_);
159 }
160}
161
162ELIST2IZE(TabVector)
163CLISTIZE(TabVector)
164
165// The constructor is private. See the bottom of the file...
166
167
168// Public factory to build a TabVector from a list of boxes.
169// The TabVector will be of the given alignment type.
170// The input vertical vector is used in fitting, and the output
171// vertical_x, vertical_y have the resulting line vector added to them
172// if the alignment is not ragged.
173// The extended_start_y and extended_end_y are the maximum possible
174// extension to the line segment that can be used to align with others.
175// The input CLIST of BLOBNBOX good_points is consumed and taken over.
176TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical,
177 int extended_start_y, int extended_end_y,
178 BLOBNBOX_CLIST* good_points,
179 int* vertical_x, int* vertical_y) {
180 auto* vector = new TabVector(extended_start_y, extended_end_y,
181 alignment, good_points);
182 if (!vector->Fit(vertical, false)) {
183 delete vector;
184 return nullptr;
185 }
186 if (!vector->IsRagged()) {
187 vertical = vector->endpt_ - vector->startpt_;
188 int weight = vector->BoxCount();
189 *vertical_x += vertical.x() * weight;
190 *vertical_y += vertical.y() * weight;
191 }
192 return vector;
193}
194
195// Build a ragged TabVector by copying another's direction, shifting it
196// to match the given blob, and making its initial extent the height
197// of the blob, but its extended bounds from the bounds of the original.
199 const ICOORD& vertical_skew, BLOBNBOX* blob)
200 : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
201 needs_refit_(true), needs_evaluation_(true),
202 alignment_(alignment) {
203 BLOBNBOX_C_IT it(&boxes_);
204 it.add_to_end(blob);
205 TBOX box = blob->bounding_box();
206 if (IsLeftTab()) {
207 startpt_ = box.botleft();
208 endpt_ = box.topleft();
209 } else {
210 startpt_ = box.botright();
211 endpt_ = box.topright();
212 }
213 sort_key_ = SortKey(vertical_skew,
214 (startpt_.x() + endpt_.x()) / 2,
215 (startpt_.y() + endpt_.y()) / 2);
216 if (textord_debug_tabfind > 3)
217 Print("Constructed a new tab vector:");
218}
219
220// Copies basic attributes of a tab vector for simple operations.
221// Copies things such startpt, endpt, range.
222// Does not copy things such as partners, boxes, or constraints.
223// This is useful if you only need vector information for processing, such
224// as in the table detection code.
226 auto* copy = new TabVector();
227 copy->startpt_ = startpt_;
228 copy->endpt_ = endpt_;
229 copy->alignment_ = alignment_;
230 copy->extended_ymax_ = extended_ymax_;
231 copy->extended_ymin_ = extended_ymin_;
232 copy->intersects_other_lines_ = intersects_other_lines_;
233 return copy;
234}
235
236// Extend this vector to include the supplied blob if it doesn't
237// already have it.
239 TBOX new_box = new_blob->bounding_box();
240 BLOBNBOX_C_IT it(&boxes_);
241 if (!it.empty()) {
242 BLOBNBOX* blob = it.data();
243 TBOX box = blob->bounding_box();
244 while (!it.at_last() && box.top() <= new_box.top()) {
245 if (blob == new_blob)
246 return; // We have it already.
247 it.forward();
248 blob = it.data();
249 box = blob->bounding_box();
250 }
251 if (box.top() >= new_box.top()) {
252 it.add_before_stay_put(new_blob);
253 needs_refit_ = true;
254 return;
255 }
256 }
257 needs_refit_ = true;
258 it.add_after_stay_put(new_blob);
259}
260
261// Set the ycoord of the start and move the xcoord to match.
262void TabVector::SetYStart(int start_y) {
263 startpt_.set_x(XAtY(start_y));
264 startpt_.set_y(start_y);
265}
266// Set the ycoord of the end and move the xcoord to match.
267void TabVector::SetYEnd(int end_y) {
268 endpt_.set_x(XAtY(end_y));
269 endpt_.set_y(end_y);
270}
271
272// Rotate the ends by the given vector. Auto flip start and end if needed.
273void TabVector::Rotate(const FCOORD& rotation) {
274 startpt_.rotate(rotation);
275 endpt_.rotate(rotation);
276 int dx = endpt_.x() - startpt_.x();
277 int dy = endpt_.y() - startpt_.y();
278 if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
279 // Need to flip start/end.
280 ICOORD tmp = startpt_;
281 startpt_ = endpt_;
282 endpt_ = tmp;
283 }
284}
285
286// Setup the initial constraints, being the limits of
287// the vector and the extended ends.
291}
292
293// Setup the constraints between the partners of this TabVector.
295 // With the first and last partner, we want a common bottom and top,
296 // respectively, and for each change of partner, we want a common
297 // top of first with bottom of next.
298 TabVector_C_IT it(&partners_);
299 TabVector* prev_partner = nullptr;
300 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
301 TabVector* partner = it.data();
302 if (partner->top_constraints_ == nullptr ||
303 partner->bottom_constraints_ == nullptr) {
304 partner->Print("Impossible: has no constraints");
305 Print("This vector has it as a partner");
306 continue;
307 }
308 if (prev_partner == nullptr) {
309 // This is the first partner, so common bottom.
310 if (TabConstraint::CompatibleConstraints(bottom_constraints_,
311 partner->bottom_constraints_))
312 TabConstraint::MergeConstraints(bottom_constraints_,
313 partner->bottom_constraints_);
314 } else {
315 // We need prev top to be common with partner bottom.
316 if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
317 partner->bottom_constraints_))
318 TabConstraint::MergeConstraints(prev_partner->top_constraints_,
319 partner->bottom_constraints_);
320 }
321 prev_partner = partner;
322 if (it.at_last()) {
323 // This is the last partner, so common top.
324 if (TabConstraint::CompatibleConstraints(top_constraints_,
325 partner->top_constraints_))
326 TabConstraint::MergeConstraints(top_constraints_,
327 partner->top_constraints_);
328 }
329 }
330}
331
332// Setup the constraints between this and its partner.
334 if (TabConstraint::CompatibleConstraints(bottom_constraints_,
335 partner->bottom_constraints_))
336 TabConstraint::MergeConstraints(bottom_constraints_,
337 partner->bottom_constraints_);
338 if (TabConstraint::CompatibleConstraints(top_constraints_,
339 partner->top_constraints_))
340 TabConstraint::MergeConstraints(top_constraints_,
341 partner->top_constraints_);
342}
343
344// Use the constraints to modify the top and bottom.
346 if (top_constraints_ != nullptr)
347 TabConstraint::ApplyConstraints(top_constraints_);
348 if (bottom_constraints_ != nullptr)
349 TabConstraint::ApplyConstraints(bottom_constraints_);
350}
351
352// Merge close tab vectors of the same side that overlap.
354 TabVector_LIST* vectors,
355 BlobGrid* grid) {
356 TabVector_IT it1(vectors);
357 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
358 TabVector* v1 = it1.data();
359 TabVector_IT it2(it1);
360 for (it2.forward(); !it2.at_first(); it2.forward()) {
361 TabVector* v2 = it2.data();
362 if (v2->SimilarTo(vertical, *v1, grid)) {
363 // Merge into the forward one, in case the combined vector now
364 // overlaps one in between.
366 v2->Print("Merging");
367 v1->Print("by deleting");
368 }
369 v2->MergeWith(vertical, it1.extract());
371 v2->Print("Producing");
372 }
373 ICOORD merged_vector = v2->endpt();
374 merged_vector -= v2->startpt();
375 if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
376 v2->Print("Garbage result of merge?");
377 }
378 break;
379 }
380 }
381 }
382}
383
384// Return true if this vector is the same side, overlaps, and close
385// enough to the other to be merged.
386bool TabVector::SimilarTo(const ICOORD& vertical,
387 const TabVector& other, BlobGrid* grid) const {
388 if ((IsRightTab() && other.IsRightTab()) ||
389 (IsLeftTab() && other.IsLeftTab())) {
390 // If they don't overlap, at least in extensions, then there is no chance.
391 if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
392 return false;
393 // A fast approximation to the scale factor of the sort_key_.
394 int v_scale = abs(vertical.y());
395 if (v_scale == 0)
396 v_scale = 1;
397 // If they are close enough, then OK.
398 if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
399 sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
400 return true;
401 // Ragged tabs get a bigger threshold.
402 if (!IsRagged() || !other.IsRagged() ||
403 sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
404 sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
405 return false;
406 if (grid == nullptr) {
407 // There is nothing else to test!
408 return true;
409 }
410 // If there is nothing in the rectangle between the vector that is going to
411 // move, and the place it is moving to, then they can be merged.
412 // Setup a vertical search for any blob.
413 const TabVector* mover = (IsRightTab() &&
414 sort_key_ < other.sort_key_) ? this : &other;
415 int top_y = mover->endpt_.y();
416 int bottom_y = mover->startpt_.y();
417 int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));
418 int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));
419 int shift = abs(sort_key_ - other.sort_key_) / v_scale;
420 if (IsRightTab()) {
421 right += shift;
422 } else {
423 left -= shift;
424 }
425
427 vsearch.StartVerticalSearch(left, right, top_y);
428 BLOBNBOX* blob;
429 while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {
430 const TBOX& box = blob->bounding_box();
431 if (box.top() > bottom_y)
432 return true; // Nothing found.
433 if (box.bottom() < top_y)
434 continue; // Doesn't overlap.
435 int left_at_box = XAtY(box.bottom());
436 int right_at_box = left_at_box;
437 if (IsRightTab())
438 right_at_box += shift;
439 else
440 left_at_box -= shift;
441 if (std::min(right_at_box, static_cast<int>(box.right())) > std::max(left_at_box, static_cast<int>(box.left())))
442 return false;
443 }
444 return true; // Nothing found.
445 }
446 return false;
447}
448
449// Eat the other TabVector into this and delete it.
450void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
451 extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);
452 extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);
453 if (other->IsRagged()) {
454 alignment_ = other->alignment_;
455 }
456 // Merge sort the two lists of boxes.
457 BLOBNBOX_C_IT it1(&boxes_);
458 BLOBNBOX_C_IT it2(&other->boxes_);
459 while (!it2.empty()) {
460 BLOBNBOX* bbox2 = it2.extract();
461 it2.forward();
462 TBOX box2 = bbox2->bounding_box();
463 BLOBNBOX* bbox1 = it1.data();
464 TBOX box1 = bbox1->bounding_box();
465 while (box1.bottom() < box2.bottom() && !it1.at_last()) {
466 it1.forward();
467 bbox1 = it1.data();
468 box1 = bbox1->bounding_box();
469 }
470 if (box1.bottom() < box2.bottom()) {
471 it1.add_to_end(bbox2);
472 } else if (bbox1 != bbox2) {
473 it1.add_before_stay_put(bbox2);
474 }
475 }
476 Fit(vertical, true);
477 other->Delete(this);
478}
479
480// Add a new element to the list of partner TabVectors.
481// Partners must be added in order of increasing y coordinate of the text line
482// that makes them partners.
483// Groups of identical partners are merged into one.
485 if (IsSeparator() || partner->IsSeparator())
486 return;
487 TabVector_C_IT it(&partners_);
488 if (!it.empty()) {
489 it.move_to_last();
490 if (it.data() == partner)
491 return;
492 }
493 it.add_after_then_move(partner);
494}
495
496// Return true if other is a partner of this.
498 TabVector_C_IT it(&partners_);
499 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
500 if (it.data() == other)
501 return true;
502 }
503 return false;
504}
505
506// These names must be synced with the TabAlignment enum in tabvector.h.
507static const char* const kAlignmentNames[] = {
508 "Left Aligned",
509 "Left Ragged",
510 "Center",
511 "Right Aligned",
512 "Right Ragged",
513 "Separator"
514};
515
516// Print basic information about this tab vector.
517void TabVector::Print(const char* prefix) {
518 tprintf(
519 "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
520 " partners=%d\n",
521 prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(),
522 endpt_.x(), endpt_.y(), mean_width_, percent_score_, sort_key_,
523 boxes_.length(), partners_.length());
524}
525
526// Print basic information about this tab vector and every box in it.
527void TabVector::Debug(const char* prefix) {
528 Print(prefix);
529 BLOBNBOX_C_IT it(&boxes_);
530 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
531 BLOBNBOX* bbox = it.data();
532 const TBOX& box = bbox->bounding_box();
533 tprintf("Box at (%d,%d)->(%d,%d)\n",
534 box.left(), box.bottom(), box.right(), box.top());
535 }
536}
537
538// Draw this tabvector in place in the given window.
540#ifndef GRAPHICS_DISABLED
542 tab_win->Pen(ScrollView::BLUE);
543 else if (alignment_ == TA_LEFT_ALIGNED)
544 tab_win->Pen(ScrollView::LIME_GREEN);
545 else if (alignment_ == TA_LEFT_RAGGED)
546 tab_win->Pen(ScrollView::DARK_GREEN);
547 else if (alignment_ == TA_RIGHT_ALIGNED)
548 tab_win->Pen(ScrollView::PINK);
549 else if (alignment_ == TA_RIGHT_RAGGED)
550 tab_win->Pen(ScrollView::CORAL);
551 else
552 tab_win->Pen(ScrollView::WHITE);
553 tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
554 tab_win->Pen(ScrollView::GREY);
555 tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
556 tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
557 char score_buf[64];
558 snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
559 tab_win->TextAttributes("Times", 50, false, false, false);
560 tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
561#endif
562}
563
564// Refit the line and/or re-evaluate the vector if the dirty flags are set.
566 TabFind* finder) {
567 if (needs_refit_)
568 Fit(vertical, true);
569 if (needs_evaluation_)
570 Evaluate(vertical, finder);
571}
572
573// Evaluate the vector in terms of coverage of its length by good-looking
574// box edges. A good looking box is one where its nearest neighbour on the
575// inside is nearer than half the distance its nearest neighbour on the
576// outside of the putative column. Bad boxes are removed from the line.
577// A second pass then further filters boxes by requiring that the gutter
578// width be a minimum fraction of the mean gutter along the line.
579void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
580 bool debug = false;
581 needs_evaluation_ = false;
582 int length = endpt_.y() - startpt_.y();
583 if (length == 0 || boxes_.empty()) {
584 percent_score_ = 0;
585 Print("Zero length in evaluate");
586 return;
587 }
588 // Compute the mean box height.
589 BLOBNBOX_C_IT it(&boxes_);
590 int mean_height = 0;
591 int height_count = 0;
592 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
593 BLOBNBOX* bbox = it.data();
594 const TBOX& box = bbox->bounding_box();
595 int height = box.height();
596 mean_height += height;
597 ++height_count;
598 }
599 if (height_count > 0) mean_height /= height_count;
600 int max_gutter = kGutterMultiple * mean_height;
601 if (IsRagged()) {
602 // Ragged edges face a tougher test in that the gap must always be within
603 // the height of the blob.
604 max_gutter = kGutterToNeighbourRatio * mean_height;
605 }
606
607 STATS gutters(0, max_gutter + 1);
608 // Evaluate the boxes for their goodness, calculating the coverage as we go.
609 // Remove boxes that are not good and shorten the list to the first and
610 // last good boxes.
611 int num_deleted_boxes = 0;
612 bool text_on_image = false;
613 int good_length = 0;
614 const TBOX* prev_good_box = nullptr;
615 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
616 BLOBNBOX* bbox = it.data();
617 const TBOX& box = bbox->bounding_box();
618 int mid_y = (box.top() + box.bottom()) / 2;
619 if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
620 if (!debug) {
621 tprintf("After already deleting %d boxes, ", num_deleted_boxes);
622 Print("Starting evaluation");
623 }
624 debug = true;
625 }
626 // A good box is one where the nearest neighbour on the inside is closer
627 // than half the distance to the nearest neighbour on the outside
628 // (of the putative column).
629 bool left = IsLeftTab();
630 int tab_x = XAtY(mid_y);
631 int gutter_width;
632 int neighbour_gap;
633 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
634 bbox, &gutter_width, &neighbour_gap);
635 if (debug) {
636 tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
637 box.left(), box.bottom(), box.right(), box.top(),
638 gutter_width, neighbour_gap);
639 }
640 // Now we can make the test.
641 if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
642 // A good box contributes its height to the good_length.
643 good_length += box.top() - box.bottom();
644 gutters.add(gutter_width, 1);
645 // Two good boxes together contribute the gap between them
646 // to the good_length as well, as long as the gap is not
647 // too big.
648 if (prev_good_box != nullptr) {
649 int vertical_gap = box.bottom() - prev_good_box->top();
650 double size1 = sqrt(static_cast<double>(prev_good_box->area()));
651 double size2 = sqrt(static_cast<double>(box.area()));
652 if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2))
653 good_length += vertical_gap;
654 if (debug) {
655 tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
656 vertical_gap, kMaxFillinMultiple * std::min(size1, size2),
657 good_length);
658 }
659 } else {
660 // Adjust the start to the first good box.
661 SetYStart(box.bottom());
662 }
663 prev_good_box = &box;
664 if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
665 text_on_image = true;
666 } else {
667 // Get rid of boxes that are not good.
668 if (debug) {
669 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
670 box.left(), box.bottom(), box.right(), box.top(),
671 gutter_width, neighbour_gap);
672 }
673 it.extract();
674 ++num_deleted_boxes;
675 }
676 }
677 if (debug) {
678 Print("Evaluating:");
679 }
680 // If there are any good boxes, do it again, except this time get rid of
681 // boxes that have a gutter that is a small fraction of the mean gutter.
682 // This filters out ends that run into a coincidental gap in the text.
683 int search_top = endpt_.y();
684 int search_bottom = startpt_.y();
685 int median_gutter = IntCastRounded(gutters.median());
686 if (gutters.get_total() > 0) {
687 prev_good_box = nullptr;
688 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
689 BLOBNBOX* bbox = it.data();
690 const TBOX& box = bbox->bounding_box();
691 int mid_y = (box.top() + box.bottom()) / 2;
692 // A good box is one where the gutter width is at least some constant
693 // fraction of the mean gutter width.
694 bool left = IsLeftTab();
695 int tab_x = XAtY(mid_y);
696 int max_gutter = kGutterMultiple * mean_height;
697 if (IsRagged()) {
698 // Ragged edges face a tougher test in that the gap must always be
699 // within the height of the blob.
700 max_gutter = kGutterToNeighbourRatio * mean_height;
701 }
702 int gutter_width;
703 int neighbour_gap;
704 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
705 bbox, &gutter_width, &neighbour_gap);
706 // Now we can make the test.
707 if (gutter_width >= median_gutter * kMinGutterFraction) {
708 if (prev_good_box == nullptr) {
709 // Adjust the start to the first good box.
710 SetYStart(box.bottom());
711 search_bottom = box.top();
712 }
713 prev_good_box = &box;
714 search_top = box.bottom();
715 } else {
716 // Get rid of boxes that are not good.
717 if (debug) {
718 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
719 box.left(), box.bottom(), box.right(), box.top(),
720 gutter_width, median_gutter);
721 }
722 it.extract();
723 ++num_deleted_boxes;
724 }
725 }
726 }
727 // If there has been a good box, adjust the end.
728 if (prev_good_box != nullptr) {
729 SetYEnd(prev_good_box->top());
730 // Compute the percentage of the vector that is occupied by good boxes.
731 int length = endpt_.y() - startpt_.y();
732 percent_score_ = 100 * good_length / length;
733 if (num_deleted_boxes > 0) {
734 needs_refit_ = true;
735 FitAndEvaluateIfNeeded(vertical, finder);
736 if (boxes_.empty())
737 return;
738 }
739 // Test the gutter over the whole vector, instead of just at the boxes.
740 int required_shift;
741 if (search_bottom > search_top) {
742 search_bottom = startpt_.y();
743 search_top = endpt_.y();
744 }
745 double min_gutter_width = kLineCountReciprocal / boxes_.length();
746 min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
747 min_gutter_width *= mean_height;
748 int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
749 if (median_gutter > max_gutter_width)
750 max_gutter_width = median_gutter;
751 int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
752 text_on_image, max_gutter_width,
753 &required_shift);
754 if (gutter_width < min_gutter_width) {
755 if (debug) {
756 tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
757 gutter_width, min_gutter_width);
758 }
759 boxes_.shallow_clear();
760 percent_score_ = 0;
761 } else if (debug) {
762 tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
763 gutter_width, min_gutter_width, required_shift);
764 }
765 } else {
766 // There are no good boxes left, so score is 0.
767 percent_score_ = 0;
768 }
769
770 if (debug) {
771 Print("Evaluation complete:");
772 }
773}
774
775// (Re)Fit a line to the stored points. Returns false if the line
776// is degenerate. Althougth the TabVector code mostly doesn't care about the
777// direction of lines, XAtY would give silly results for a horizontal line.
778// The class is mostly aimed at use for vertical lines representing
779// horizontal tab stops.
780bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
781 needs_refit_ = false;
782 if (boxes_.empty()) {
783 // Don't refit something with no boxes, as that only happens
784 // in Evaluate, and we don't want to end up with a zero vector.
785 if (!force_parallel)
786 return false;
787 // If we are forcing parallel, then we just need to set the sort_key_.
788 ICOORD midpt = startpt_;
789 midpt += endpt_;
790 midpt /= 2;
791 sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
792 return startpt_.y() != endpt_.y();
793 }
794 if (!force_parallel && !IsRagged()) {
795 // Use a fitted line as the vertical.
796 DetLineFit linepoints;
797 BLOBNBOX_C_IT it(&boxes_);
798 // Fit a line to all the boxes in the list.
799 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
800 BLOBNBOX* bbox = it.data();
801 const TBOX& box = bbox->bounding_box();
802 int x1 = IsRightTab() ? box.right() : box.left();
803 ICOORD boxpt(x1, box.bottom());
804 linepoints.Add(boxpt);
805 if (it.at_last()) {
806 ICOORD top_pt(x1, box.top());
807 linepoints.Add(top_pt);
808 }
809 }
810 linepoints.Fit(&startpt_, &endpt_);
811 if (startpt_.y() != endpt_.y()) {
812 vertical = endpt_;
813 vertical -= startpt_;
814 }
815 }
816 int start_y = startpt_.y();
817 int end_y = endpt_.y();
818 sort_key_ = IsLeftTab() ? INT32_MAX : -INT32_MAX;
819 BLOBNBOX_C_IT it(&boxes_);
820 // Choose a line parallel to the vertical such that all boxes are on the
821 // correct side of it.
822 mean_width_ = 0;
823 int width_count = 0;
824 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
825 BLOBNBOX* bbox = it.data();
826 const TBOX& box = bbox->bounding_box();
827 mean_width_ += box.width();
828 ++width_count;
829 int x1 = IsRightTab() ? box.right() : box.left();
830 // Test both the bottom and the top, as one will be more extreme, depending
831 // on the direction of skew.
832 int bottom_y = box.bottom();
833 int top_y = box.top();
834 int key = SortKey(vertical, x1, bottom_y);
835 if (IsLeftTab() == (key < sort_key_)) {
836 sort_key_ = key;
837 startpt_ = ICOORD(x1, bottom_y);
838 }
839 key = SortKey(vertical, x1, top_y);
840 if (IsLeftTab() == (key < sort_key_)) {
841 sort_key_ = key;
842 startpt_ = ICOORD(x1, top_y);
843 }
844 if (it.at_first())
845 start_y = bottom_y;
846 if (it.at_last())
847 end_y = top_y;
848 }
849 if (width_count > 0) {
850 mean_width_ = (mean_width_ + width_count - 1) / width_count;
851 }
852 endpt_ = startpt_ + vertical;
853 needs_evaluation_ = true;
854 if (start_y != end_y) {
855 // Set the ends of the vector to fully include the first and last blobs.
856 startpt_.set_x(XAtY(vertical, sort_key_, start_y));
857 startpt_.set_y(start_y);
858 endpt_.set_x(XAtY(vertical, sort_key_, end_y));
859 endpt_.set_y(end_y);
860 return true;
861 }
862 return false;
863}
864
865// Returns the singleton partner if there is one, or nullptr otherwise.
867 if (!partners_.singleton())
868 return nullptr;
869 TabVector_C_IT partner_it(&partners_);
870 TabVector* partner = partner_it.data();
871 return partner;
872}
873
874// Return the partner of this TabVector if the vector qualifies as
875// being a vertical text line, otherwise nullptr.
877 if (!partners_.singleton())
878 return nullptr;
879 TabVector_C_IT partner_it(&partners_);
880 TabVector* partner = partner_it.data();
881 BLOBNBOX_C_IT box_it1(&boxes_);
882 BLOBNBOX_C_IT box_it2(&partner->boxes_);
883 // Count how many boxes are also in the other list.
884 // At the same time, gather the mean width and median vertical gap.
885 if (textord_debug_tabfind > 1) {
886 Print("Testing for vertical text");
887 partner->Print(" partner");
888 }
889 int num_matched = 0;
890 int num_unmatched = 0;
891 int total_widths = 0;
892 int width = startpt().x() - partner->startpt().x();
893 if (width < 0)
894 width = -width;
895 STATS gaps(0, width * 2);
896 BLOBNBOX* prev_bbox = nullptr;
897 box_it2.mark_cycle_pt();
898 for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
899 BLOBNBOX* bbox = box_it1.data();
900 TBOX box = bbox->bounding_box();
901 if (prev_bbox != nullptr) {
902 gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
903 }
904 while (!box_it2.cycled_list() && box_it2.data() != bbox &&
905 box_it2.data()->bounding_box().bottom() < box.bottom()) {
906 box_it2.forward();
907 }
908 if (!box_it2.cycled_list() && box_it2.data() == bbox &&
909 bbox->region_type() >= BRT_UNKNOWN &&
910 (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN))
911 ++num_matched;
912 else
913 ++num_unmatched;
914 total_widths += box.width();
915 prev_bbox = bbox;
916 }
917 if (num_unmatched + num_matched == 0) return nullptr;
918 double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
919 double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
920 int min_box_match = static_cast<int>((num_matched + num_unmatched) *
922 bool is_vertical = (gaps.get_total() > 0 &&
923 num_matched >= min_box_match &&
924 gaps.median() <= max_gap);
925 if (textord_debug_tabfind > 1) {
926 tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
927 "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
928 gaps.get_total(), num_matched, num_unmatched, min_box_match,
929 gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
930 }
931 return (is_vertical) ? partner : nullptr;
932}
933
934// The constructor is private.
935TabVector::TabVector(int extended_ymin, int extended_ymax,
936 TabAlignment alignment, BLOBNBOX_CLIST* boxes)
937 : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
938 sort_key_(0), percent_score_(0), mean_width_(0),
939 needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
940 top_constraints_(nullptr), bottom_constraints_(nullptr) {
941 BLOBNBOX_C_IT it(&boxes_);
942 it.add_list_after(boxes);
943}
944
945// Delete this, but first, repoint all the partners to point to
946// replacement. If replacement is nullptr, then partner relationships
947// are removed.
948void TabVector::Delete(TabVector* replacement) {
949 TabVector_C_IT it(&partners_);
950 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
951 TabVector* partner = it.data();
952 TabVector_C_IT p_it(&partner->partners_);
953 // If partner already has replacement in its list, then make
954 // replacement null, and just remove this TabVector when we find it.
955 TabVector* partner_replacement = replacement;
956 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
957 TabVector* p_partner = p_it.data();
958 if (p_partner == partner_replacement) {
959 partner_replacement = nullptr;
960 break;
961 }
962 }
963 // Remove all references to this, and replace with replacement if not nullptr.
964 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
965 TabVector* p_partner = p_it.data();
966 if (p_partner == this) {
967 p_it.extract();
968 if (partner_replacement != nullptr)
969 p_it.add_before_stay_put(partner_replacement);
970 }
971 }
972 if (partner_replacement != nullptr) {
973 partner_replacement->AddPartner(partner);
974 }
975 }
976 delete this;
977}
978
979
980} // namespace tesseract.
@ BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:120
@ BRT_UNKNOWN
Definition: blobbox.h:78
#define CLISTIZE(CLASSNAME)
Definition: clst.h:891
#define ELISTIZE(CLASSNAME)
Definition: elst.h:931
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:939
int IntCastRounded(double x)
Definition: helpers.h:175
#define double_VAR(name, val, comment)
Definition: params.h:312
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
bool textord_debug_printable
Definition: alignedblob.cpp:33
int textord_debug_tabfind
Definition: alignedblob.cpp:27
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:55
const double kMinAlignedGutter
Definition: tabvector.cpp:50
const int kGutterMultiple
Definition: tabvector.cpp:35
const int kSimilarVectorDist
Definition: tabvector.cpp:39
const double kMinRaggedGutter
Definition: tabvector.cpp:52
@ TA_RIGHT_ALIGNED
Definition: tabvector.h:48
@ TA_RIGHT_RAGGED
Definition: tabvector.h:49
@ TA_LEFT_ALIGNED
Definition: tabvector.h:45
@ TA_LEFT_RAGGED
Definition: tabvector.h:46
const double kLineCountReciprocal
Definition: tabvector.cpp:48
const int kMaxFillinMultiple
Definition: tabvector.cpp:44
const double kMinGutterFraction
Definition: tabvector.cpp:46
const int kGutterToNeighbourRatio
Definition: tabvector.cpp:37
const int kSimilarRaggedDist
Definition: tabvector.cpp:42
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:58
BlobRegionType region_type() const
Definition: blobbox.h:283
const TBOX & bounding_box() const
Definition: blobbox.h:230
BlobTextFlowType flow() const
Definition: blobbox.h:295
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:51
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
integer coordinate
Definition: points.h:32
void set_x(int16_t xin)
rewrite function
Definition: points.h:61
int16_t y() const
access_function
Definition: points.h:56
void set_y(int16_t yin)
rewrite function
Definition: points.h:65
int16_t x() const
access function
Definition: points.h:52
void rotate(const FCOORD &vec)
Definition: points.h:536
Definition: points.h:189
Definition: rect.h:34
const ICOORD & botleft() const
Definition: rect.h:92
int16_t top() const
Definition: rect.h:58
ICOORD botright() const
Definition: rect.h:96
int16_t width() const
Definition: rect.h:115
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
ICOORD topleft() const
Definition: rect.h:100
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
const ICOORD & topright() const
Definition: rect.h:104
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
int32_t get_total() const
Definition: statistc.h:84
static bool WithinTestRegion(int detail_level, int x, int y)
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:788
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:802
void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap)
Definition: tabfind.cpp:208
int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, int max_gutter_width, int *required_shift)
Definition: tabfind.cpp:161
static void CreateConstraint(TabVector *vector, bool is_top)
Definition: tabvector.cpp:63
static void ApplyConstraints(TabConstraint_LIST *constraints)
Definition: tabvector.cpp:116
static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:92
static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:75
int XAtY(int y) const
Definition: tabvector.h:188
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:166
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:279
const ICOORD & endpt() const
Definition: tabvector.h:148
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:273
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:484
bool IsSeparator() const
Definition: tabvector.h:220
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:353
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:207
int extended_ymin() const
Definition: tabvector.h:154
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:497
bool IsRightTab() const
Definition: tabvector.h:216
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:876
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:579
int extended_ymax() const
Definition: tabvector.h:151
void SetupPartnerConstraints()
Definition: tabvector.cpp:294
void SetYEnd(int end_y)
Definition: tabvector.cpp:267
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:163
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:780
bool IsRagged() const
Definition: tabvector.h:228
void SetYStart(int start_y)
Definition: tabvector.cpp:262
bool IsLeftTab() const
Definition: tabvector.h:212
void Print(const char *prefix)
Definition: tabvector.cpp:517
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:565
const ICOORD & startpt() const
Definition: tabvector.h:145
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:539
TabVector * ShallowCopy() const
Definition: tabvector.cpp:225
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:450
void Debug(const char *prefix)
Definition: tabvector.cpp:527
TabVector * GetSinglePartner()
Definition: tabvector.cpp:866
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:238
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:386
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:635
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:652
void Pen(Color color)
Definition: scrollview.cpp:719