tesseract 4.1.1
Loading...
Searching...
No Matches
blobbox.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: blobbox.h (Formerly blobnbox.h)
3 * Description: Code for the textord blob class.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef BLOBBOX_H
20#define BLOBBOX_H
21
22#include <cinttypes> // for PRId32
23#include <cmath> // for std::sqrt
24#include <cstdint> // for int16_t, int32_t
25#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
26#include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
27#include "errcode.h" // for ASSERT_HOST
28#include "ocrblock.h" // for BLOCK
29#include "params.h" // for DoubleParam, double_VAR_H
30#include "pdblock.h" // for PDBLK
31#include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
32#include "quspline.h" // for QSPLINE
33#include "rect.h" // for TBOX
34#include "scrollview.h" // for ScrollView, ScrollView::Color
35#include "statistc.h" // for STATS
36#include "stepblob.h" // for C_BLOB
37#include "tprintf.h" // for tprintf
38#include "werd.h" // for WERD_LIST
39
40class C_OUTLINE;
41
42struct Pix;
43
45{
46 PITCH_DUNNO, // insufficient data
47 PITCH_DEF_FIXED, // definitely fixed
48 PITCH_MAYBE_FIXED, // could be
53};
54
55// The possible tab-stop types of each side of a BLOBNBOX.
56// The ordering is important, as it is used for deleting dead-ends in the
57// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
58// non-aligned, unset, or deleted members.
59enum TabType {
60 TT_NONE, // Not a tab.
61 TT_DELETED, // Not a tab after detailed analysis.
62 TT_MAYBE_RAGGED, // Initial designation of a tab-stop candidate.
63 TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
64 TT_CONFIRMED, // Aligned with neighbours.
65 TT_VLINE // Detected as a vertical line.
66};
67
68// The possible region types of a BLOBNBOX.
69// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
70// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
71// *Type static functions below.
73 BRT_NOISE, // Neither text nor image.
74 BRT_HLINE, // Horizontal separator line.
75 BRT_VLINE, // Vertical separator line.
76 BRT_RECTIMAGE, // Rectangular image.
77 BRT_POLYIMAGE, // Non-rectangular image.
78 BRT_UNKNOWN, // Not determined yet.
79 BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
80 BRT_TEXT, // Convincing text.
81
82 BRT_COUNT // Number of possibilities.
83};
84
85// enum for elements of arrays that refer to neighbours.
86// NOTE: keep in this order, so ^2 can be used to flip direction.
93};
94
95// enum for special type of text characters, such as math symbol or italic.
97 BSTT_NONE, // No special.
98 BSTT_ITALIC, // Italic style.
99 BSTT_DIGIT, // Digit symbols.
100 BSTT_MATH, // Mathematical symbols (not including digits).
101 BSTT_UNCLEAR, // Characters with low recognition rate.
102 BSTT_SKIP, // Characters that we skip labeling (usually too small).
105
107 return static_cast<BlobNeighbourDir>(dir ^ 2);
108}
109
110// BlobTextFlowType indicates the quality of neighbouring information
111// related to a chain of connected components, either horizontally or
112// vertically. Also used by ColPartition for the collection of blobs
113// within, which should all have the same value in most cases.
115 BTFT_NONE, // No text flow set yet.
116 BTFT_NONTEXT, // Flow too poor to be likely text.
117 BTFT_NEIGHBOURS, // Neighbours support flow in this direction.
118 BTFT_CHAIN, // There is a weak chain of text in this direction.
119 BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction.
120 BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
121 BTFT_LEADER, // Leader dots/dashes etc.
124
125// Returns true if type1 dominates type2 in a merge. Mostly determined by the
126// ordering of the enum, LEADER is weak and dominates nothing.
127// The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
128// this cannot be true if t1 == t2, so the result is undefined.
130 // LEADER always loses.
131 if (type1 == BTFT_LEADER) return false;
132 if (type2 == BTFT_LEADER) return true;
133 // With those out of the way, the ordering of the enum determines the result.
134 return type1 >= type2;
135}
136
137namespace tesseract {
138class ColPartition;
139}
140
141class BLOBNBOX;
144{
145 public:
147 ReInit();
148 }
149 explicit BLOBNBOX(C_BLOB *srcblob) {
150 box = srcblob->bounding_box();
151 ReInit();
152 cblob_ptr = srcblob;
153 area = static_cast<int>(srcblob->area());
154 }
156 if (owns_cblob_) delete cblob_ptr;
157 }
158 static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
159 auto* blob = new C_BLOB(outline);
160 return new BLOBNBOX(blob);
161 }
162
163 // Rotates the box and the underlying blob.
164 void rotate(FCOORD rotation);
165
166 // Methods that act on the box without touching the underlying blob.
167 // Reflect the box in the y-axis, leaving the underlying blob untouched.
169 // Rotates the box by the angle given by rotation.
170 // If the blob is a diacritic, then only small rotations for skew
171 // correction can be applied.
172 void rotate_box(FCOORD rotation);
173 // Moves just the box by the given vector.
175 if (IsDiacritic()) {
176 box.move(v);
177 base_char_top_ += v.y();
178 base_char_bottom_ += v.y();
179 } else {
180 box.move(v);
182 }
183 }
184 void merge(BLOBNBOX *nextblob);
185 void really_merge(BLOBNBOX* other);
186 void chop( // fake chop blob
187 BLOBNBOX_IT *start_it, // location of this
188 BLOBNBOX_IT *blob_it, // iterator
189 FCOORD rotation, // for landscape
190 float xheight); // line height
191
192 void NeighbourGaps(int gaps[BND_COUNT]) const;
193 void MinMaxGapsClipped(int* h_min, int* h_max,
194 int* v_min, int* v_max) const;
195 void CleanNeighbours();
196 // Returns positive if there is at least one side neighbour that has a
197 // similar stroke width and is not on the other side of a rule line.
198 int GoodTextBlob() const;
199 // Returns the number of side neighbours that are of type BRT_NOISE.
200 int NoisyNeighbours() const;
201
202 // Returns true if the blob is noise and has no owner.
203 bool DeletableNoise() const {
204 return owner() == nullptr && region_type() == BRT_NOISE;
205 }
206
207 // Returns true, and sets vert_possible/horz_possible if the blob has some
208 // feature that makes it individually appear to flow one way.
209 // eg if it has a high aspect ratio, yet has a complex shape, such as a
210 // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
212
213 // Returns true if there is no tabstop violation in merging this and other.
214 bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
215
216 // Returns true if other has a similar stroke width to this.
217 bool MatchingStrokeWidth(const BLOBNBOX& other,
218 double fractional_tolerance,
219 double constant_tolerance) const;
220
221 // Returns a bounding box of the outline contained within the
222 // given horizontal range.
223 TBOX BoundsWithinLimits(int left, int right);
224
225 // Estimates and stores the baseline position based on the shape of the
226 // outline.
228
229 // Simple accessors.
230 const TBOX& bounding_box() const {
231 return box;
232 }
233 // Set the bounding box. Use with caution.
234 // Normally use compute_bounding_box instead.
235 void set_bounding_box(const TBOX& new_box) {
236 box = new_box;
237 base_char_top_ = box.top();
238 base_char_bottom_ = box.bottom();
239 }
241 box = cblob_ptr->bounding_box();
242 base_char_top_ = box.top();
243 base_char_bottom_ = box.bottom();
244 baseline_y_ = box.bottom();
245 }
246 const TBOX& reduced_box() const {
247 return red_box;
248 }
249 void set_reduced_box(TBOX new_box) {
250 red_box = new_box;
251 reduced = true;
252 }
253 int32_t enclosed_area() const {
254 return area;
255 }
256 bool joined_to_prev() const {
257 return joined;
258 }
259 bool red_box_set() const {
260 return reduced;
261 }
262 int repeated_set() const {
263 return repeated_set_;
264 }
265 void set_repeated_set(int set_id) {
266 repeated_set_ = set_id;
267 }
268 C_BLOB *cblob() const {
269 return cblob_ptr;
270 }
272 return left_tab_type_;
273 }
274 void set_left_tab_type(TabType new_type) {
275 left_tab_type_ = new_type;
276 }
278 return right_tab_type_;
279 }
281 right_tab_type_ = new_type;
282 }
284 return region_type_;
285 }
287 region_type_ = new_type;
288 }
290 return spt_type_;
291 }
293 spt_type_ = new_type;
294 }
296 return flow_;
297 }
299 flow_ = value;
300 }
301 bool vert_possible() const {
302 return vert_possible_;
303 }
304 void set_vert_possible(bool value) {
305 vert_possible_ = value;
306 }
307 bool horz_possible() const {
308 return horz_possible_;
309 }
310 void set_horz_possible(bool value) {
311 horz_possible_ = value;
312 }
313 int left_rule() const {
314 return left_rule_;
315 }
316 void set_left_rule(int new_left) {
317 left_rule_ = new_left;
318 }
319 int right_rule() const {
320 return right_rule_;
321 }
322 void set_right_rule(int new_right) {
323 right_rule_ = new_right;
324 }
325 int left_crossing_rule() const {
326 return left_crossing_rule_;
327 }
328 void set_left_crossing_rule(int new_left) {
329 left_crossing_rule_ = new_left;
330 }
332 return right_crossing_rule_;
333 }
334 void set_right_crossing_rule(int new_right) {
335 right_crossing_rule_ = new_right;
336 }
337 float horz_stroke_width() const {
338 return horz_stroke_width_;
339 }
340 void set_horz_stroke_width(float width) {
341 horz_stroke_width_ = width;
342 }
343 float vert_stroke_width() const {
344 return vert_stroke_width_;
345 }
346 void set_vert_stroke_width(float width) {
347 vert_stroke_width_ = width;
348 }
349 float area_stroke_width() const {
350 return area_stroke_width_;
351 }
353 return owner_;
354 }
356 owner_ = new_owner;
357 }
358 bool leader_on_left() const {
359 return leader_on_left_;
360 }
361 void set_leader_on_left(bool flag) {
362 leader_on_left_ = flag;
363 }
364 bool leader_on_right() const {
365 return leader_on_right_;
366 }
367 void set_leader_on_right(bool flag) {
368 leader_on_right_ = flag;
369 }
371 return neighbours_[n];
372 }
374 return good_stroke_neighbours_[n];
375 }
377 neighbours_[n] = neighbour;
378 good_stroke_neighbours_[n] = good;
379 }
380 bool IsDiacritic() const {
381 return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
382 }
383 int base_char_top() const {
384 return base_char_top_;
385 }
386 int base_char_bottom() const {
387 return base_char_bottom_;
388 }
389 int baseline_position() const {
390 return baseline_y_;
391 }
392 int line_crossings() const {
393 return line_crossings_;
394 }
395 void set_line_crossings(int value) {
396 line_crossings_ = value;
397 }
398 void set_diacritic_box(const TBOX& diacritic_box) {
399 base_char_top_ = diacritic_box.top();
400 base_char_bottom_ = diacritic_box.bottom();
401 }
403 return base_char_blob_;
404 }
406 base_char_blob_ = blob;
407 }
408 void set_owns_cblob(bool value) { owns_cblob_ = value; }
409
410 bool UniquelyVertical() const {
411 return vert_possible_ && !horz_possible_;
412 }
413 bool UniquelyHorizontal() const {
414 return horz_possible_ && !vert_possible_;
415 }
416
417 // Returns true if the region type is text.
418 static bool IsTextType(BlobRegionType type) {
419 return type == BRT_TEXT || type == BRT_VERT_TEXT;
420 }
421 // Returns true if the region type is image.
422 static bool IsImageType(BlobRegionType type) {
423 return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
424 }
425 // Returns true if the region type is line.
426 static bool IsLineType(BlobRegionType type) {
427 return type == BRT_HLINE || type == BRT_VLINE;
428 }
429 // Returns true if the region type cannot be merged.
430 static bool UnMergeableType(BlobRegionType type) {
431 return IsLineType(type) || IsImageType(type);
432 }
433 // Helper to call CleanNeighbours on all blobs on the list.
434 static void CleanNeighbours(BLOBNBOX_LIST* blobs);
435 // Helper to delete all the deletable blobs on the list.
436 static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
437 // Helper to compute edge offsets for all the blobs on the list.
438 // See coutln.h for an explanation of edge offsets.
439 static void ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
440 BLOBNBOX_LIST* blobs);
441
442#ifndef GRAPHICS_DISABLED
443 // Helper to draw all the blobs on the list in the given body_colour,
444 // with child outlines in the child_colour.
445 static void PlotBlobs(BLOBNBOX_LIST* list,
446 ScrollView::Color body_colour,
447 ScrollView::Color child_colour,
448 ScrollView* win);
449 // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
450 // given list in the given body_colour, with child outlines in the
451 // child_colour.
452 static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
453 ScrollView::Color body_colour,
454 ScrollView::Color child_colour,
455 ScrollView* win);
456
458 BlobTextFlowType flow_type);
459
460 // Keep in sync with BlobRegionType.
462
463 void plot(ScrollView* window, // window to draw in
464 ScrollView::Color blob_colour, // for outer bits
465 ScrollView::Color child_colour); // for holes
466#endif
467
468 // Initializes members set by StrokeWidth and beyond, without discarding
469 // stored area and strokewidth values, which are expensive to calculate.
470 void ReInit() {
471 joined = false;
472 reduced = false;
473 repeated_set_ = 0;
474 left_tab_type_ = TT_NONE;
475 right_tab_type_ = TT_NONE;
476 region_type_ = BRT_UNKNOWN;
477 flow_ = BTFT_NONE;
478 spt_type_ = BSTT_SKIP;
479 left_rule_ = 0;
480 right_rule_ = 0;
481 left_crossing_rule_ = 0;
482 right_crossing_rule_ = 0;
483 if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr
484 && cblob()->perimeter()!=0)
485 area_stroke_width_ = 2.0f * area / cblob()->perimeter();
486 owner_ = nullptr;
487 base_char_top_ = box.top();
488 base_char_bottom_ = box.bottom();
489 baseline_y_ = box.bottom();
490 line_crossings_ = 0;
491 base_char_blob_ = nullptr;
492 horz_possible_ = false;
493 vert_possible_ = false;
494 leader_on_left_ = false;
495 leader_on_right_ = false;
497 }
498
500 for (int n = 0; n < BND_COUNT; ++n) {
501 neighbours_[n] = nullptr;
502 good_stroke_neighbours_[n] = false;
503 }
504 }
505
506 private:
507 C_BLOB* cblob_ptr = nullptr; // edgestep blob
508 TBOX box; // bounding box
509 TBOX red_box; // bounding box
510 int32_t area = 0; // enclosed area
511 int32_t repeated_set_ = 0; // id of the set of repeated blobs
512 TabType left_tab_type_ = TT_NONE; // Indicates tab-stop assessment
513 TabType right_tab_type_ = TT_NONE; // Indicates tab-stop assessment
514 BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to
515 BlobTextFlowType flow_ = BTFT_NONE; // Quality of text flow.
516 BlobSpecialTextType spt_type_; // Special text type.
517 bool joined = false; // joined to prev
518 bool reduced = false; // reduced box set
519 int16_t left_rule_ = 0; // x-coord of nearest but not crossing rule line
520 int16_t right_rule_ = 0; // x-coord of nearest but not crossing rule line
521 int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
522 int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
523 int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
524 int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
525 int16_t baseline_y_; // Estimate of baseline position.
526 int32_t line_crossings_; // Number of line intersections touched.
527 BLOBNBOX* base_char_blob_; // The blob that was the base char.
528 tesseract::ColPartition* owner_; // Who will delete me when I am not needed
529 BLOBNBOX* neighbours_[BND_COUNT];
530 float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
531 float vert_stroke_width_ = 0.0f; // Median vertical stroke width
532 float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
533 bool good_stroke_neighbours_[BND_COUNT];
534 bool horz_possible_; // Could be part of horizontal flow.
535 bool vert_possible_; // Could be part of vertical flow.
536 bool leader_on_left_; // There is a leader to the left.
537 bool leader_on_right_; // There is a leader to the right.
538 // Iff true, then the destructor should delete the cblob_ptr.
539 // TODO(rays) migrate all uses to correctly setting this flag instead of
540 // deleting the C_BLOB before deleting the BLOBNBOX.
541 bool owns_cblob_ = false;
542};
543
544class TO_ROW: public ELIST2_LINK
545{
546 public:
547 static const int kErrorWeight = 3;
548
550 clear();
551 } //empty
552 TO_ROW( //constructor
553 BLOBNBOX *blob, //from first blob
554 float top, //of row //target height
555 float bottom,
556 float row_size);
557
558 void print() const;
559 float max_y() const { //access function
560 return y_max;
561 }
562 float min_y() const {
563 return y_min;
564 }
565 float mean_y() const {
566 return (y_min + y_max) / 2.0f;
567 }
568 float initial_min_y() const {
569 return initial_y_min;
570 }
571 float line_m() const { //access to line fit
572 return m;
573 }
574 float line_c() const {
575 return c;
576 }
577 float line_error() const {
578 return error;
579 }
580 float parallel_c() const {
581 return para_c;
582 }
583 float parallel_error() const {
584 return para_error;
585 }
586 float believability() const { //baseline goodness
587 return credibility;
588 }
589 float intercept() const { //real parallel_c
590 return y_origin;
591 }
592 void add_blob( //put in row
593 BLOBNBOX *blob, //blob to add
594 float top, //of row //target height
595 float bottom,
596 float row_size);
597 void insert_blob( //put in row in order
598 BLOBNBOX *blob);
599
600 BLOBNBOX_LIST *blob_list() { //get list
601 return &blobs;
602 }
603
604 void set_line( //set line spec
605 float new_m, //line to set
606 float new_c,
607 float new_error) {
608 m = new_m;
609 c = new_c;
610 error = new_error;
611 }
612 void set_parallel_line( //set fixed gradient line
613 float gradient, //page gradient
614 float new_c,
615 float new_error) {
616 para_c = new_c;
617 para_error = new_error;
618 credibility = blobs.length() - kErrorWeight * new_error;
619 y_origin = new_c / std::sqrt(1 + gradient * gradient);
620 //real intercept
621 }
622 void set_limits( //set min,max
623 float new_min, //bottom and
624 float new_max) { //top of row
625 y_min = new_min;
626 y_max = new_max;
627 }
629 //get projection
630
631 bool rep_chars_marked() const {
632 return num_repeated_sets_ != -1;
633 }
635 num_repeated_sets_ = -1;
636 }
637 int num_repeated_sets() const {
638 return num_repeated_sets_;
639 }
640 void set_num_repeated_sets(int num_sets) {
641 num_repeated_sets_ = num_sets;
642 }
643
644 // true when dead
645 bool merged = false;
646 bool all_caps; // had no ascenders
647 bool used_dm_model; // in guessing pitch
648 int16_t projection_left; // start of projection
649 int16_t projection_right; // end of projection
650 PITCH_TYPE pitch_decision; // how strong is decision
651 float fixed_pitch; // pitch or 0
652 float fp_space; // sp if fixed pitch
653 float fp_nonsp; // nonsp if fixed pitch
654 float pr_space; // sp if prop
655 float pr_nonsp; // non sp if prop
656 float spacing; // to "next" row
657 float xheight; // of line
658 int xheight_evidence; // number of blobs of height xheight
659 float ascrise; // ascenders
660 float descdrop; // descenders
661 float body_size; // of CJK characters. Assumed to be
662 // xheight+ascrise for non-CJK text.
663 int32_t min_space; // min size for real space
664 int32_t max_nonspace; // max size of non-space
665 int32_t space_threshold; // space vs nonspace
666 float kern_size; // average non-space
667 float space_size; // average space
668 WERD_LIST rep_words; // repeated chars
669 ICOORDELT_LIST char_cells; // fixed pitch cells
670 QSPLINE baseline; // curved baseline
671 STATS projection; // vertical projection
672
673 private:
674 void clear(); // clear all values to reasonable defaults
675
676 BLOBNBOX_LIST blobs; //blobs in row
677 float y_min; //coords
678 float y_max;
679 float initial_y_min;
680 float m, c; //line spec
681 float error; //line error
682 float para_c; //constrained fit
683 float para_error;
684 float y_origin; //rotated para_c;
685 float credibility; //baseline believability
686 int num_repeated_sets_; // number of sets of repeated blobs
687 // set to -1 if we have not searched
688 // for repeated blobs in this row yet
689};
690
693{
694 public:
695 TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
696 clear();
697 } //empty
698 TO_BLOCK( //constructor
699 BLOCK *src_block); //real block
700 ~TO_BLOCK();
701
702 void clear(); // clear all scalar members.
703
704 TO_ROW_LIST *get_rows() { //access function
705 return &row_list;
706 }
707
708 // Rotate all the blobnbox lists and the underlying block. Then update the
709 // median size statistic from the blobs list.
710 void rotate(const FCOORD& rotation) {
711 BLOBNBOX_LIST* blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
712 &small_blobs, &large_blobs, nullptr};
713 for (BLOBNBOX_LIST** list = blobnbox_list; *list != nullptr; ++list) {
714 BLOBNBOX_IT it(*list);
715 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
716 it.data()->rotate(rotation);
717 }
718 }
719 // Rotate the block
720 ASSERT_HOST(block->pdblk.poly_block() != nullptr);
721 block->rotate(rotation);
722 // Update the median size statistic from the blobs list.
723 STATS widths(0, block->pdblk.bounding_box().width());
724 STATS heights(0, block->pdblk.bounding_box().height());
725 BLOBNBOX_IT blob_it(&blobs);
726 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
727 widths.add(blob_it.data()->bounding_box().width(), 1);
728 heights.add(blob_it.data()->bounding_box().height(), 1);
729 }
730 block->set_median_size(static_cast<int>(widths.median() + 0.5),
731 static_cast<int>(heights.median() + 0.5));
732 }
733
734 void print_rows() { //debug info
735 TO_ROW_IT row_it = &row_list;
736 TO_ROW *row;
737
738 for (row_it.mark_cycle_pt(); !row_it.cycled_list();
739 row_it.forward()) {
740 row = row_it.data();
741 tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
742 row->min_y(), row->max_y(), row->parallel_c(),
743 row->blob_list()->length());
744 }
745 }
746
747 // Reorganizes the blob lists with a different definition of small, medium
748 // and large, compared to the original definition.
749 // Height is still the primary filter key, but medium width blobs of small
750 // height become medium, and very wide blobs of small height stay small.
751 void ReSetAndReFilterBlobs();
752
753 // Deletes noise blobs from all lists where not owned by a ColPartition.
754 void DeleteUnownedNoise();
755
756 // Computes and stores the edge offsets on each blob for use in feature
757 // extraction, using greyscale if the supplied grey and thresholds pixes
758 // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
759 // edge step outlines.
760 // Thresholds must either be the same size as grey or an integer down-scale
761 // of grey.
762 // See coutln.h for an explanation of edge offsets.
763 void ComputeEdgeOffsets(Pix* thresholds, Pix* grey);
764
765#ifndef GRAPHICS_DISABLED
766 // Draw the noise blobs from all lists in red.
767 void plot_noise_blobs(ScrollView* to_win);
768 // Draw the blobs on the various lists in the block in different colors.
769 void plot_graded_blobs(ScrollView* to_win);
770#endif
771
772 BLOBNBOX_LIST blobs; //medium size
773 BLOBNBOX_LIST underlines; //underline blobs
774 BLOBNBOX_LIST noise_blobs; //very small
775 BLOBNBOX_LIST small_blobs; //fairly small
776 BLOBNBOX_LIST large_blobs; //big blobs
777 BLOCK *block; //real block
778 PITCH_TYPE pitch_decision; //how strong is decision
779 float line_spacing; //estimate
780 // line_size is a lower-bound estimate of the font size in pixels of
781 // the text in the block (with ascenders and descenders), being a small
782 // (1.25) multiple of the median height of filtered blobs.
783 // In most cases the font size will be bigger, but it will be closer
784 // if the text is allcaps, or in a no-x-height script.
785 float line_size; //estimate
786 float max_blob_size; //line assignment limit
787 float baseline_offset; //phase shift
788 float xheight; //median blob size
789 float fixed_pitch; //pitch or 0
790 float kern_size; //average non-space
791 float space_size; //average space
792 int32_t min_space; //min definite space
793 int32_t max_nonspace; //max definite
794 float fp_space; //sp if fixed pitch
795 float fp_nonsp; //nonsp if fixed pitch
796 float pr_space; //sp if prop
797 float pr_nonsp; //non sp if prop
798 TO_ROW *key_row; //starting row
799
800 private:
801 TO_ROW_LIST row_list; //temporary rows
802};
803
806"Weighting for error in believability");
807void find_cblob_limits( //get y limits
808 C_BLOB *blob, //blob to search
809 float leftx, //x limits
810 float rightx,
811 FCOORD rotation, //for landscape
812 float &ymin, //output y limits
813 float &ymax);
814void find_cblob_vlimits( //get y limits
815 C_BLOB *blob, //blob to search
816 float leftx, //x limits
817 float rightx,
818 float &ymin, //output y limits
819 float &ymax);
820void find_cblob_hlimits( //get x limits
821 C_BLOB *blob, //blob to search
822 float bottomy, //y limits
823 float topy,
824 float &xmin, //output x limits
825 float &xymax);
826C_BLOB *crotate_cblob( //rotate it
827 C_BLOB *blob, //blob to search
828 FCOORD rotation //for landscape
829 );
830TBOX box_next( //get bounding box
831 BLOBNBOX_IT *it //iterator to blobs
832 );
833TBOX box_next_pre_chopped( //get bounding box
834 BLOBNBOX_IT *it //iterator to blobs
835 );
836void vertical_cblob_projection( //project outlines
837 C_BLOB *blob, //blob to project
838 STATS *stats //output
839 );
840void vertical_coutline_projection( //project outlines
841 C_OUTLINE *outline, //outline to project
842 STATS *stats //output
843 );
844#ifndef GRAPHICS_DISABLED
845void plot_blob_list(ScrollView* win, // window to draw in
846 BLOBNBOX_LIST *list, // blob list
847 ScrollView::Color body_colour, // colour to draw
848 ScrollView::Color child_colour); // colour of child
849#endif // GRAPHICS_DISABLED
850#endif
void vertical_cblob_projection(C_BLOB *blob, STATS *stats)
Definition: blobbox.cpp:868
void find_cblob_limits(C_BLOB *blob, float leftx, float rightx, FCOORD rotation, float &ymin, float &ymax)
Definition: blobbox.cpp:499
void plot_blob_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:1086
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:106
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:636
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:665
void vertical_coutline_projection(C_OUTLINE *outline, STATS *stats)
Definition: blobbox.cpp:888
BlobNeighbourDir
Definition: blobbox.h:87
@ BND_COUNT
Definition: blobbox.h:92
@ BND_ABOVE
Definition: blobbox.h:91
@ BND_LEFT
Definition: blobbox.h:88
@ BND_BELOW
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:90
PITCH_TYPE
Definition: blobbox.h:45
@ PITCH_DEF_FIXED
Definition: blobbox.h:47
@ PITCH_DUNNO
Definition: blobbox.h:46
@ PITCH_MAYBE_FIXED
Definition: blobbox.h:48
@ PITCH_CORR_FIXED
Definition: blobbox.h:51
@ PITCH_DEF_PROP
Definition: blobbox.h:49
@ PITCH_CORR_PROP
Definition: blobbox.h:52
@ PITCH_MAYBE_PROP
Definition: blobbox.h:50
void find_cblob_vlimits(C_BLOB *blob, float leftx, float rightx, float &ymin, float &ymax)
Definition: blobbox.cpp:539
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xymax)
Definition: blobbox.cpp:576
TabType
Definition: blobbox.h:59
@ TT_MAYBE_RAGGED
Definition: blobbox.h:62
@ TT_VLINE
Definition: blobbox.h:65
@ TT_DELETED
Definition: blobbox.h:61
@ TT_CONFIRMED
Definition: blobbox.h:64
@ TT_MAYBE_ALIGNED
Definition: blobbox.h:63
@ TT_NONE
Definition: blobbox.h:60
BlobSpecialTextType
Definition: blobbox.h:96
@ BSTT_NONE
Definition: blobbox.h:97
@ BSTT_MATH
Definition: blobbox.h:100
@ BSTT_UNCLEAR
Definition: blobbox.h:101
@ BSTT_SKIP
Definition: blobbox.h:102
@ BSTT_ITALIC
Definition: blobbox.h:98
@ BSTT_DIGIT
Definition: blobbox.h:99
@ BSTT_COUNT
Definition: blobbox.h:103
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:129
double textord_error_weight
BlobTextFlowType
Definition: blobbox.h:114
@ BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:120
@ BTFT_COUNT
Definition: blobbox.h:122
@ BTFT_LEADER
Definition: blobbox.h:121
@ BTFT_NONE
Definition: blobbox.h:115
@ BTFT_CHAIN
Definition: blobbox.h:118
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:119
@ BTFT_NEIGHBOURS
Definition: blobbox.h:117
@ BTFT_NONTEXT
Definition: blobbox.h:116
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:611
BlobRegionType
Definition: blobbox.h:72
@ BRT_RECTIMAGE
Definition: blobbox.h:76
@ BRT_COUNT
Definition: blobbox.h:82
@ BRT_POLYIMAGE
Definition: blobbox.h:77
@ BRT_TEXT
Definition: blobbox.h:80
@ BRT_HLINE
Definition: blobbox.h:74
@ BRT_VLINE
Definition: blobbox.h:75
@ BRT_UNKNOWN
Definition: blobbox.h:78
@ BRT_NOISE
Definition: blobbox.h:73
@ BRT_VERT_TEXT
Definition: blobbox.h:79
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:918
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:927
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define double_VAR_H(name, val, comment)
Definition: params.h:301
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
ScrollView * to_win
Definition: drawtord.cpp:35
static bool IsTextType(BlobRegionType type)
Definition: blobbox.h:418
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:372
void set_left_tab_type(TabType new_type)
Definition: blobbox.h:274
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:200
void set_leader_on_right(bool flag)
Definition: blobbox.h:367
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:373
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:402
BLOBNBOX(C_BLOB *srcblob)
Definition: blobbox.h:149
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:120
bool leader_on_right() const
Definition: blobbox.h:364
static bool IsImageType(BlobRegionType type)
Definition: blobbox.h:422
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:252
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:292
static BLOBNBOX * RealBlob(C_OUTLINE *outline)
Definition: blobbox.h:158
int NoisyNeighbours() const
Definition: blobbox.cpp:237
void set_right_crossing_rule(int new_right)
Definition: blobbox.h:334
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:292
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:305
void plot(ScrollView *window, ScrollView::Color blob_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:485
float vert_stroke_width() const
Definition: blobbox.h:343
void set_right_tab_type(TabType new_type)
Definition: blobbox.h:280
bool leader_on_left() const
Definition: blobbox.h:358
float horz_stroke_width() const
Definition: blobbox.h:337
static void ComputeEdgeOffsets(Pix *thresholds, Pix *grey, BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:385
void compute_bounding_box()
Definition: blobbox.h:240
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:103
void set_owns_cblob(bool value)
Definition: blobbox.h:408
void set_left_rule(int new_left)
Definition: blobbox.h:316
BlobRegionType region_type() const
Definition: blobbox.h:283
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:71
int right_crossing_rule() const
Definition: blobbox.h:331
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:419
~BLOBNBOX()
Definition: blobbox.h:155
void set_horz_possible(bool value)
Definition: blobbox.h:310
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:249
TabType right_tab_type() const
Definition: blobbox.h:277
void set_repeated_set(int set_id)
Definition: blobbox.h:265
void EstimateBaselinePosition()
Definition: blobbox.cpp:357
void set_vert_possible(bool value)
Definition: blobbox.h:304
bool vert_possible() const
Definition: blobbox.h:301
void set_line_crossings(int value)
Definition: blobbox.h:395
void ReInit()
Definition: blobbox.h:470
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:333
BLOBNBOX()
Definition: blobbox.h:146
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:370
float area_stroke_width() const
Definition: blobbox.h:349
TabType left_tab_type() const
Definition: blobbox.h:271
int right_rule() const
Definition: blobbox.h:319
int line_crossings() const
Definition: blobbox.h:392
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:432
int left_rule() const
Definition: blobbox.h:313
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:481
int repeated_set() const
Definition: blobbox.h:262
bool DeletableNoise() const
Definition: blobbox.h:203
void set_leader_on_left(bool flag)
Definition: blobbox.h:361
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:405
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:444
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:298
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:289
bool IsDiacritic() const
Definition: blobbox.h:380
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:376
static bool IsLineType(BlobRegionType type)
Definition: blobbox.h:426
void set_bounding_box(const TBOX &new_box)
Definition: blobbox.h:235
void rotate(FCOORD rotation)
Definition: blobbox.cpp:55
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:286
int base_char_bottom() const
Definition: blobbox.h:386
int base_char_top() const
Definition: blobbox.h:383
const TBOX & bounding_box() const
Definition: blobbox.h:230
C_BLOB * cblob() const
Definition: blobbox.h:268
tesseract::ColPartition * owner() const
Definition: blobbox.h:352
int left_crossing_rule() const
Definition: blobbox.h:325
int GoodTextBlob() const
Definition: blobbox.cpp:226
void set_horz_stroke_width(float width)
Definition: blobbox.h:340
bool joined_to_prev() const
Definition: blobbox.h:256
bool red_box_set() const
Definition: blobbox.h:259
int32_t enclosed_area() const
Definition: blobbox.h:253
BlobTextFlowType flow() const
Definition: blobbox.h:295
void reflect_box_in_y_axis()
Definition: blobbox.cpp:62
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:181
void ClearNeighbours()
Definition: blobbox.h:499
bool UniquelyHorizontal() const
Definition: blobbox.h:413
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:355
void translate_box(ICOORD v)
Definition: blobbox.h:174
bool horz_possible() const
Definition: blobbox.h:307
void CleanNeighbours()
Definition: blobbox.cpp:214
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:92
void set_right_rule(int new_right)
Definition: blobbox.h:322
void set_vert_stroke_width(float width)
Definition: blobbox.h:346
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:430
void set_left_crossing_rule(int new_left)
Definition: blobbox.h:328
const TBOX & reduced_box() const
Definition: blobbox.h:246
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:398
int baseline_position() const
Definition: blobbox.h:389
bool UniquelyVertical() const
Definition: blobbox.h:410
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:733
float fixed_pitch
Definition: blobbox.h:651
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:604
float pr_space
Definition: blobbox.h:654
float body_size
Definition: blobbox.h:661
bool all_caps
Definition: blobbox.h:646
int16_t projection_left
Definition: blobbox.h:648
float pr_nonsp
Definition: blobbox.h:655
void insert_blob(BLOBNBOX *blob)
Definition: blobbox.cpp:769
void print() const
Definition: blobbox.cpp:717
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:640
STATS projection
Definition: blobbox.h:671
void clear_rep_chars_marked()
Definition: blobbox.h:634
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:612
float intercept() const
Definition: blobbox.h:589
int16_t projection_right
Definition: blobbox.h:649
float line_c() const
Definition: blobbox.h:574
float spacing
Definition: blobbox.h:656
float max_y() const
Definition: blobbox.h:559
void set_limits(float new_min, float new_max)
Definition: blobbox.h:622
float parallel_error() const
Definition: blobbox.h:583
bool used_dm_model
Definition: blobbox.h:647
QSPLINE baseline
Definition: blobbox.h:670
float initial_min_y() const
Definition: blobbox.h:568
bool merged
Definition: blobbox.h:645
int xheight_evidence
Definition: blobbox.h:658
void compute_vertical_projection()
Definition: blobbox.cpp:796
int32_t space_threshold
Definition: blobbox.h:665
float xheight
Definition: blobbox.h:657
PITCH_TYPE pitch_decision
Definition: blobbox.h:650
int32_t max_nonspace
Definition: blobbox.h:664
bool rep_chars_marked() const
Definition: blobbox.h:631
float fp_nonsp
Definition: blobbox.h:653
ICOORDELT_LIST char_cells
Definition: blobbox.h:669
WERD_LIST rep_words
Definition: blobbox.h:668
float parallel_c() const
Definition: blobbox.h:580
TO_ROW()
Definition: blobbox.h:549
int num_repeated_sets() const
Definition: blobbox.h:637
float min_y() const
Definition: blobbox.h:562
float kern_size
Definition: blobbox.h:666
float mean_y() const
Definition: blobbox.h:565
float fp_space
Definition: blobbox.h:652
static const int kErrorWeight
Definition: blobbox.h:547
float believability() const
Definition: blobbox.h:586
float descdrop
Definition: blobbox.h:660
float ascrise
Definition: blobbox.h:659
int32_t min_space
Definition: blobbox.h:663
float line_m() const
Definition: blobbox.h:571
float space_size
Definition: blobbox.h:667
float line_error() const
Definition: blobbox.h:577
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:600
PITCH_TYPE pitch_decision
Definition: blobbox.h:778
BLOCK * block
Definition: blobbox.h:777
float pr_nonsp
Definition: blobbox.h:797
void print_rows()
Definition: blobbox.h:734
int32_t max_nonspace
Definition: blobbox.h:793
float xheight
Definition: blobbox.h:788
BLOBNBOX_LIST blobs
Definition: blobbox.h:772
float fp_nonsp
Definition: blobbox.h:795
TO_ROW * key_row
Definition: blobbox.h:798
TO_ROW_LIST * get_rows()
Definition: blobbox.h:704
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:774
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:776
float line_size
Definition: blobbox.h:785
float space_size
Definition: blobbox.h:791
float baseline_offset
Definition: blobbox.h:787
int32_t min_space
Definition: blobbox.h:792
float max_blob_size
Definition: blobbox.h:786
float kern_size
Definition: blobbox.h:790
BLOBNBOX_LIST underlines
Definition: blobbox.h:773
void rotate(const FCOORD &rotation)
Definition: blobbox.h:710
float line_spacing
Definition: blobbox.h:779
float pr_space
Definition: blobbox.h:796
float fixed_pitch
Definition: blobbox.h:789
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:775
float fp_space
Definition: blobbox.h:794
TO_BLOCK()
Definition: blobbox.h:695
Definition: ocrblock.h:31
integer coordinate
Definition: points.h:32
int16_t y() const
access_function
Definition: points.h:56
Definition: points.h:189
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65
Definition: statistc.h:31
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
double median() const
Definition: statistc.cpp:231
int32_t area()
Definition: stepblob.cpp:273
TBOX bounding_box() const
Definition: stepblob.cpp:253
int32_t perimeter()
Definition: stepblob.cpp:292