tesseract 4.1.1
Loading...
Searching...
No Matches
strokewidth.cpp
Go to the documentation of this file.
1
2// File: strokewidth.cpp
3// Description: Subclass of BBGrid to find uniformity of strokewidth.
4// Author: Ray Smith
5//
6// (C) Copyright 2008, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20#include "config_auto.h"
21#endif
22
23#include "strokewidth.h"
24
25#include <algorithm>
26#include <cmath>
27
28#include "blobbox.h"
29#include "colpartition.h"
30#include "colpartitiongrid.h"
31#include "imagefind.h"
32#include "linlsq.h"
33#include "statistc.h"
34#include "tabfind.h"
35#include "textlineprojection.h"
36#include "tordmain.h" // For SetBlobStrokeWidth.
37
38namespace tesseract {
39
40static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
41static BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
42
// NOTE(review): the descriptive comments for the next two constants are absent
// from this listing. Judging by the "Same but for CJK" pair that follows, they
// are presumably the fractional and absolute stroke-width matching tolerances
// used for non-CJK text -- confirm against the original source.
44const double kStrokeWidthFractionTolerance = 0.125;
49const double kStrokeWidthTolerance = 1.5;
50// Same but for CJK we are a bit more generous.
51const double kStrokeWidthFractionCJK = 0.25;
52const double kStrokeWidthCJK = 2.0;
53// Radius in grid cells of search for broken CJK. Doesn't need to be very
54// large as the grid size should be about the size of a character anyway.
55const int kCJKRadius = 2;
56// Max distance fraction of size to join close but broken CJK characters.
57const double kCJKBrokenDistanceFraction = 0.25;
58// Max number of components in a broken CJK character.
59const int kCJKMaxComponents = 8;
60// Max aspect ratio of CJK broken characters when put back together.
61const double kCJKAspectRatio = 1.25;
62// Max increase in aspect ratio of CJK broken characters when merged.
63const double kCJKAspectRatioIncrease = 1.0625;
64// Max multiple of the grid size that will be used in computing median CJKsize.
65const int kMaxCJKSizeRatio = 5;
66// Min fraction of blobs broken CJK to iterate and run it again.
67const double kBrokenCJKIterationFraction = 0.125;
68// Multiple of gridsize as x-padding for a search box for diacritic base
69// characters.
70const double kDiacriticXPadRatio = 7.0;
71// Multiple of gridsize as y-padding for a search box for diacritic base
72// characters.
73const double kDiacriticYPadRatio = 1.75;
74// Min multiple of diacritic height that a neighbour must be to be a
75// convincing base character.
76const double kMinDiacriticSizeRatio = 1.0625;
77// Max multiple of a textline's median height as a threshold for the sum of
78// a diacritic's farthest x and y distances (gap + size).
79const double kMaxDiacriticDistanceRatio = 1.25;
80// Max x-gap between a diacritic and its base char as a fraction of the height
81// of the base char (allowing other blobs to fill the gap.)
83// Ratio between longest side of a line and longest side of a character.
84// (neighbor_min > blob_min * kLineTrapShortest &&
85// neighbor_max < blob_max / kLineTrapLongest)
86// => neighbor is a grapheme and blob is a line.
87const int kLineTrapLongest = 4;
88// Ratio between shortest side of a line and shortest side of a character.
89const int kLineTrapShortest = 2;
90// Max aspect ratio of the total box before CountNeighbourGaps
91// decides immediately based on the aspect ratio.
92const int kMostlyOneDirRatio = 3;
93// Aspect ratio for a blob to be considered as line residue.
94const double kLineResidueAspectRatio = 8.0;
95// Padding ratio for line residue search box.
97// Min multiple of neighbour size for a line residue to be genuine.
98const double kLineResidueSizeRatio = 1.75;
99// Aspect ratio filter for OSD.
100const float kSizeRatioToReject = 2.0;
101// Expansion factor for search box for good neighbours.
102const double kNeighbourSearchFactor = 2.5;
103// Factor of increase of overlap when adding diacritics to make an image noisy.
104const double kNoiseOverlapGrowthFactor = 4.0;
105// Fraction of the image size to add overlap when adding diacritics for an
106// image to qualify as noisy.
107const double kNoiseOverlapAreaFactor = 1.0 / 512;
108
110 const ICOORD& bleft, const ICOORD& tright)
111 : BlobGrid(gridsize, bleft, tright), nontext_map_(nullptr), projection_(nullptr),
112 denorm_(nullptr), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
113 leaders_win_ = nullptr;
114 widths_win_ = nullptr;
115 initial_widths_win_ = nullptr;
116 chains_win_ = nullptr;
117 diacritics_win_ = nullptr;
118 textlines_win_ = nullptr;
119 smoothed_win_ = nullptr;
120}
121
123 if (widths_win_ != nullptr) {
124 #ifndef GRAPHICS_DISABLED
125 delete widths_win_->AwaitEvent(SVET_DESTROY);
126 #endif // GRAPHICS_DISABLED
127 if (textord_tabfind_only_strokewidths)
128 exit(0);
129 delete widths_win_;
130 }
131 delete leaders_win_;
132 delete initial_widths_win_;
133 delete chains_win_;
134 delete textlines_win_;
135 delete smoothed_win_;
136 delete diacritics_win_;
137}
138
139// Sets the neighbours member of the medium-sized blobs in the block.
140// Searches on 4 sides of each blob for similar-sized, similar-strokewidth
141// blobs and sets pointers to the good neighbours.
143 // Run a preliminary strokewidth neighbour detection on the medium blobs.
144 InsertBlobList(&block->blobs);
145 BLOBNBOX_IT blob_it(&block->blobs);
146 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
147 SetNeighbours(false, false, blob_it.data());
148 }
149 Clear();
150}
151
152// Sets the neighbour/textline writing direction members of the medium
153// and large blobs with optional repair of broken CJK characters first.
154// Repair of broken CJK is needed here because broken CJK characters
155// can fool the textline direction detection algorithm.
157 bool cjk_merge,
158 TO_BLOCK* input_block) {
159 // Setup the grid with the remaining (non-noise) blobs.
160 InsertBlobs(input_block);
161 // Repair broken CJK characters if needed.
162 while (cjk_merge && FixBrokenCJK(input_block));
163 // Grade blobs by inspection of neighbours.
164 FindTextlineFlowDirection(pageseg_mode, false);
165 // Clear the grid ready for rotation or leader finding.
166 Clear();
167}
168
169// Helper to collect and count horizontal and vertical blobs from a list.
170static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
171 int* num_vertical_blobs,
172 int* num_horizontal_blobs,
173 BLOBNBOX_CLIST* vertical_blobs,
174 BLOBNBOX_CLIST* horizontal_blobs,
175 BLOBNBOX_CLIST* nondescript_blobs) {
176 BLOBNBOX_C_IT v_it(vertical_blobs);
177 BLOBNBOX_C_IT h_it(horizontal_blobs);
178 BLOBNBOX_C_IT n_it(nondescript_blobs);
179 BLOBNBOX_IT blob_it(input_blobs);
180 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
181 BLOBNBOX* blob = blob_it.data();
182 const TBOX& box = blob->bounding_box();
183 float y_x = static_cast<float>(box.height()) / box.width();
184 float x_y = 1.0f / y_x;
185 // Select a >= 1.0 ratio
186 float ratio = x_y > y_x ? x_y : y_x;
187 // If the aspect ratio is small and we want them for osd, save the blob.
188 bool ok_blob = ratio <= kSizeRatioToReject;
189 if (blob->UniquelyVertical()) {
190 ++*num_vertical_blobs;
191 if (ok_blob) v_it.add_after_then_move(blob);
192 } else if (blob->UniquelyHorizontal()) {
193 ++*num_horizontal_blobs;
194 if (ok_blob) h_it.add_after_then_move(blob);
195 } else if (ok_blob) {
196 n_it.add_after_then_move(blob);
197 }
198 }
199}
200
201
202// Types all the blobs as vertical or horizontal text or unknown and
203// returns true if the majority are vertical.
204// If the blobs are rotated, it is necessary to call CorrectForRotation
205// after rotating everything, otherwise the work done here will be enough.
206// If osd_blobs is not null, a list of blobs from the dominant textline
207// direction are returned for use in orientation and script detection.
// find_vertical_text_ratio is the fraction of the direction-typed blobs
// (vertical + horizontal counts) that must be vertical for the result to be
// true. block supplies the blobs and large_blobs lists that are examined.
// osd_blobs may be nullptr; when it is not, it receives the blobs collected
// for the winning direction (or the nondescript ones if no blob had a
// definite direction).
208bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
209 TO_BLOCK* block,
210 BLOBNBOX_CLIST* osd_blobs) {
211 int vertical_boxes = 0;
212 int horizontal_boxes = 0;
213 // Count vertical normal and large blobs.
214 BLOBNBOX_CLIST vertical_blobs;
215 BLOBNBOX_CLIST horizontal_blobs;
216 BLOBNBOX_CLIST nondescript_blobs;
 // Both lists are tallied into the same counters and the same three lists.
217 CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
218 &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
219 CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
220 &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
 // NOTE(review): this listing jumps from original line 220 to 222. If the
 // missing line held a debug guard, the print below is conditional rather
 // than unconditional -- confirm against the real file.
222 tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
223 horizontal_boxes, vertical_boxes,
224 horizontal_blobs.length(), vertical_blobs.length(),
225 nondescript_blobs.length());
226 if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
227 // Only nondescript blobs available, so return those.
228 BLOBNBOX_C_IT osd_it(osd_blobs);
229 osd_it.add_list_after(&nondescript_blobs);
230 return false;
231 }
 // The cast truncates, so the required vertical count is rounded down.
232 int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
233 find_vertical_text_ratio);
 // Note: if both counts are zero and osd_blobs is nullptr, the early return
 // above is skipped and this test is 0 >= 0, so the function returns true.
234 if (vertical_boxes >= min_vert_boxes) {
235 if (osd_blobs != nullptr) {
236 BLOBNBOX_C_IT osd_it(osd_blobs);
237 osd_it.add_list_after(&vertical_blobs);
238 }
239 return true;
240 } else {
241 if (osd_blobs != nullptr) {
242 BLOBNBOX_C_IT osd_it(osd_blobs);
243 osd_it.add_list_after(&horizontal_blobs);
244 }
245 return false;
246 }
247}
248
249// Corrects the data structures for the given rotation.
251 ColPartitionGrid* part_grid) {
252 Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
253 grid_box_ = TBOX(bleft(), tright());
254 rerotation_.set_x(rotation.x());
255 rerotation_.set_y(-rotation.y());
256}
257
258// Finds leader partitions and inserts them into the given part_grid.
260 ColPartitionGrid* part_grid) {
261 Clear();
262 // Find and isolate leaders in the noise list.
263 ColPartition_LIST leader_parts;
264 FindLeadersAndMarkNoise(block, &leader_parts);
265 // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
266 InsertBlobList(&block->blobs);
267 // Mark blobs that have leader neighbours.
268 for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
269 ColPartition* part = it.extract();
270 part->ClaimBoxes();
271 MarkLeaderNeighbours(part, LR_LEFT);
272 MarkLeaderNeighbours(part, LR_RIGHT);
273 part_grid->InsertBBox(true, true, part);
274 }
275}
276
277// Finds and marks noise those blobs that look like bits of vertical lines
278// that would otherwise screw up layout analysis.
279void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
280 BlobGridSearch gsearch(this);
281 BLOBNBOX* bbox;
282 // For every vertical line-like bbox in the grid, search its neighbours
283 // to find the tallest, and if the original box is taller by sufficient
284 // margin, then call it line residue and delete it.
285 gsearch.StartFullSearch();
286 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
287 TBOX box = bbox->bounding_box();
288 if (box.height() < box.width() * kLineResidueAspectRatio)
289 continue;
290 // Set up a rectangle search around the blob to find the size of its
291 // neighbours.
292 int padding = box.height() * kLineResiduePadRatio;
293 TBOX search_box = box;
294 search_box.pad(padding, padding);
295 bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
296 box.bottom());
297 // Find the largest object in the search box not equal to bbox.
298 BlobGridSearch rsearch(this);
299 int max_height = 0;
300 BLOBNBOX* n;
301 rsearch.StartRectSearch(search_box);
302 while ((n = rsearch.NextRectSearch()) != nullptr) {
303 if (n == bbox) continue;
304 TBOX nbox = n->bounding_box();
305 if (nbox.height() > max_height) {
306 max_height = nbox.height();
307 }
308 }
309 if (debug) {
310 tprintf("Max neighbour size=%d for candidate line box at:", max_height);
311 box.print();
312 }
313 if (max_height * kLineResidueSizeRatio < box.height()) {
314 #ifndef GRAPHICS_DISABLED
315 if (leaders_win_ != nullptr) {
316 // We are debugging, so display deleted in pink blobs in the same
317 // window that we use to display leader detection.
318 leaders_win_->Pen(ScrollView::PINK);
319 leaders_win_->Rectangle(box.left(), box.bottom(),
320 box.right(), box.top());
321 }
322 #endif // GRAPHICS_DISABLED
323 ColPartition::MakeBigPartition(bbox, big_part_list);
324 }
325 }
326}
327
328// Types all the blobs as vertical text or horizontal text or unknown and
329// puts them into initial ColPartitions in the supplied part_grid.
330// rerotation determines how to get back to the image coordinates from the
331// blob coordinates (since they may have been rotated for vertical text).
332// block is the single block for the whole page or rectangle to be OCRed.
333// nontext_pix (full-size), is a binary mask used to prevent merges across
334// photo/text boundaries. It is not kept beyond this function.
335// denorm provides a mapping back to the image from the current blob
336// coordinate space.
337// projection provides a measure of textline density over the image and
338// provides functions to assist with diacritic detection. It should be a
339// pointer to a new TextlineProjection, and will be setup here.
340// part_grid is the output grid of textline partitions.
341// Large blobs that cause overlap are put in separate partitions and added
342// to the big_parts list.
344 PageSegMode pageseg_mode, const FCOORD& rerotation, TO_BLOCK* block,
345 Pix* nontext_pix, const DENORM* denorm, bool cjk_script,
346 TextlineProjection* projection, BLOBNBOX_LIST* diacritic_blobs,
347 ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) {
348 nontext_map_ = nontext_pix;
349 projection_ = projection;
350 denorm_ = denorm;
351 // Clear and re Insert to take advantage of the tab stops in the blobs.
352 Clear();
353 // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
354 InsertBlobs(block);
355
356 // Run FixBrokenCJK() again if the page is CJK.
357 if (cjk_script) {
358 FixBrokenCJK(block);
359 }
360 FindTextlineFlowDirection(pageseg_mode, false);
361 projection_->ConstructProjection(block, rerotation, nontext_map_);
362 if (textord_tabfind_show_strokewidths) {
363 ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
364 projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
365 projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
366 }
367 projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
368 projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
369 // Clear and re Insert to take advantage of the removed diacritics.
370 Clear();
371 InsertBlobs(block);
372 FCOORD skew;
373 FindTextlineFlowDirection(pageseg_mode, true);
375 FindInitialPartitions(pageseg_mode, rerotation, true, block,
376 diacritic_blobs, part_grid, big_parts, &skew);
377 if (r == PFR_NOISE) {
378 tprintf("Detected %d diacritics\n", diacritic_blobs->length());
379 // Noise was found, and removed.
380 Clear();
381 InsertBlobs(block);
382 FindTextlineFlowDirection(pageseg_mode, true);
383 r = FindInitialPartitions(pageseg_mode, rerotation, false, block,
384 diacritic_blobs, part_grid, big_parts, &skew);
385 }
386 nontext_map_ = nullptr;
387 projection_ = nullptr;
388 denorm_ = nullptr;
389}
390
391static void PrintBoxWidths(BLOBNBOX* neighbour) {
392 const TBOX& nbox = neighbour->bounding_box();
393 tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
394 nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
395 neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
396 2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
397}
398
// Debug click handler. Searches outwards from (x, y) for the first blob that
// has a non-null cblob() and whose bounding box contains the clicked point,
// then prints that blob's stroke widths, the stroke widths of whichever of
// its four neighbours exist, its neighbour gaps and its direction flags.
// Nothing is printed if no such blob is found.
400void StrokeWidth::HandleClick(int x, int y) {
 // NOTE(review): original line 401 is absent from this listing, so any
 // statement that runs before the search is not visible here.
402 // Run a radial search for blobs that overlap.
403 BlobGridSearch radsearch(this);
404 radsearch.StartRadSearch(x, y, 1);
405 BLOBNBOX* neighbour;
406 FCOORD click(static_cast<float>(x), static_cast<float>(y));
407 while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
408 TBOX nbox = neighbour->bounding_box();
409 if (nbox.contains(click) && neighbour->cblob() != nullptr) {
410 PrintBoxWidths(neighbour);
 // NOTE(review): only the clicked blob's cblob() is null-checked above;
 // PrintBoxWidths dereferences cblob(), and the neighbours are passed to it
 // without that check.
411 if (neighbour->neighbour(BND_LEFT) != nullptr)
412 PrintBoxWidths(neighbour->neighbour(BND_LEFT));
413 if (neighbour->neighbour(BND_RIGHT) != nullptr)
414 PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
415 if (neighbour->neighbour(BND_ABOVE) != nullptr)
416 PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
417 if (neighbour->neighbour(BND_BELOW) != nullptr)
418 PrintBoxWidths(neighbour->neighbour(BND_BELOW));
419 int gaps[BND_COUNT];
420 neighbour->NeighbourGaps(gaps);
 // NOTE(review): the format string has ten %d conversions but only seven
 // arguments are visible, because original lines 427-429 are absent from
 // this listing. Presumably they supply the first three "Good=" values --
 // confirm against the real file.
421 tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
422 "Good= %d %d %d %d\n",
423 gaps[BND_LEFT], gaps[BND_RIGHT],
424 gaps[BND_ABOVE], gaps[BND_BELOW],
425 neighbour->horz_possible(),
426 neighbour->vert_possible(),
430 neighbour->good_stroke_neighbour(BND_BELOW));
 // Report on the first matching blob only.
431 break;
432 }
433 }
434}
435
436// Detects and marks leader dots/dashes.
437// Leaders are horizontal chains of small or noise blobs that look
438// monospace according to ColPartition::MarkAsLeaderIfMonospaced().
439// Detected leaders become the only occupants of the block->small_blobs list.
440// Non-leader small blobs get moved to the blobs list.
441// Non-leader noise blobs remain singletons in the noise list.
442// All small and noise blobs in high density regions are marked BTFT_NONTEXT.
443// block is the single block for the whole page or rectangle to be OCRed.
444// leader_parts is the output.
445void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
446 ColPartition_LIST* leader_parts) {
 // NOTE(review): original lines 447-448 are absent from this listing. The
 // searches below run over whatever is in the grid at this point, so those
 // lines presumably insert the block's small and noise blobs -- confirm
 // against the real file.
449 BlobGridSearch gsearch(this);
450 BLOBNBOX* bbox;
451 // For every bbox in the grid, set its neighbours.
452 gsearch.StartFullSearch();
453 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
 // First argument true selects the leader variant of the neighbour rule.
454 SetNeighbours(true, false, bbox);
455 }
 // Second pass: build candidate leader chains from blobs that still have no
 // flow type, and append the accepted ones to leader_parts.
456 ColPartition_IT part_it(leader_parts);
457 gsearch.StartFullSearch();
458 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
459 if (bbox->flow() == BTFT_NONE) {
 // A blob with neither a left nor a right neighbour cannot form a chain.
460 if (bbox->neighbour(BND_RIGHT) == nullptr &&
461 bbox->neighbour(BND_LEFT) == nullptr)
462 continue;
463 // Put all the linked blobs into a ColPartition.
464 ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
465 BLOBNBOX* blob;
 // Walk rightwards from bbox itself, then leftwards from its left
 // neighbour; each walk stops at the end of the chain or at the first blob
 // whose flow() is not BTFT_NONE.
466 for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
467 blob = blob->neighbour(BND_RIGHT))
468 part->AddBox(blob);
469 for (blob = bbox->neighbour(BND_LEFT); blob != nullptr &&
470 blob->flow() == BTFT_NONE;
471 blob = blob->neighbour(BND_LEFT))
472 part->AddBox(blob);
 // Keep the partition only if it is accepted as a leader; otherwise free it.
473 if (part->MarkAsLeaderIfMonospaced())
474 part_it.add_after_then_move(part);
475 else
476 delete part;
477 }
478 }
479 if (textord_tabfind_show_strokewidths) {
480 leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
481 }
482 // Move any non-leaders from the small to the blobs list, as they are
483 // most likely dashes or broken characters.
484 BLOBNBOX_IT blob_it(&block->blobs);
485 BLOBNBOX_IT small_it(&block->small_blobs);
486 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
487 BLOBNBOX* blob = small_it.data();
488 if (blob->flow() != BTFT_LEADER) {
 // Only the flow reset is conditional; ClearNeighbours() and the move to
 // the blobs list happen for every non-leader small blob.
489 if (blob->flow() == BTFT_NEIGHBOURS)
490 blob->set_flow(BTFT_NONE);
491 blob->ClearNeighbours();
492 blob_it.add_to_end(small_it.extract());
493 }
494 }
495 // Move leaders from the noise list to the small list, leaving the small
496 // list exclusively leaders, so they don't get processed further,
497 // and the remaining small blobs all in the noise list.
498 BLOBNBOX_IT noise_it(&block->noise_blobs);
499 for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
500 BLOBNBOX* blob = noise_it.data();
 // Blobs flagged joined_to_prev() are moved along with the leaders.
501 if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
502 small_it.add_to_end(noise_it.extract());
503 } else if (blob->flow() == BTFT_NEIGHBOURS) {
 // Unlike the small-blob loop above, neighbours are cleared here only for
 // blobs whose flow was BTFT_NEIGHBOURS.
504 blob->set_flow(BTFT_NONE);
505 blob->ClearNeighbours();
506 }
507 }
508 // Clear the grid as we don't want the small stuff hanging around in it.
509 Clear();
510}
511
// Inserts the block's blobs list into this grid.
// NOTE(review): original line 516 is absent from this listing, so any further
// insertion performed there is not visible here -- confirm against the real
// file.
514void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
515 InsertBlobList(&block->blobs);
517}
518
519// Checks the left or right side of the given leader partition and sets the
520// (opposite) leader_on_right or leader_on_left flags for blobs
521// that are next to the given side of the given leader partition.
522void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
523 LeftOrRight side) {
524 const TBOX& part_box = part->bounding_box();
525 BlobGridSearch blobsearch(this);
526 // Search to the side of the leader for the nearest neighbour.
527 BLOBNBOX* best_blob = nullptr;
528 int best_gap = 0;
529 blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
530 : part_box.right(),
531 part_box.bottom(), part_box.top());
532 BLOBNBOX* blob;
533 while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
534 const TBOX& blob_box = blob->bounding_box();
535 if (!blob_box.y_overlap(part_box))
536 continue;
537 int x_gap = blob_box.x_gap(part_box);
538 if (x_gap > 2 * gridsize()) {
539 break;
540 } else if (best_blob == nullptr || x_gap < best_gap) {
541 best_blob = blob;
542 best_gap = x_gap;
543 }
544 }
545 if (best_blob != nullptr) {
546 if (side == LR_LEFT)
547 best_blob->set_leader_on_right(true);
548 else
549 best_blob->set_leader_on_left(true);
550 #ifndef GRAPHICS_DISABLED
551 if (leaders_win_ != nullptr) {
552 leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
553 const TBOX& blob_box = best_blob->bounding_box();
554 leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
555 blob_box.right(), blob_box.top());
556 }
557 #endif // GRAPHICS_DISABLED
558 }
559}
560
561// Helper to compute the UQ of the square-ish CJK characters.
562static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
563 STATS sizes(0, gridsize * kMaxCJKSizeRatio);
564 BLOBNBOX_IT it(blobs);
565 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
566 BLOBNBOX* blob = it.data();
567 int width = blob->bounding_box().width();
568 int height = blob->bounding_box().height();
569 if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
570 sizes.add(height, 1);
571 }
572 return static_cast<int>(sizes.ile(0.75f) + 0.5);
573}
574
575// Fix broken CJK characters, using the fake joined blobs mechanism.
576// Blobs are really merged, ie the master takes all the outlines and the
577// others are deleted.
578// Returns true if sufficient blobs are merged that it may be worth running
579// again, due to a better estimate of character size.
580bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
581 BLOBNBOX_LIST* blobs = &block->blobs;
 // Despite the name, this is the upper quartile of the heights of the
 // square-ish blobs, as computed by UpperQuartileCJKSize().
582 int median_height = UpperQuartileCJKSize(gridsize(), blobs);
 // Both limits below are derived from that size and truncated to int.
583 int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
584 int max_height = static_cast<int>(median_height * kCJKAspectRatio);
585 int num_fixed = 0;
586 BLOBNBOX_IT blob_it(blobs);
587
588 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
589 BLOBNBOX* blob = blob_it.data();
 // Skip blobs with no cblob or with no outlines left.
590 if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty())
591 continue;
592 TBOX bbox = blob->bounding_box();
593 bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
594 bbox.bottom());
595 if (debug) {
596 tprintf("Checking for Broken CJK (max size=%d):", max_height);
597 bbox.print();
598 }
599 // Generate a list of blobs that overlap or are near enough to merge.
 // bbox is passed by pointer and comes back as the union of blob's box and
 // the boxes of everything placed in overlapped_blobs.
600 BLOBNBOX_CLIST overlapped_blobs;
601 AccumulateOverlaps(blob, debug, max_height, max_dist,
602 &bbox, &overlapped_blobs);
603 if (!overlapped_blobs.empty()) {
604 // There are overlapping blobs, so qualify them as being satisfactory
605 // before removing them from the grid and replacing them with the union.
606 // The final box must be roughly square.
607 if (bbox.width() > bbox.height() * kCJKAspectRatio ||
608 bbox.height() > bbox.width() * kCJKAspectRatio) {
609 if (debug) {
610 tprintf("Bad final aspectratio:");
611 bbox.print();
612 }
613 continue;
614 }
615 // There can't be too many blobs to merge.
616 if (overlapped_blobs.length() >= kCJKMaxComponents) {
617 if (debug)
618 tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
619 continue;
620 }
621 // The strokewidths must match amongst the join candidates.
 // NOTE(review): original line 627, which completes the MatchingStrokeWidth
 // call below, is absent from this listing.
622 BLOBNBOX_C_IT n_it(&overlapped_blobs);
623 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
624 BLOBNBOX* neighbour = nullptr;
625 neighbour = n_it.data();
626 if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
628 break;
629 }
 // The loop above only stops short of cycling the list when a candidate
 // failed the stroke width match.
630 if (!n_it.cycled_list()) {
631 if (debug) {
632 tprintf("Bad stroke widths:");
633 PrintBoxWidths(blob);
634 }
635 continue; // Not good enough.
636 }
637
638 // Merge all the candidates into blob.
639 // We must remove blob from the grid and reinsert it after merging
640 // to maintain the integrity of the grid.
641 RemoveBBox(blob);
642 // Everything else will be calculated later.
643 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
644 BLOBNBOX* neighbour = n_it.data();
645 RemoveBBox(neighbour);
646 // Mark empty blob for deletion.
647 neighbour->set_region_type(BRT_NOISE);
648 blob->really_merge(neighbour);
 // When a rerotation other than the identity (1, 0) is set, the box is
 // rotated again after each individual merge.
649 if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
650 blob->rotate_box(rerotation_);
651 }
652 }
653 InsertBBox(true, true, blob);
654 ++num_fixed;
655 if (debug) {
 // The box printed here is the union accumulated by AccumulateOverlaps; it
 // is not re-read from blob after the merge.
656 tprintf("Done! Final box:");
657 bbox.print();
658 }
659 }
660 }
661 // Count remaining blobs.
662 int num_remaining = 0;
663 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
664 BLOBNBOX* blob = blob_it.data();
665 if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
666 ++num_remaining;
667 }
668 }
669 // Permanently delete all the marked blobs after first removing all
670 // references in the neighbour members.
671 block->DeleteUnownedNoise();
 // Worth another pass only if the merges made up more than
 // kBrokenCJKIterationFraction of the blobs that still have outlines.
672 return num_fixed > num_remaining * kBrokenCJKIterationFraction;
673}
674
675// Helper function to determine whether it is reasonable to merge the
676// bbox and the nbox for repairing broken CJK.
677// The distance apart must not exceed max_dist, the combined size must
678// not exceed max_size, and the aspect ratio must either improve or at
679// least not get worse by much.
680static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
681 bool debug, int max_size, int max_dist,
682 int* x_gap, int* y_gap) {
683 *x_gap = bbox.x_gap(nbox);
684 *y_gap = bbox.y_gap(nbox);
685 TBOX merged(nbox);
686 merged += bbox;
687 if (debug) {
688 tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
689 merged.print();
690 }
691 if (*x_gap <= max_dist && *y_gap <= max_dist &&
692 merged.width() <= max_size && merged.height() <= max_size) {
693 // Close enough to call overlapping. Check aspect ratios.
694 double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
695 if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
696 double new_ratio = static_cast<double>(merged.width()) / merged.height();
697 if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
698 if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
699 return true;
700 }
701 return false;
702}
703
704// Collect blobs that overlap or are within max_dist of the input bbox.
705// Return them in the list of blobs and expand the bbox to be the union
706// of all the boxes. not_this is excluded from the search, as are blobs
707// that cause the merged box to exceed max_size in either dimension.
708void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
709 int max_size, int max_dist,
710 TBOX* bbox, BLOBNBOX_CLIST* blobs) {
711 // While searching, nearests holds the nearest failed blob in each
712 // direction. When we have a nearest in each of the 4 directions, then
713 // the search is over, and at this point the final bbox must not overlap
714 // any of the nearests.
715 BLOBNBOX* nearests[BND_COUNT];
716 for (auto & nearest : nearests) {
717 nearest = nullptr;
718 }
 // The search is centred on the middle of the box as it is on entry; the
 // centre is not updated as bbox grows.
719 int x = (bbox->left() + bbox->right()) / 2;
720 int y = (bbox->bottom() + bbox->top()) / 2;
721 // Run a radial search for blobs that overlap or are sufficiently close.
722 BlobGridSearch radsearch(this);
723 radsearch.StartRadSearch(x, y, kCJKRadius);
724 BLOBNBOX* neighbour;
725 while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
726 if (neighbour == not_this) continue;
727 TBOX nbox = neighbour->bounding_box();
 // AcceptableCJKMerge always writes x_gap and y_gap, so they are valid in
 // the else-if branches below even though the merge was rejected.
728 int x_gap, y_gap;
729 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
730 &x_gap, &y_gap)) {
731 // Close enough to call overlapping. Merge boxes.
732 *bbox += nbox;
733 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
734 if (debug) {
735 tprintf("Added:");
736 nbox.print();
737 }
738 // Since we merged, search the nearests, as some might now be mergeable.
 // Note that nbox, x_gap and y_gap are reused (overwritten) by this loop.
739 for (int dir = 0; dir < BND_COUNT; ++dir) {
740 if (nearests[dir] == nullptr) continue;
741 nbox = nearests[dir]->bounding_box();
742 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
743 max_dist, &x_gap, &y_gap)) {
744 // Close enough to call overlapping. Merge boxes.
745 *bbox += nbox;
746 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
747 if (debug) {
748 tprintf("Added:");
749 nbox.print();
750 }
751 nearests[dir] = nullptr;
 // Setting dir to -1 makes the ++dir of the for statement restart at 0.
752 dir = -1; // Restart the search.
753 }
754 }
 // NOTE(review): a negative gap presumably means the boxes overlap along
 // that axis -- confirm against TBOX::x_gap/y_gap.
755 } else if (x_gap < 0 && x_gap <= y_gap) {
756 // A vertical neighbour. Record the nearest.
757 BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
758 if (nearests[dir] == nullptr ||
759 y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
760 nearests[dir] = neighbour;
761 }
762 } else if (y_gap < 0 && y_gap <= x_gap) {
763 // A horizontal neighbour. Record the nearest.
764 BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
765 if (nearests[dir] == nullptr ||
766 x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
767 nearests[dir] = neighbour;
768 }
769 }
770 // If all nearests are non-null, then we have finished.
771 if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
772 nearests[BND_ABOVE] && nearests[BND_BELOW])
773 break;
774 }
775 // Final overlap with a nearest is not allowed.
 // On failure the output list is emptied, but *bbox keeps the enlarged
 // union; an empty list is what tells the caller nothing should be merged.
776 for (auto & nearest : nearests) {
777 if (nearest == nullptr) continue;
778 const TBOX& nbox = nearest->bounding_box();
779 if (debug) {
780 tprintf("Testing for overlap with:");
781 nbox.print();
782 }
783 if (bbox->overlap(nbox)) {
784 blobs->shallow_clear();
785 if (debug)
786 tprintf("Final box overlaps nearest\n");
787 return;
788 }
789 }
790}
791
792// For each blob in this grid, Finds the textline direction to be horizontal
793// or vertical according to distance to neighbours and 1st and 2nd order
794// neighbours. Non-text tends to end up without a definite direction.
795// Result is setting of the neighbours and vert_possible/horz_possible
796// flags in the BLOBNBOXes currently in this grid.
797// This function is called more than once if page orientation is uncertain,
798// so display_if_debugging is true on the final call to display the results.
799void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode,
800 bool display_if_debugging) {
801 BlobGridSearch gsearch(this);
802 BLOBNBOX* bbox;
803 // For every bbox in the grid, set its neighbours.
804 gsearch.StartFullSearch();
805 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
806 SetNeighbours(false, display_if_debugging, bbox);
807 }
808 // Where vertical or horizontal wins by a big margin, clarify it.
809 gsearch.StartFullSearch();
810 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
811 SimplifyObviousNeighbours(bbox);
812 }
813 // Now try to make the blobs only vertical or horizontal using neighbours.
814 gsearch.StartFullSearch();
815 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
816 if (FindingVerticalOnly(pageseg_mode)) {
817 bbox->set_vert_possible(true);
818 bbox->set_horz_possible(false);
819 } else if (FindingHorizontalOnly(pageseg_mode)) {
820 bbox->set_vert_possible(false);
821 bbox->set_horz_possible(true);
822 } else {
823 SetNeighbourFlows(bbox);
824 }
825 }
826 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
827 textord_tabfind_show_strokewidths > 1) {
828 initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
829 }
830 // Improve flow direction with neighbours.
831 gsearch.StartFullSearch();
832 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
833 SmoothNeighbourTypes(pageseg_mode, false, bbox);
834 }
835 // Now allow reset of firm values to fix renegades.
836 gsearch.StartFullSearch();
837 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
838 SmoothNeighbourTypes(pageseg_mode, true, bbox);
839 }
840 // Repeat.
841 gsearch.StartFullSearch();
842 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
843 SmoothNeighbourTypes(pageseg_mode, true, bbox);
844 }
845 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
846 textord_tabfind_show_strokewidths > 1) {
847 widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
848 }
849}
850
851// Sets the neighbours and good_stroke_neighbours members of the blob by
852// searching close on all 4 sides.
853// When finding leader dots/dashes, there is a slightly different rule for
854// what makes a good neighbour.
855void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
856 BLOBNBOX* blob) {
857 int line_trap_count = 0;
858 for (int dir = 0; dir < BND_COUNT; ++dir) {
859 auto bnd = static_cast<BlobNeighbourDir>(dir);
860 line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
861 }
862 if (line_trap_count > 0 && activate_line_trap) {
863 // It looks like a line so isolate it by clearing its neighbours.
864 blob->ClearNeighbours();
865 const TBOX& box = blob->bounding_box();
866 blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
867 }
868}
869
870
871// Sets the good_stroke_neighbours member of the blob if it has a
872// GoodNeighbour on the given side.
873// Also sets the neighbour in the blob, whether or not a good one is found.
874// Returns the number of blobs in the nearby search area that would lead us to
875// believe that this blob is a line separator.
876// Leaders get extra special lenient treatment.
877int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
878 BLOBNBOX* blob) {
879 // Search for neighbours that overlap vertically.
880 TBOX blob_box = blob->bounding_box();
881 bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
882 blob_box.bottom());
883 if (debug) {
884 tprintf("FGN in dir %d for blob:", dir);
885 blob_box.print();
886 }
887 int top = blob_box.top();
888 int bottom = blob_box.bottom();
889 int left = blob_box.left();
890 int right = blob_box.right();
891 int width = right - left;
892 int height = top - bottom;
893
894 // A trap to detect lines tests for the min dimension of neighbours
895 // being larger than a multiple of the min dimension of the line
896 // and the larger dimension being smaller than a fraction of the max
897 // dimension of the line.
898 int line_trap_max = std::max(width, height) / kLineTrapLongest;
899 int line_trap_min = std::min(width, height) * kLineTrapShortest;
900 int line_trap_count = 0;
901
902 int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
903 ? height / 2 : width / 2;
904 int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
905 ? height / 3 : width / 3;
906 if (leaders)
907 min_good_overlap = min_decent_overlap = 1;
908
909 int search_pad = static_cast<int>(
910 sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
911 if (gridsize() > search_pad)
912 search_pad = gridsize();
913 TBOX search_box = blob_box;
914 // Pad the search in the appropriate direction.
915 switch (dir) {
916 case BND_LEFT:
917 search_box.set_left(search_box.left() - search_pad);
918 break;
919 case BND_RIGHT:
920 search_box.set_right(search_box.right() + search_pad);
921 break;
922 case BND_BELOW:
923 search_box.set_bottom(search_box.bottom() - search_pad);
924 break;
925 case BND_ABOVE:
926 search_box.set_top(search_box.top() + search_pad);
927 break;
928 case BND_COUNT:
929 return 0;
930 }
931
932 BlobGridSearch rectsearch(this);
933 rectsearch.StartRectSearch(search_box);
934 BLOBNBOX* best_neighbour = nullptr;
935 double best_goodness = 0.0;
936 bool best_is_good = false;
937 BLOBNBOX* neighbour;
938 while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {
939 TBOX nbox = neighbour->bounding_box();
940 if (neighbour == blob)
941 continue;
942 int mid_x = (nbox.left() + nbox.right()) / 2;
943 if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
944 continue; // In a different column.
945 if (debug) {
946 tprintf("Neighbour at:");
947 nbox.print();
948 }
949
950 // Last-minute line detector. There is a small upper limit to the line
951 // width accepted by the morphological line detector.
952 int n_width = nbox.width();
953 int n_height = nbox.height();
954 if (std::min(n_width, n_height) > line_trap_min &&
955 std::max(n_width, n_height) < line_trap_max)
956 ++line_trap_count;
957 // Heavily joined text, such as Arabic may have very different sizes when
958 // looking at the maxes, but the heights may be almost identical, so check
959 // for a difference in height if looking sideways or width vertically.
960 if (TabFind::VeryDifferentSizes(std::max(n_width, n_height),
961 std::max(width, height)) &&
962 (((dir == BND_LEFT || dir ==BND_RIGHT) &&
963 TabFind::DifferentSizes(n_height, height)) ||
964 ((dir == BND_BELOW || dir ==BND_ABOVE) &&
965 TabFind::DifferentSizes(n_width, width)))) {
966 if (debug) tprintf("Bad size\n");
967 continue; // Could be a different font size or non-text.
968 }
969 // Amount of vertical overlap between the blobs.
970 int overlap;
971 // If the overlap is along the short side of the neighbour, and it
972 // is fully overlapped, then perp_overlap holds the length of the long
973 // side of the neighbour. A measure to include hyphens and dashes as
974 // legitimate neighbours.
975 int perp_overlap;
976 int gap;
977 if (dir == BND_LEFT || dir == BND_RIGHT) {
978 overlap = std::min(static_cast<int>(nbox.top()), top) - std::max(static_cast<int>(nbox.bottom()), bottom);
979 if (overlap == nbox.height() && nbox.width() > nbox.height())
980 perp_overlap = nbox.width();
981 else
982 perp_overlap = overlap;
983 gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
984 if (gap <= 0) {
985 if (debug) tprintf("On wrong side\n");
986 continue; // On the wrong side.
987 }
988 gap -= n_width;
989 } else {
990 overlap = std::min(static_cast<int>(nbox.right()), right) - std::max(static_cast<int>(nbox.left()), left);
991 if (overlap == nbox.width() && nbox.height() > nbox.width())
992 perp_overlap = nbox.height();
993 else
994 perp_overlap = overlap;
995 gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
996 if (gap <= 0) {
997 if (debug) tprintf("On wrong side\n");
998 continue; // On the wrong side.
999 }
1000 gap -= n_height;
1001 }
1002 if (-gap > overlap) {
1003 if (debug) tprintf("Overlaps wrong way\n");
1004 continue; // Overlaps the wrong way.
1005 }
1006 if (perp_overlap < min_decent_overlap) {
1007 if (debug) tprintf("Doesn't overlap enough\n");
1008 continue; // Doesn't overlap enough.
1009 }
1010 bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1011 TabFind::DifferentSizes(width, n_width);
1012 bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1013 blob->MatchingStrokeWidth(*neighbour,
1016 // Best is a fuzzy combination of gap, overlap and is good.
1017 // Basically if you make one thing twice as good without making
1018 // anything else twice as bad, then it is better.
1019 if (gap < 1) gap = 1;
1020 double goodness = (1.0 + is_good) * overlap / gap;
1021 if (debug) {
1022 tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1023 goodness, best_goodness, is_good, overlap, gap);
1024 }
1025 if (goodness > best_goodness) {
1026 best_neighbour = neighbour;
1027 best_goodness = goodness;
1028 best_is_good = is_good;
1029 }
1030 }
1031 blob->set_neighbour(dir, best_neighbour, best_is_good);
1032 return line_trap_count;
1033}
1034
1035// Helper to get a list of 1st-order neighbours.
1036static void ListNeighbours(const BLOBNBOX* blob,
1037 BLOBNBOX_CLIST* neighbours) {
1038 for (int dir = 0; dir < BND_COUNT; ++dir) {
1039 auto bnd = static_cast<BlobNeighbourDir>(dir);
1040 BLOBNBOX* neighbour = blob->neighbour(bnd);
1041 if (neighbour != nullptr) {
1042 neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1043 }
1044 }
1045}
1046
1047// Helper to get a list of 1st and 2nd order neighbours.
1048static void List2ndNeighbours(const BLOBNBOX* blob,
1049 BLOBNBOX_CLIST* neighbours) {
1050 ListNeighbours(blob, neighbours);
1051 for (int dir = 0; dir < BND_COUNT; ++dir) {
1052 auto bnd = static_cast<BlobNeighbourDir>(dir);
1053 BLOBNBOX* neighbour = blob->neighbour(bnd);
1054 if (neighbour != nullptr) {
1055 ListNeighbours(neighbour, neighbours);
1056 }
1057 }
1058}
1059
1060// Helper to get a list of 1st, 2nd and 3rd order neighbours.
1061static void List3rdNeighbours(const BLOBNBOX* blob,
1062 BLOBNBOX_CLIST* neighbours) {
1063 List2ndNeighbours(blob, neighbours);
1064 for (int dir = 0; dir < BND_COUNT; ++dir) {
1065 auto bnd = static_cast<BlobNeighbourDir>(dir);
1066 BLOBNBOX* neighbour = blob->neighbour(bnd);
1067 if (neighbour != nullptr) {
1068 List2ndNeighbours(neighbour, neighbours);
1069 }
1070 }
1071}
1072
1073// Helper to count the evidence for verticalness or horizontalness
1074// in a list of neighbours.
1075static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1076 int* pure_h_count, int* pure_v_count) {
1077 if (neighbours->length() <= kMostlyOneDirRatio)
1078 return;
1079 BLOBNBOX_C_IT it(neighbours);
1080 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1081 BLOBNBOX* blob = it.data();
1082 int h_min, h_max, v_min, v_max;
1083 blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1084 if (debug)
1085 tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1086 if (h_max < v_min ||
1087 blob->leader_on_left() || blob->leader_on_right()) {
1088 // Horizontal gaps are clear winners. Count a pure horizontal.
1089 ++*pure_h_count;
1090 if (debug) tprintf("Horz at:");
1091 } else if (v_max < h_min) {
1092 // Vertical gaps are clear winners. Clear a pure vertical.
1093 ++*pure_v_count;
1094 if (debug) tprintf("Vert at:");
1095 } else {
1096 if (debug) tprintf("Neither at:");
1097 }
1098 if (debug)
1099 blob->bounding_box().print();
1100 }
1101}
1102
1103// Makes the blob to be only horizontal or vertical where evidence
1104// is clear based on gaps of 2nd order neighbours, or definite individual
1105// blobs.
1106void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1107 if (blob->DefiniteIndividualFlow())
1108 return;
1109 bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1110 blob->bounding_box().bottom());
1111 if (debug) {
1112 tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1113 blob->flow(), blob->region_type());
1114 blob->bounding_box().print();
1115 }
1116 BLOBNBOX_CLIST neighbours;
1117 List3rdNeighbours(blob, &neighbours);
1118 // The number of pure horizontal and vertical neighbours.
1119 int pure_h_count = 0;
1120 int pure_v_count = 0;
1121 CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1122 if (debug) {
1123 HandleClick(blob->bounding_box().left() + 1,
1124 blob->bounding_box().bottom() + 1);
1125 tprintf("SetFlows: h_count=%d, v_count=%d\n",
1126 pure_h_count, pure_v_count);
1127 }
1128 if (!neighbours.empty()) {
1129 blob->set_vert_possible(true);
1130 blob->set_horz_possible(true);
1131 if (pure_h_count > 2 * pure_v_count) {
1132 // Horizontal gaps are clear winners. Clear vertical neighbours.
1133 blob->set_vert_possible(false);
1134 } else if (pure_v_count > 2 * pure_h_count) {
1135 // Vertical gaps are clear winners. Clear horizontal neighbours.
1136 blob->set_horz_possible(false);
1137 }
1138 } else {
1139 // Lonely blob. Can't tell its flow direction.
1140 blob->set_vert_possible(false);
1141 blob->set_horz_possible(false);
1142 }
1143}
1144
1145
1146// Helper to count the number of horizontal and vertical blobs in a list.
1147static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1148 int* pure_h_count, int* pure_v_count) {
1149 BLOBNBOX_C_IT it(neighbours);
1150 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1151 BLOBNBOX* blob = it.data();
1152 if (blob->UniquelyHorizontal())
1153 ++*pure_h_count;
1154 if (blob->UniquelyVertical())
1155 ++*pure_v_count;
1156 }
1157}
1158
1159// Nullify the neighbours in the wrong directions where the direction
1160// is clear-cut based on a distance margin. Good for isolating vertical
1161// text from neighbouring horizontal text.
1162void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
1163 // Case 1: We have text that is likely several characters, blurry and joined
1164 // together.
1165 if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1166 blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1167 // The blob is complex (not stick-like).
1168 if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1169 // Horizontal conjoined text.
1170 blob->set_neighbour(BND_ABOVE, nullptr, false);
1171 blob->set_neighbour(BND_BELOW, nullptr, false);
1172 return;
1173 }
1174 if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1175 // Vertical conjoined text.
1176 blob->set_neighbour(BND_LEFT, nullptr, false);
1177 blob->set_neighbour(BND_RIGHT, nullptr, false);
1178 return;
1179 }
1180 }
1181
1182 // Case 2: This blob is likely a single character.
1183 int margin = gridsize() / 2;
1184 int h_min, h_max, v_min, v_max;
1185 blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1186 if ((h_max + margin < v_min && h_max < margin / 2) ||
1187 blob->leader_on_left() || blob->leader_on_right()) {
1188 // Horizontal gaps are clear winners. Clear vertical neighbours.
1189 blob->set_neighbour(BND_ABOVE, nullptr, false);
1190 blob->set_neighbour(BND_BELOW, nullptr, false);
1191 } else if (v_max + margin < h_min && v_max < margin / 2) {
1192 // Vertical gaps are clear winners. Clear horizontal neighbours.
1193 blob->set_neighbour(BND_LEFT, nullptr, false);
1194 blob->set_neighbour(BND_RIGHT, nullptr, false);
1195 }
1196}
1197
1198// Smoothes the vertical/horizontal type of the blob based on the
1199// 2nd-order neighbours. If reset_all is true, then all blobs are
1200// changed. Otherwise, only ambiguous blobs are processed.
1201void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all,
1202 BLOBNBOX* blob) {
1203 if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1204 // There are both horizontal and vertical so try to fix it.
1205 BLOBNBOX_CLIST neighbours;
1206 List2ndNeighbours(blob, &neighbours);
1207 // The number of pure horizontal and vertical neighbours.
1208 int pure_h_count = 0;
1209 int pure_v_count = 0;
1210 CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1212 blob->bounding_box().bottom())) {
1213 HandleClick(blob->bounding_box().left() + 1,
1214 blob->bounding_box().bottom() + 1);
1215 tprintf("pure_h=%d, pure_v=%d\n",
1216 pure_h_count, pure_v_count);
1217 }
1218 if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1219 // Horizontal gaps are clear winners. Clear vertical neighbours.
1220 blob->set_vert_possible(false);
1221 blob->set_horz_possible(true);
1222 } else if (pure_v_count > pure_h_count &&
1223 !FindingHorizontalOnly(pageseg_mode)) {
1224 // Vertical gaps are clear winners. Clear horizontal neighbours.
1225 blob->set_horz_possible(false);
1226 blob->set_vert_possible(true);
1227 }
1228 } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1229 blob->bounding_box().bottom())) {
1230 HandleClick(blob->bounding_box().left() + 1,
1231 blob->bounding_box().bottom() + 1);
1232 tprintf("Clean on pass 3!\n");
1233 }
1234}
1235
1236// Partition creation. Accumulates vertical and horizontal text chains,
1237// puts the remaining blobs in as unknowns, and then merges/splits to
1238// minimize overlap and smoothes the types with neighbours and the color
1239// image if provided. rerotation is used to rotate the coordinate space
1240// back to the nontext_map_ image.
1241// If find_problems is true, detects possible noise pollution by the amount
1242// of partition overlap that is created by the diacritics. If excessive, the
1243// noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1244// [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1245// the components, saves the skew_angle and returns PFR_SKEW.] If the return
1246// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1247// called again after cleaning up the partly done work.
1248PartitionFindResult StrokeWidth::FindInitialPartitions(
1249 PageSegMode pageseg_mode, const FCOORD& rerotation, bool find_problems,
1250 TO_BLOCK* block, BLOBNBOX_LIST* diacritic_blobs,
1251 ColPartitionGrid* part_grid, ColPartition_LIST* big_parts,
1252 FCOORD* skew_angle) {
1253 if (!FindingHorizontalOnly(pageseg_mode)) FindVerticalTextChains(part_grid);
1254 if (!FindingVerticalOnly(pageseg_mode)) FindHorizontalTextChains(part_grid);
1255 if (textord_tabfind_show_strokewidths) {
1256 chains_win_ = MakeWindow(0, 400, "Initial text chains");
1257 part_grid->DisplayBoxes(chains_win_);
1258 projection_->DisplayProjection();
1259 }
1260 if (find_problems) {
1261 // TODO(rays) Do something to find skew, set skew_angle and return if there
1262 // is some.
1263 }
1264 part_grid->SplitOverlappingPartitions(big_parts);
1265 EasyMerges(part_grid);
1266 RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1267 TBOX grid_box(bleft(), tright());
1268 while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1269 rerotation));
1270 while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1271 grid_box, rerotation));
1272 int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);
1273 TestDiacritics(part_grid, block);
1274 MergeDiacritics(block, part_grid);
1275 if (find_problems && diacritic_blobs != nullptr &&
1276 DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
1277 diacritic_blobs)) {
1278 return PFR_NOISE;
1279 }
1280 if (textord_tabfind_show_strokewidths) {
1281 textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1282 part_grid->DisplayBoxes(textlines_win_);
1283 diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1284 }
1285 PartitionRemainingBlobs(pageseg_mode, part_grid);
1286 part_grid->SplitOverlappingPartitions(big_parts);
1287 EasyMerges(part_grid);
1288 while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1289 rerotation));
1290 while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1291 grid_box, rerotation));
1292 // Now eliminate strong stuff in a sea of the opposite.
1293 while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1294 grid_box, rerotation));
1295 if (textord_tabfind_show_strokewidths) {
1296 smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1297 part_grid->DisplayBoxes(smoothed_win_);
1298 }
1299 return PFR_OK;
1300}
1301
1302// Detects noise by a significant increase in partition overlap from
1303// pre_overlap to now, and removes noise from the union of all the overlapping
1304// partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1305// was found and removed.
1306bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
1307 TO_BLOCK* block,
1308 ColPartitionGrid* part_grid,
1309 BLOBNBOX_LIST* diacritic_blobs) {
1310 ColPartitionGrid* noise_grid = nullptr;
1311 int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1312 if (pre_overlap == 0) pre_overlap = 1;
1313 BLOBNBOX_IT diacritic_it(diacritic_blobs);
1314 if (noise_grid != nullptr) {
1315 if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1316 post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1317 // This is noisy enough to fix.
1318 if (textord_tabfind_show_strokewidths) {
1319 ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
1320 noise_grid->DisplayBoxes(noise_win);
1321 }
1322 part_grid->DeleteNonLeaderParts();
1323 BLOBNBOX_IT blob_it(&block->noise_blobs);
1324 ColPartitionGridSearch rsearch(noise_grid);
1325 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1326 BLOBNBOX* blob = blob_it.data();
1327 blob->ClearNeighbours();
1328 if (!blob->IsDiacritic() || blob->owner() != nullptr)
1329 continue; // Not a noise candidate.
1330 TBOX blob_box(blob->bounding_box());
1331 TBOX search_box(blob->bounding_box());
1332 search_box.pad(gridsize(), gridsize());
1333 rsearch.StartRectSearch(search_box);
1334 ColPartition* part = rsearch.NextRectSearch();
1335 if (part != nullptr) {
1336 // Consider blob as possible noise.
1337 blob->set_owns_cblob(true);
1338 blob->compute_bounding_box();
1339 diacritic_it.add_after_then_move(blob_it.extract());
1340 }
1341 }
1342 noise_grid->DeleteParts();
1343 delete noise_grid;
1344 return true;
1345 }
1346 noise_grid->DeleteParts();
1347 delete noise_grid;
1348 }
1349 return false;
1350}
1351
1352// Helper verifies that blob's neighbour in direction dir is good to add to a
1353// vertical text chain by returning the neighbour if it is not null, not owned,
1354// and not uniquely horizontal, as well as its neighbour in the opposite
1355// direction is blob.
1356static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1357 BlobNeighbourDir dir) {
1358 BLOBNBOX* next_blob = blob->neighbour(dir);
1359 if (next_blob == nullptr || next_blob->owner() != nullptr ||
1360 next_blob->UniquelyHorizontal())
1361 return nullptr;
1362 if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1363 return next_blob;
1364 return nullptr;
1365}
1366
1367// Finds vertical chains of text-like blobs and puts them in ColPartitions.
1368void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1369 // A PageSegMode that forces vertical textlines with the current rotation.
1370 PageSegMode pageseg_mode =
1371 rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1372 BlobGridSearch gsearch(this);
1373 BLOBNBOX* bbox;
1374 gsearch.StartFullSearch();
1375 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1376 // Only process boxes that have no horizontal hope and have not yet
1377 // been included in a chain.
1378 BLOBNBOX* blob;
1379 if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&
1380 (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {
1381 // Put all the linked blobs into a ColPartition.
1382 ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1383 part->AddBox(bbox);
1384 while (blob != nullptr) {
1385 part->AddBox(blob);
1386 blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1387 }
1388 blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1389 while (blob != nullptr) {
1390 part->AddBox(blob);
1391 blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1392 }
1393 CompletePartition(pageseg_mode, part, part_grid);
1394 }
1395 }
1396}
1397
1398// Helper verifies that blob's neighbour in direction dir is good to add to a
1399// horizontal text chain by returning the neighbour if it is not null, not
1400// owned, and not uniquely vertical, as well as its neighbour in the opposite
1401// direction is blob.
1402static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1403 BlobNeighbourDir dir) {
1404 BLOBNBOX* next_blob = blob->neighbour(dir);
1405 if (next_blob == nullptr || next_blob->owner() != nullptr ||
1406 next_blob->UniquelyVertical())
1407 return nullptr;
1408 if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1409 return next_blob;
1410 return nullptr;
1411}
1412
1413// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1414void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1415 // A PageSegMode that forces horizontal textlines with the current rotation.
1416 PageSegMode pageseg_mode =
1417 rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1418 BlobGridSearch gsearch(this);
1419 BLOBNBOX* bbox;
1420 gsearch.StartFullSearch();
1421 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1422 BLOBNBOX* blob;
1423 if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&
1424 (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {
1425 // Put all the linked blobs into a ColPartition.
1426 ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1427 part->AddBox(bbox);
1428 while (blob != nullptr) {
1429 part->AddBox(blob);
1430 blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1431 }
1432 blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1433 while (blob != nullptr) {
1434 part->AddBox(blob);
1435 blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1436 }
1437 CompletePartition(pageseg_mode, part, part_grid);
1438 }
1439 }
1440}
1441
1442// Finds diacritics and saves their base character in the blob.
1443// The objective is to move all diacritics to the noise_blobs list, so
1444// they don't mess up early textline finding/merging, or force splits
1445// on textlines that overlap a bit. Blobs that become diacritics must be
1446// either part of no ColPartition (nullptr owner) or in a small partition in
1447// which ALL the blobs are diacritics, in which case the partition is
1448// exploded (deleted) back to its blobs.
1449void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1450 BlobGrid small_grid(gridsize(), bleft(), tright());
1451 small_grid.InsertBlobList(&block->noise_blobs);
1452 small_grid.InsertBlobList(&block->blobs);
1453 int medium_diacritics = 0;
1454 int small_diacritics = 0;
1455 BLOBNBOX_IT small_it(&block->noise_blobs);
1456 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1457 BLOBNBOX* blob = small_it.data();
1458 if (blob->owner() == nullptr && !blob->IsDiacritic() &&
1459 DiacriticBlob(&small_grid, blob)) {
1460 ++small_diacritics;
1461 }
1462 }
1463 BLOBNBOX_IT blob_it(&block->blobs);
1464 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1465 BLOBNBOX* blob = blob_it.data();
1466 if (blob->IsDiacritic()) {
1467 small_it.add_to_end(blob_it.extract());
1468 continue; // Already a diacritic.
1469 }
1470 ColPartition* part = blob->owner();
1471 if (part == nullptr && DiacriticBlob(&small_grid, blob)) {
1472 ++medium_diacritics;
1473 RemoveBBox(blob);
1474 small_it.add_to_end(blob_it.extract());
1475 } else if (part != nullptr && !part->block_owned() &&
1476 part->boxes_count() < 3) {
1477 // We allow blobs in small partitions to become diacritics if ALL the
1478 // blobs in the partition qualify as we can then cleanly delete the
1479 // partition, turn all the blobs in it to diacritics and they can be
1480 // merged into the base character partition more easily than merging
1481 // the partitions.
1482 BLOBNBOX_C_IT box_it(part->boxes());
1483 for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1484 DiacriticBlob(&small_grid, box_it.data());
1485 box_it.forward());
1486 if (box_it.cycled_list()) {
1487 // They are all good.
1488 while (!box_it.empty()) {
1489 // Liberate the blob from its partition so it can be treated
1490 // as a diacritic and merged explicitly with the base part.
1491 // The blob is really owned by the block. The partition "owner"
1492 // is nulled to allow the blob to get merged with its base character
1493 // partition.
1494 BLOBNBOX* box = box_it.extract();
1495 box->set_owner(nullptr);
1496 box_it.forward();
1497 ++medium_diacritics;
1498 // We remove the blob from the grid so it isn't found by subsequent
1499 // searches where we might not want to include diacritics.
1500 RemoveBBox(box);
1501 }
1502 // We only move the one blob to the small list here, but the others
1503 // all get moved by the test at the top of the loop.
1504 small_it.add_to_end(blob_it.extract());
1505 part_grid->RemoveBBox(part);
1506 delete part;
1507 }
1508 } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1509 blob->bounding_box().bottom())) {
1510 tprintf("Blob not available to be a diacritic at:");
1511 blob->bounding_box().print();
1512 }
1513 }
1514 if (textord_tabfind_show_strokewidths) {
1515 tprintf("Found %d small diacritics, %d medium\n",
1516 small_diacritics, medium_diacritics);
1517 }
1518}
1519
1520// Searches this grid for an appropriately close and sized neighbour of the
1521// given [small] blob. If such a blob is found, the diacritic base is saved
1522// in the blob and true is returned.
1523// The small_grid is a secondary grid that contains the small/noise objects
1524// that are not in this grid, but may be useful for determining a connection
1525// between blob and its potential base character. (See DiacriticXGapFilled.)
1526bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1528 blob->region_type() == BRT_VERT_TEXT)
1529 return false;
1530 TBOX small_box(blob->bounding_box());
1531 bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1532 small_box.bottom());
1533 if (debug) {
1534 tprintf("Testing blob for diacriticness at:");
1535 small_box.print();
1536 }
1537 int x = (small_box.left() + small_box.right()) / 2;
1538 int y = (small_box.bottom() + small_box.top()) / 2;
1539 int grid_x, grid_y;
1540 GridCoords(x, y, &grid_x, &grid_y);
1541 int height = small_box.height();
1542 // Setup a rectangle search to find its nearest base-character neighbour.
1543 // We keep 2 different best candidates:
1544 // best_x_overlap is a category of base characters that have an overlap in x
1545 // (like a acute) in which we look for the least y-gap, computed using the
1546 // projection to favor base characters in the same textline.
1547 // best_y_overlap is a category of base characters that have no x overlap,
1548 // (nominally a y-overlap is preferrecd but not essential) in which we
1549 // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1550 // a lower weight to catch quotes at the end of a textline.
1551 // NOTE that x-gap and y-gap are measured from the nearest side of the base
1552 // character to the FARTHEST side of the diacritic to allow small diacritics
1553 // to be a reasonable distance away, but not big diacritics.
1554 BLOBNBOX* best_x_overlap = nullptr;
1555 BLOBNBOX* best_y_overlap = nullptr;
1556 int best_total_dist = 0;
1557 int best_y_gap = 0;
1558 TBOX best_xbox;
1559 // TODO(rays) the search box could be setup using the projection as a guide.
1560 TBOX search_box(small_box);
1563 search_box.pad(x_pad, y_pad);
1564 BlobGridSearch rsearch(this);
1565 rsearch.SetUniqueMode(true);
1566 int min_height = height * kMinDiacriticSizeRatio;
1567 rsearch.StartRectSearch(search_box);
1568 BLOBNBOX* neighbour;
1569 while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1570 if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1571 neighbour == blob || neighbour->owner() == blob->owner())
1572 continue;
1573 TBOX nbox = neighbour->bounding_box();
1574 if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||
1575 (neighbour->flow() != BTFT_CHAIN &&
1576 neighbour->flow() != BTFT_STRONG_CHAIN)) {
1577 if (debug) {
1578 tprintf("Neighbour not strong enough:");
1579 nbox.print();
1580 }
1581 continue; // Diacritics must be attached to strong text.
1582 }
1583 if (nbox.height() < min_height) {
1584 if (debug) {
1585 tprintf("Neighbour not big enough:");
1586 nbox.print();
1587 }
1588 continue; // Too small to be the base character.
1589 }
1590 int x_gap = small_box.x_gap(nbox);
1591 int y_gap = small_box.y_gap(nbox);
1592 int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1593 true, denorm_,
1594 debug);
1595 if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1596 x_gap, y_gap, total_distance);
1597 if (total_distance >
1599 if (debug) {
1600 tprintf("Neighbour with median size %d too far away:",
1601 neighbour->owner()->median_height());
1602 neighbour->bounding_box().print();
1603 }
1604 continue; // Diacritics must not be too distant.
1605 }
1606 if (x_gap <= 0) {
1607 if (debug) {
1608 tprintf("Computing reduced box for :");
1609 nbox.print();
1610 }
1611 int left = small_box.left() - small_box.width();
1612 int right = small_box.right() + small_box.width();
1613 nbox = neighbour->BoundsWithinLimits(left, right);
1614 y_gap = small_box.y_gap(nbox);
1615 if (best_x_overlap == nullptr || y_gap < best_y_gap) {
1616 best_x_overlap = neighbour;
1617 best_xbox = nbox;
1618 best_y_gap = y_gap;
1619 if (debug) {
1620 tprintf("New best:");
1621 nbox.print();
1622 }
1623 } else if (debug) {
1624 tprintf("Shrunken box doesn't win:");
1625 nbox.print();
1626 }
1627 } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1628 if (best_y_overlap == nullptr || total_distance < best_total_dist) {
1629 if (debug) {
1630 tprintf("New best y overlap:");
1631 nbox.print();
1632 }
1633 best_y_overlap = neighbour;
1634 best_total_dist = total_distance;
1635 } else if (debug) {
1636 tprintf("New y overlap box doesn't win:");
1637 nbox.print();
1638 }
1639 } else if (debug) {
1640 tprintf("Neighbour wrong side of a tab:");
1641 nbox.print();
1642 }
1643 }
1644 if (best_x_overlap != nullptr &&
1645 (best_y_overlap == nullptr ||
1646 best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1647 blob->set_diacritic_box(best_xbox);
1648 blob->set_base_char_blob(best_x_overlap);
1649 if (debug) {
1650 tprintf("DiacriticBlob OK! (x-overlap:");
1651 small_box.print();
1652 best_xbox.print();
1653 }
1654 return true;
1655 }
1656 if (best_y_overlap != nullptr &&
1657 DiacriticXGapFilled(small_grid, small_box,
1658 best_y_overlap->bounding_box()) &&
1659 NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1660 blob->set_diacritic_box(best_y_overlap->bounding_box());
1661 blob->set_base_char_blob(best_y_overlap);
1662 if (debug) {
1663 tprintf("DiacriticBlob OK! (y-overlap:");
1664 small_box.print();
1665 best_y_overlap->bounding_box().print();
1666 }
1667 return true;
1668 }
1669 if (debug) {
1670 tprintf("DiacriticBlob fails:");
1671 small_box.print();
1672 tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1673 if (best_y_overlap != nullptr) {
1674 tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1675 DiacriticXGapFilled(small_grid, small_box,
1676 best_y_overlap->bounding_box()),
1677 NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1678 }
1679 }
1680 return false;
1681}
1682
1683// Returns true if there is no gap between the base char and the diacritic
1684// bigger than a fraction of the height of the base char:
1685// Eg: line end.....'
1686// The quote is a long way from the end of the line, yet it needs to be a
1687// diacritic. To determine that the quote is not part of an image, or
1688// a different text block, we check for other marks in the gap between
1689// the base char and the diacritic.
1690// '<--Diacritic
1691// |---------|
1692// | |<-toobig-gap->
1693// | Base |<ok gap>
1694// |---------| x<-----Dot occupying gap
1695// The grid is const really.
1696bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1697 const TBOX& diacritic_box,
1698 const TBOX& base_box) {
1699 // Since most gaps are small, use an iterative algorithm to search the gap.
1700 int max_gap = IntCastRounded(base_box.height() *
1702 TBOX occupied_box(base_box);
1703 int diacritic_gap;
1704 while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1705 TBOX search_box(occupied_box);
1706 if (diacritic_box.left() > search_box.right()) {
1707 // We are looking right.
1708 search_box.set_left(search_box.right());
1709 search_box.set_right(search_box.left() + max_gap);
1710 } else {
1711 // We are looking left.
1712 search_box.set_right(search_box.left());
1713 search_box.set_left(search_box.left() - max_gap);
1714 }
1715 BlobGridSearch rsearch(grid);
1716 rsearch.StartRectSearch(search_box);
1717 BLOBNBOX* neighbour;
1718 while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1719 const TBOX& nbox = neighbour->bounding_box();
1720 if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1721 if (nbox.left() < occupied_box.left())
1722 occupied_box.set_left(nbox.left());
1723 if (nbox.right() > occupied_box.right())
1724 occupied_box.set_right(nbox.right());
1725 break;
1726 }
1727 }
1728 if (neighbour == nullptr)
1729 return false; // Found a big gap.
1730 }
1731 return true; // The gap was filled.
1732}
1733
1734// Merges diacritics with the ColPartition of the base character blob.
1735void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1736 ColPartitionGrid* part_grid) {
1737 BLOBNBOX_IT small_it(&block->noise_blobs);
1738 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1739 BLOBNBOX* blob = small_it.data();
1740 if (blob->base_char_blob() != nullptr) {
1741 ColPartition* part = blob->base_char_blob()->owner();
1742 // The base character must be owned by a partition and that partition
1743 // must not be on the big_parts list (not block owned).
1744 if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&
1745 blob->IsDiacritic()) {
1746 // The partition has to be removed from the grid and reinserted
1747 // because its bounding box may change.
1748 part_grid->RemoveBBox(part);
1749 part->AddBox(blob);
1750 blob->set_region_type(part->blob_type());
1751 blob->set_flow(part->flow());
1752 blob->set_owner(part);
1753 part_grid->InsertBBox(true, true, part);
1754 }
1755 // Set all base chars to nullptr before any blobs get deleted.
1756 blob->set_base_char_blob(nullptr);
1757 }
1758 }
1759}
1760
1761// Any blobs on the large_blobs list of block that are still unowned by a
1762// ColPartition, are probably drop-cap or vertically touching so the blobs
1763// are removed to the big_parts list and treated separately.
1764void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1765 ColPartitionGrid* part_grid,
1766 ColPartition_LIST* big_parts) {
1767 BLOBNBOX_IT large_it(&block->large_blobs);
1768 for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1769 BLOBNBOX* blob = large_it.data();
1770 ColPartition* big_part = blob->owner();
1771 if (big_part == nullptr) {
1772 // Large blobs should have gone into partitions by now if they are
1773 // genuine characters, so move any unowned ones out to the big parts
1774 // list. This will include drop caps and vertically touching characters.
1775 ColPartition::MakeBigPartition(blob, big_parts);
1776 }
1777 }
1778}
1779
1780// All remaining unused blobs are put in individual ColPartitions.
1781void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode,
1782 ColPartitionGrid* part_grid) {
1783 BlobGridSearch gsearch(this);
1784 BLOBNBOX* bbox;
1785 int prev_grid_x = -1;
1786 int prev_grid_y = -1;
1787 BLOBNBOX_CLIST cell_list;
1788 BLOBNBOX_C_IT cell_it(&cell_list);
1789 bool cell_all_noise = true;
1790 gsearch.StartFullSearch();
1791 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1792 int grid_x = gsearch.GridX();
1793 int grid_y = gsearch.GridY();
1794 if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1795 // New cell. Process old cell.
1796 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1797 &cell_list);
1798 cell_it.set_to_list(&cell_list);
1799 prev_grid_x = grid_x;
1800 prev_grid_y = grid_y;
1801 cell_all_noise = true;
1802 }
1803 if (bbox->owner() == nullptr) {
1804 cell_it.add_to_end(bbox);
1805 if (bbox->flow() != BTFT_NONTEXT)
1806 cell_all_noise = false;
1807 } else {
1808 cell_all_noise = false;
1809 }
1810 }
1811 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid,
1812 &cell_list);
1813}
1814
1815// If combine, put all blobs in the cell_list into a single partition, otherwise
1816// put each one into its own partition.
1817void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode,
1818 bool combine,
1819 ColPartitionGrid* part_grid,
1820 BLOBNBOX_CLIST* cell_list) {
1821 if (cell_list->empty())
1822 return;
1823 BLOBNBOX_C_IT cell_it(cell_list);
1824 if (combine) {
1825 BLOBNBOX* bbox = cell_it.extract();
1826 ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1827 part->AddBox(bbox);
1828 part->set_flow(bbox->flow());
1829 for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1830 part->AddBox(cell_it.extract());
1831 }
1832 CompletePartition(pageseg_mode, part, part_grid);
1833 } else {
1834 for (; !cell_it.empty(); cell_it.forward()) {
1835 BLOBNBOX* bbox = cell_it.extract();
1836 ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1837 part->set_flow(bbox->flow());
1838 part->AddBox(bbox);
1839 CompletePartition(pageseg_mode, part, part_grid);
1840 }
1841 }
1842}
1843
1844// Helper function to finish setting up a ColPartition and insert into
1845// part_grid.
1846void StrokeWidth::CompletePartition(PageSegMode pageseg_mode,
1847 ColPartition* part,
1848 ColPartitionGrid* part_grid) {
1849 part->ComputeLimits();
1850 TBOX box = part->bounding_box();
1851 bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1852 box.bottom());
1853 int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1854 // Override value if pageseg_mode disagrees.
1855 if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1856 value = part->boxes_count() == 1 ? 0 : -2;
1857 } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1858 value = part->boxes_count() == 1 ? 0 : 2;
1859 }
1860 part->SetRegionAndFlowTypesFromProjectionValue(value);
1861 part->ClaimBoxes();
1862 part_grid->InsertBBox(true, true, part);
1863}
1864
1865// Merge partitions where the merge appears harmless.
1866// As this
1867void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1868 part_grid->Merges(
1869 NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
1870 NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
1871}
1872
1873// Compute a search box based on the orientation of the partition.
1874// Returns true if a suitable box can be calculated.
1875// Callback for EasyMerges.
1876bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1877 if (part->IsVerticalType()) {
1878 box->set_top(box->top() + box->width());
1879 box->set_bottom(box->bottom() - box->width());
1880 } else {
1881 box->set_left(box->left() - box->height());
1882 box->set_right(box->right() + box->height());
1883 }
1884 return true;
1885}
1886
1887// Merge confirmation callback for EasyMerges.
1888bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1889 const ColPartition* p2) {
1890 ASSERT_HOST(p1 != nullptr && p2 != nullptr);
1891 ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1892 if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1893 (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1894 return false; // Don't merge confirmed image with text.
1895 if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1896 p1->HCoreOverlap(*p2) <= 0 &&
1897 ((!p1->IsSingleton() &&
1898 !p2->IsSingleton()) ||
1899 !p1->bounding_box().major_overlap(p2->bounding_box())))
1900 return false; // Overlap must be in the text line.
1901 if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1902 p1->VCoreOverlap(*p2) <= 0 &&
1903 ((!p1->IsSingleton() &&
1904 !p2->IsSingleton()) ||
1905 (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1906 !p1->OKDiacriticMerge(*p2, false) &&
1907 !p2->OKDiacriticMerge(*p1, false))))
1908 return false; // Overlap must be in the text line.
1909 if (!p1->ConfirmNoTabViolation(*p2))
1910 return false;
1911 if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1912 return true;
1913 return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1914}
1915
1916// Returns true if there is no significant noise in between the boxes.
1917bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1918 return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1919 nontext_map_);
1920}
1921
1925ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
1926 int x, int y) {
1927 ScrollView* window = nullptr;
1928#ifndef GRAPHICS_DISABLED
1929 window = MakeWindow(x, y, window_name);
1930 // For every blob in the grid, display it.
1931 window->Brush(ScrollView::NONE);
1932
1933 // For every bbox in the grid, display it.
1934 BlobGridSearch gsearch(this);
1935 gsearch.StartFullSearch();
1936 BLOBNBOX* bbox;
1937 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1938 const TBOX& box = bbox->bounding_box();
1939 int left_x = box.left();
1940 int right_x = box.right();
1941 int top_y = box.top();
1942 int bottom_y = box.bottom();
1943 int goodness = bbox->GoodTextBlob();
1944 BlobRegionType blob_type = bbox->region_type();
1945 if (bbox->UniquelyVertical())
1946 blob_type = BRT_VERT_TEXT;
1947 if (bbox->UniquelyHorizontal())
1948 blob_type = BRT_TEXT;
1949 BlobTextFlowType flow = bbox->flow();
1950 if (flow == BTFT_NONE) {
1951 if (goodness == 0)
1952 flow = BTFT_NEIGHBOURS;
1953 else if (goodness == 1)
1954 flow = BTFT_CHAIN;
1955 else
1956 flow = BTFT_STRONG_CHAIN;
1957 }
1958 window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1959 window->Rectangle(left_x, bottom_y, right_x, top_y);
1960 }
1961 window->Update();
1962#endif
1963 return window;
1964}
1965
1966static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
1967#ifndef GRAPHICS_DISABLED
1968 const TBOX& blob_box(blob->bounding_box());
1969 int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());
1970 int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());
1971 int x = (blob_box.left() + blob_box.right()) / 2;
1972 window->Line(x, top, x, bottom);
1973#endif // GRAPHICS_DISABLED
1974}
1975
1976// Displays blobs colored according to whether or not they are diacritics.
1977ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1978 int x, int y, TO_BLOCK* block) {
1979 ScrollView* window = nullptr;
1980#ifndef GRAPHICS_DISABLED
1981 window = MakeWindow(x, y, window_name);
1982 // For every blob in the grid, display it.
1983 window->Brush(ScrollView::NONE);
1984
1985 BLOBNBOX_IT it(&block->blobs);
1986 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1987 BLOBNBOX* blob = it.data();
1988 if (blob->IsDiacritic()) {
1989 window->Pen(ScrollView::GREEN);
1990 DrawDiacriticJoiner(blob, window);
1991 } else {
1992 window->Pen(blob->BoxColor());
1993 }
1994 const TBOX& box = blob->bounding_box();
1995 window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
1996 }
1997 it.set_to_list(&block->noise_blobs);
1998 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1999 BLOBNBOX* blob = it.data();
2000 if (blob->IsDiacritic()) {
2001 window->Pen(ScrollView::GREEN);
2002 DrawDiacriticJoiner(blob, window);
2003 } else {
2004 window->Pen(ScrollView::WHITE);
2005 }
2006 const TBOX& box = blob->bounding_box();
2007 window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
2008 }
2009 window->Update();
2010#endif
2011 return window;
2012}
2013
2014} // namespace tesseract.
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:106
BlobNeighbourDir
Definition: blobbox.h:87
@ BND_COUNT
Definition: blobbox.h:92
@ BND_ABOVE
Definition: blobbox.h:91
@ BND_LEFT
Definition: blobbox.h:88
@ BND_BELOW
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:90
BlobTextFlowType
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:121
@ BTFT_NONE
Definition: blobbox.h:115
@ BTFT_CHAIN
Definition: blobbox.h:118
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:119
@ BTFT_NEIGHBOURS
Definition: blobbox.h:117
@ BTFT_NONTEXT
Definition: blobbox.h:116
BlobRegionType
Definition: blobbox.h:72
@ BRT_TEXT
Definition: blobbox.h:80
@ BRT_HLINE
Definition: blobbox.h:74
@ BRT_VLINE
Definition: blobbox.h:75
@ BRT_UNKNOWN
Definition: blobbox.h:78
@ BRT_NOISE
Definition: blobbox.h:73
@ BRT_VERT_TEXT
Definition: blobbox.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:88
int IntCastRounded(double x)
Definition: helpers.h:175
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
#define INT_VAR(name, val, comment)
Definition: params.h:303
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int textord_debug_tabfind
Definition: alignedblob.cpp:27
@ SVET_DESTROY
Definition: scrollview.h:46
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:79
const int kLineResiduePadRatio
Definition: strokewidth.cpp:96
PartitionFindResult
Definition: strokewidth.h:46
const double kNoiseOverlapAreaFactor
const int kCJKMaxComponents
Definition: strokewidth.cpp:59
@ PSM_SINGLE_BLOCK_VERT_TEXT
aligned text.
Definition: publictypes.h:170
@ PSM_SINGLE_COLUMN
Assume a single column of text of variable sizes.
Definition: publictypes.h:169
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:76
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:57
const int kLineTrapLongest
Definition: strokewidth.cpp:87
const double kCJKAspectRatio
Definition: strokewidth.cpp:61
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:49
const double kNoiseOverlapGrowthFactor
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:63
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:936
const int kCJKRadius
Definition: strokewidth.cpp:55
const double kNeighbourSearchFactor
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:94
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:67
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:65
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:70
const double kLineResidueSizeRatio
Definition: strokewidth.cpp:98
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:82
const int kLineTrapShortest
Definition: strokewidth.cpp:89
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:44
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:51
const double kStrokeWidthCJK
Definition: strokewidth.cpp:52
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:73
const float kSizeRatioToReject
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:92
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:31
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:200
void set_leader_on_right(bool flag)
Definition: blobbox.h:367
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:373
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:402
bool leader_on_right() const
Definition: blobbox.h:364
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:252
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:292
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:305
float vert_stroke_width() const
Definition: blobbox.h:343
bool leader_on_left() const
Definition: blobbox.h:358
float horz_stroke_width() const
Definition: blobbox.h:337
void compute_bounding_box()
Definition: blobbox.h:240
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:103
void set_owns_cblob(bool value)
Definition: blobbox.h:408
BlobRegionType region_type() const
Definition: blobbox.h:283
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:71
void set_horz_possible(bool value)
Definition: blobbox.h:310
void set_vert_possible(bool value)
Definition: blobbox.h:304
bool vert_possible() const
Definition: blobbox.h:301
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:333
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:370
float area_stroke_width() const
Definition: blobbox.h:349
int right_rule() const
Definition: blobbox.h:319
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:481
void set_leader_on_left(bool flag)
Definition: blobbox.h:361
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:405
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:444
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:298
bool IsDiacritic() const
Definition: blobbox.h:380
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:376
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:286
int base_char_bottom() const
Definition: blobbox.h:386
int base_char_top() const
Definition: blobbox.h:383
const TBOX & bounding_box() const
Definition: blobbox.h:230
C_BLOB * cblob() const
Definition: blobbox.h:268
tesseract::ColPartition * owner() const
Definition: blobbox.h:352
int GoodTextBlob() const
Definition: blobbox.cpp:226
bool joined_to_prev() const
Definition: blobbox.h:256
BlobTextFlowType flow() const
Definition: blobbox.h:295
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:181
void ClearNeighbours()
Definition: blobbox.h:499
bool UniquelyHorizontal() const
Definition: blobbox.h:413
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:355
bool horz_possible() const
Definition: blobbox.h:307
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:430
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:398
bool UniquelyVertical() const
Definition: blobbox.h:410
BLOBNBOX_LIST blobs
Definition: blobbox.h:772
void DeleteUnownedNoise()
Definition: blobbox.cpp:1037
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:774
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:775
integer coordinate
Definition: points.h:32
Definition: points.h:189
float y() const
Definition: points.h:210
void set_y(float yin)
rewrite function
Definition: points.h:218
void set_x(float xin)
rewrite function
Definition: points.h:214
float x() const
Definition: points.h:207
Definition: rect.h:34
void set_right(int x)
Definition: rect.h:82
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
void set_bottom(int y)
Definition: rect.h:68
int16_t width() const
Definition: rect.h:115
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
bool overlap(const TBOX &box) const
Definition: rect.h:355
void set_top(int y)
Definition: rect.h:61
bool y_overlap(const TBOX &box) const
Definition: rect.h:428
int16_t left() const
Definition: rect.h:72
int y_gap(const TBOX &box) const
Definition: rect.h:233
int16_t bottom() const
Definition: rect.h:65
bool contains(const FCOORD pt) const
Definition: rect.h:333
void pad(int xpad, int ypad)
Definition: rect.h:131
void set_left(int x)
Definition: rect.h:75
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:439
int x_gap(const TBOX &box) const
Definition: rect.h:225
int16_t right() const
Definition: rect.h:79
Definition: statistc.h:31
int32_t area()
Definition: stepblob.cpp:273
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
int32_t perimeter()
Definition: stepblob.cpp:292
static bool WithinTestRegion(int detail_level, int x, int y)
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:698
BBC * NextRectSearch()
Definition: bbgrid.h:842
void StartFullSearch()
Definition: bbgrid.h:665
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:830
BBC * NextFullSearch()
Definition: bbgrid.h:675
BBC * NextRadSearch()
Definition: bbgrid.h:713
int gridsize() const
Definition: bbgrid.h:63
const ICOORD & bleft() const
Definition: bbgrid.h:72
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:52
const ICOORD & tright() const
Definition: bbgrid.h:75
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:445
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:655
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:36
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
void AddBox(BLOBNBOX *box)
bool IsVerticalType() const
Definition: colpartition.h:442
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Pix *pix)
Definition: imagefind.cpp:576
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
void HandleClick(int x, int y) override
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void RemoveLineResidue(ColPartition_LIST *big_part_list)
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Pix *nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:413
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Pix *nontext_map)
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
static void Update()
Definition: scrollview.cpp:709
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:532
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600
void Brush(Color color)
Definition: scrollview.cpp:725