tesseract 4.1.1
Loading...
Searching...
No Matches
devanagari_processing.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: devanagari_processing.cpp
3 * Description: Methods to process images containing devanagari symbols,
4 * prior to classification.
5 * Author: Shobhit Saxena
6 * Created: Mon Nov 17 20:26:01 IST 2008
7 *
8 * (C) Copyright 2008, Google Inc.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21#ifdef HAVE_CONFIG_H
22#include "config_auto.h"
23#endif
24
25#include "allheaders.h"
26#include "debugpixa.h"
28#include "statistc.h"
29#include "tordmain.h"
30
31// Flags controlling the debugging information for shiro-rekha splitting
32// strategies.
34 "Debug level for split shiro-rekha process.");
35
37 "Whether to create a debug image for split shiro-rekha process.");
38
39namespace tesseract {
40
42 orig_pix_ = nullptr;
43 segmentation_block_list_ = nullptr;
44 splitted_image_ = nullptr;
45 global_xheight_ = kUnspecifiedXheight;
46 perform_close_ = false;
47 debug_image_ = nullptr;
48 pageseg_split_strategy_ = NO_SPLIT;
49 ocr_split_strategy_ = NO_SPLIT;
50}
51
53 Clear();
54}
55
57 pixDestroy(&orig_pix_);
58 pixDestroy(&splitted_image_);
59 pageseg_split_strategy_ = NO_SPLIT;
60 ocr_split_strategy_ = NO_SPLIT;
61 pixDestroy(&debug_image_);
62 segmentation_block_list_ = nullptr;
63 global_xheight_ = kUnspecifiedXheight;
64 perform_close_ = false;
65}
66
67// On setting the input image, a clone of it is owned by this class.
69 if (orig_pix_) {
70 pixDestroy(&orig_pix_);
71 }
72 orig_pix_ = pixClone(pix);
73}
74
75// Top-level method to perform splitting based on current settings.
76// Returns true if a split was actually performed.
77// split_for_pageseg should be true if the splitting is being done prior to
78// page segmentation. This mode uses the flag
79// pageseg_devanagari_split_strategy to determine the splitting strategy.
80bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa* pixa_debug) {
81 SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
82 ocr_split_strategy_;
83 if (split_strategy == NO_SPLIT) {
84 return false; // Nothing to do.
85 }
86 ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
87 split_strategy == MAXIMAL_SPLIT);
88 ASSERT_HOST(orig_pix_);
90 tprintf("Splitting shiro-rekha ...\n");
91 tprintf("Split strategy = %s\n",
92 split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
93 tprintf("Initial pageseg available = %s\n",
94 segmentation_block_list_ ? "yes" : "no");
95 }
96 // Create a copy of original image to store the splitting output.
97 pixDestroy(&splitted_image_);
98 splitted_image_ = pixCopy(nullptr, orig_pix_);
99
100 // Initialize debug image if required.
102 pixDestroy(&debug_image_);
103 debug_image_ = pixConvertTo32(orig_pix_);
104 }
105
106 // Determine all connected components in the input image. A close operation
107 // may be required prior to this, depending on the current settings.
108 Pix* pix_for_ccs = pixClone(orig_pix_);
109 if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
110 !segmentation_block_list_) {
112 tprintf("Performing a global close operation..\n");
113 }
114 // A global measure is available for xheight, but no local information
115 // exists.
116 pixDestroy(&pix_for_ccs);
117 pix_for_ccs = pixCopy(nullptr, orig_pix_);
118 PerformClose(pix_for_ccs, global_xheight_);
119 }
120 Pixa* ccs;
121 Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
122 boxaDestroy(&tmp_boxa);
123 pixDestroy(&pix_for_ccs);
124
125 // Iterate over all connected components. Get their bounding boxes and clip
126 // out the image regions corresponding to these boxes from the original image.
127 // Conditionally run splitting on each of them.
128 Boxa* regions_to_clear = boxaCreate(0);
129 int num_ccs = 0;
130 if (ccs != nullptr) num_ccs = pixaGetCount(ccs);
131 for (int i = 0; i < num_ccs; ++i) {
132 Box* box = ccs->boxa->box[i];
133 Pix* word_pix = pixClipRectangle(orig_pix_, box, nullptr);
134 ASSERT_HOST(word_pix);
135 int xheight = GetXheightForCC(box);
136 if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
138 pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
139 }
140 // If some xheight measure is available, attempt to pre-eliminate small
141 // blobs from the shiro-rekha process. This is primarily to save the CCs
142 // corresponding to punctuation marks/small dots etc which are part of
143 // larger graphemes.
144 if (xheight == kUnspecifiedXheight ||
145 (box->w > xheight / 3 && box->h > xheight / 2)) {
146 SplitWordShiroRekha(split_strategy, word_pix, xheight,
147 box->x, box->y, regions_to_clear);
148 } else if (devanagari_split_debuglevel > 0) {
149 tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
150 box->x, box->y, box->w, box->h);
151 }
152 pixDestroy(&word_pix);
153 }
154 // Actually clear the boxes now.
155 for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
156 Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
157 pixClearInRect(splitted_image_, box);
158 boxDestroy(&box);
159 }
160 boxaDestroy(&regions_to_clear);
161 pixaDestroy(&ccs);
162 if (devanagari_split_debugimage && pixa_debug != nullptr) {
163 pixa_debug->AddPix(debug_image_,
164 split_for_pageseg ? "pageseg_split" : "ocr_split");
165 }
166 return true;
167}
168
169// Method to perform a close operation on the input image. The xheight
170// estimate decides the size of sel used.
171void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
172 pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
173}
174
175// This method resolves the cc bbox to a particular row and returns the row's
176// xheight.
177int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
178 if (!segmentation_block_list_) {
179 return global_xheight_;
180 }
181 // Compute the box coordinates in Tesseract's coordinate system.
182 TBOX bbox(cc_bbox->x,
183 pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
184 cc_bbox->x + cc_bbox->w,
185 pixGetHeight(orig_pix_) - cc_bbox->y - 1);
186 // Iterate over all blocks.
187 BLOCK_IT block_it(segmentation_block_list_);
188 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
189 BLOCK* block = block_it.data();
190 // Iterate over all rows in the block.
191 ROW_IT row_it(block->row_list());
192 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
193 ROW* row = row_it.data();
194 if (!row->bounding_box().major_overlap(bbox)) {
195 continue;
196 }
197 // Row could be skewed, warped, etc. Use the position of the box to
198 // determine the baseline position of the row for that x-coordinate.
199 // Create a square TBOX whose baseline's mid-point lies at this point
200 // and side is row's xheight. Take the overlap of this box with the input
201 // box and check if it is a 'major overlap'. If so, this box lies in this
202 // row. In that case, return the xheight for this row.
203 float box_middle = 0.5 * (bbox.left() + bbox.right());
204 int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
205 TBOX test_box(box_middle - row->x_height() / 2,
206 baseline,
207 box_middle + row->x_height() / 2,
208 static_cast<int>(baseline + row->x_height()));
209 // Compute overlap. If it is is a major overlap, this is the right row.
210 if (bbox.major_overlap(test_box)) {
211 return row->x_height();
212 }
213 }
214 }
215 // No row found for this bbox.
216 return kUnspecifiedXheight;
217}
218
219// Returns a list of regions (boxes) which should be cleared in the original
220// image so as to perform shiro-rekha splitting. Pix is assumed to carry one
221// (or less) word only. Xheight measure could be the global estimate, the row
222// estimate, or unspecified. If unspecified, over splitting may occur, since a
223// conservative estimate of stroke width along with an associated multiplier
224// is used in its place. It is advisable to have a specified xheight when
225// splitting for classification/training.
226// A vertical projection histogram of all the on-pixels in the input pix is
227// computed. The maxima of this histogram is regarded as an approximate location
228// of the shiro-rekha. By descending on the maxima's peak on both sides,
229// stroke width of shiro-rekha is estimated.
230// A horizontal projection histogram is computed for a sub-image of the input
231// image, which extends from just below the shiro-rekha down to a certain
232// leeway. The leeway depends on the input xheight, if provided, else a
233// conservative multiplier on approximate stroke width is used (which may lead
234// to over-splitting).
235void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
236 Pix* pix,
237 int xheight,
238 int word_left,
239 int word_top,
240 Boxa* regions_to_clear) {
241 if (split_strategy == NO_SPLIT) {
242 return;
243 }
244 int width = pixGetWidth(pix);
245 int height = pixGetHeight(pix);
246 // Statistically determine the yextents of the shiro-rekha.
247 int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
248 GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
249 &shirorekha_ylevel);
250 // Since the shiro rekha is also a stroke, its width is equal to the stroke
251 // width.
252 int stroke_width = shirorekha_bottom - shirorekha_top + 1;
253
254 // Some safeguards to protect CCs we do not want to be split.
255 // These are particularly useful when the word wasn't eliminated earlier
256 // because xheight information was unavailable.
257 if (shirorekha_ylevel > height / 2) {
258 // Shirorekha shouldn't be in the bottom half of the word.
260 tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
261 word_left, word_top);
262 }
263 return;
264 }
265 if (stroke_width > height / 3) {
266 // Even the boldest of fonts shouldn't do this.
268 tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
269 word_left, word_top);
270 }
271 return;
272 }
273
274 // Clear the ascender and descender regions of the word.
275 // Obtain a vertical projection histogram for the resulting image.
276 Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
277 width, 5 * stroke_width / 3);
278 Pix* word_in_xheight = pixCopy(nullptr, pix);
279 pixClearInRect(word_in_xheight, box_to_clear);
280 // Also clear any pixels which are below shirorekha_bottom + some leeway.
281 // The leeway is set to xheight if the information is available, else it is a
282 // multiplier applied to the stroke width.
283 int leeway_to_keep = stroke_width * 3;
284 if (xheight != kUnspecifiedXheight) {
285 // This is because the xheight-region typically includes the shiro-rekha
286 // inside it, i.e., the top of the xheight range corresponds to the top of
287 // shiro-rekha.
288 leeway_to_keep = xheight - stroke_width;
289 }
290 box_to_clear->y = shirorekha_bottom + leeway_to_keep;
291 box_to_clear->h = height - box_to_clear->y;
292 pixClearInRect(word_in_xheight, box_to_clear);
293 boxDestroy(&box_to_clear);
294
295 PixelHistogram vert_hist;
296 vert_hist.ConstructVerticalCountHist(word_in_xheight);
297 pixDestroy(&word_in_xheight);
298
299 // If the number of black pixel in any column of the image is less than a
300 // fraction of the stroke width, treat it as noise / a stray mark. Perform
301 // these changes inside the vert_hist data itself, as that is used later on as
302 // a bit vector for the final split decision at every column.
303 for (int i = 0; i < width; ++i) {
304 if (vert_hist.hist()[i] <= stroke_width / 4)
305 vert_hist.hist()[i] = 0;
306 else
307 vert_hist.hist()[i] = 1;
308 }
309 // In order to split the line at any point, we make sure that the width of the
310 // gap is at least half the stroke width.
311 int i = 0;
312 int cur_component_width = 0;
313 while (i < width) {
314 if (!vert_hist.hist()[i]) {
315 int j = 0;
316 while (i + j < width && !vert_hist.hist()[i+j])
317 ++j;
318 if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
319 // Perform a shiro-rekha split. The intervening region lies from i to
320 // i+j-1.
321 // A minimal single-pixel split makes the estimation of intra- and
322 // inter-word spacing easier during page layout analysis,
323 // whereas a maximal split may be needed for OCR, depending on
324 // how the engine was trained.
325 bool minimal_split = (split_strategy == MINIMAL_SPLIT);
326 int split_width = minimal_split ? 1 : j;
327 int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
328 if (!minimal_split || (i != 0 && i + j != width)) {
329 Box* box_to_clear =
330 boxCreate(word_left + split_left,
331 word_top + shirorekha_top - stroke_width / 3,
332 split_width,
333 5 * stroke_width / 3);
334 if (box_to_clear) {
335 boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
336 // Mark this in the debug image if needed.
338 pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
339 }
340 boxDestroy(&box_to_clear);
341 cur_component_width = 0;
342 }
343 }
344 }
345 i += j;
346 } else {
347 ++i;
348 ++cur_component_width;
349 }
350 }
351}
352
353// Refreshes the words in the segmentation block list by using blobs in the
354// input block list.
355// The segmentation block list must be set.
357 C_BLOB_LIST* new_blobs) {
358 // The segmentation block list must have been specified.
359 ASSERT_HOST(segmentation_block_list_);
361 tprintf("Before refreshing blobs:\n");
362 PrintSegmentationStats(segmentation_block_list_);
363 tprintf("New Blobs found: %d\n", new_blobs->length());
364 }
365
366 C_BLOB_LIST not_found_blobs;
367 RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
368 new_blobs,
369 ((devanagari_split_debugimage && debug_image_) ?
370 &not_found_blobs : nullptr));
371
373 tprintf("After refreshing blobs:\n");
374 PrintSegmentationStats(segmentation_block_list_);
375 }
376 if (devanagari_split_debugimage && debug_image_) {
377 // Plot out the original blobs for which no match was found in the new
378 // all_blobs list.
379 C_BLOB_IT not_found_it(&not_found_blobs);
380 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
381 not_found_it.forward()) {
382 C_BLOB* not_found = not_found_it.data();
383 TBOX not_found_box = not_found->bounding_box();
384 Box* box_to_plot = GetBoxForTBOX(not_found_box);
385 pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
386 boxDestroy(&box_to_plot);
387 }
388
389 // Plot out the blobs unused from all blobs.
390 C_BLOB_IT all_blobs_it(new_blobs);
391 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
392 all_blobs_it.forward()) {
393 C_BLOB* a_blob = all_blobs_it.data();
394 Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
395 pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
396 boxDestroy(&box_to_plot);
397 }
398 }
399}
400
401// Returns a new box object for the corresponding TBOX, based on the original
402// image's coordinate system.
403Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
404 return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
405 tbox.width(), tbox.height());
406}
407
408// This method returns the computed mode-height of blobs in the pix.
409// It also prunes very small blobs from calculation.
411 Boxa* boxa = pixConnComp(pix, nullptr, 8);
412 STATS heights(0, pixGetHeight(pix));
413 heights.clear();
414 for (int i = 0; i < boxaGetCount(boxa); ++i) {
415 Box* box = boxaGetBox(boxa, i, L_CLONE);
416 if (box->h >= 3 || box->w >= 3) {
417 heights.add(box->h, 1);
418 }
419 boxDestroy(&box);
420 }
421 boxaDestroy(&boxa);
422 return heights.mode();
423}
424
425// This method returns y-extents of the shiro-rekha computed from the input
426// word image.
427void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
428 int* shirorekha_top,
429 int* shirorekha_bottom,
430 int* shirorekha_ylevel) {
431 // Compute a histogram from projecting the word on a vertical line.
432 PixelHistogram hist_horiz;
433 hist_horiz.ConstructHorizontalCountHist(word_pix);
434 // Get the ylevel where the top-line exists. This is basically the global
435 // maxima in the horizontal histogram.
436 int topline_onpixel_count = 0;
437 int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
438
439 // Get the upper and lower extents of the shiro rekha.
440 int thresh = (topline_onpixel_count * 70) / 100;
441 int ulimit = topline_ylevel;
442 int llimit = topline_ylevel;
443 while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
444 --ulimit;
445 while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh)
446 ++llimit;
447
448 if (shirorekha_top) *shirorekha_top = ulimit;
449 if (shirorekha_bottom) *shirorekha_bottom = llimit;
450 if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
451}
452
453// This method returns the global-maxima for the histogram. The frequency of
454// the global maxima is returned in count, if specified.
456 int best_value = 0;
457 for (int i = 0; i < length_; ++i) {
458 if (hist_[i] > hist_[best_value]) {
459 best_value = i;
460 }
461 }
462 if (count) {
463 *count = hist_[best_value];
464 }
465 return best_value;
466}
467
468// Methods to construct histograms from images.
470 Clear();
471 int width = pixGetWidth(pix);
472 int height = pixGetHeight(pix);
473 hist_ = new int[width];
474 length_ = width;
475 int wpl = pixGetWpl(pix);
476 l_uint32 *data = pixGetData(pix);
477 for (int i = 0; i < width; ++i)
478 hist_[i] = 0;
479 for (int i = 0; i < height; ++i) {
480 l_uint32 *line = data + i * wpl;
481 for (int j = 0; j < width; ++j)
482 if (GET_DATA_BIT(line, j))
483 ++(hist_[j]);
484 }
485}
486
488 Clear();
489 Numa* counts = pixCountPixelsByRow(pix, nullptr);
490 length_ = numaGetCount(counts);
491 hist_ = new int[length_];
492 for (int i = 0; i < length_; ++i) {
493 l_int32 val = 0;
494 numaGetIValue(counts, i, &val);
495 hist_[i] = val;
496 }
497 numaDestroy(&counts);
498}
499
500} // namespace tesseract.
void PrintSegmentationStats(BLOCK_LIST *block_list)
Definition: ocrblock.cpp:405
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs, C_BLOB_LIST *not_found_blobs)
Definition: ocrblock.cpp:473
#define ASSERT_HOST(x)
Definition: errcode.h:88
#define BOOL_VAR(name, val, comment)
Definition: params.h:306
#define INT_VAR(name, val, comment)
Definition: params.h:303
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ baseline
Definition: mfoutline.h:63
int count(LIST var_list)
Definition: oldlist.cpp:95
bool devanagari_split_debugimage
int devanagari_split_debuglevel
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
Definition: ocrblock.h:31
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:116
Definition: ocrrow.h:37
float base_line(float xpos) const
Definition: ocrrow.h:59
TBOX bounding_box() const
Definition: ocrrow.h:88
float x_height() const
Definition: ocrrow.h:64
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
int16_t left() const
Definition: rect.h:72
Definition: statistc.h:31
void clear()
Definition: statistc.cpp:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:93
int32_t mode() const
Definition: statistc.cpp:107
TBOX bounding_box() const
Definition: stepblob.cpp:253
int GetHistogramMaximum(int *count) const
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)