tesseract 4.1.1
Loading...
Searching...
No Matches
docqual.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: docqual.cpp (Formerly docqual.c)
3 * Description: Document Quality Metrics
4 * Author: Phil Cheatle
5 * Created: Mon May 9 11:27:28 BST 1994
6 *
7 * (C) Copyright 1994, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#include <cctype>
21#include "docqual.h"
22#include "reject.h"
23#include "tesscallback.h"
24#include "tessvars.h"
25#include "tesseractclass.h"
26
27namespace tesseract{
28
29// A little class to provide the callbacks as we have no pre-bound args.
31 explicit DocQualCallbacks(WERD_RES* word0)
32 : word(word0), match_count(0), accepted_match_count(0) {}
33
34 void CountMatchingBlobs(int index) {
36 }
37
38 void CountAcceptedBlobs(int index) {
39 if (word->reject_map[index].accepted())
42 }
43
44 void AcceptIfGoodQuality(int index) {
45 if (word->reject_map[index].accept_if_good_quality())
46 word->reject_map[index].setrej_quality_accept();
47 }
48
50 int16_t match_count;
52};
53
54/*************************************************************************
55 * word_blob_quality()
56 * How many blobs in the box_word are identical to those of the inword?
57 * ASSUME blobs in both initial word and box_word are in ascending order of
58 * left hand blob edge.
59 *************************************************************************/
61 if (word->bln_boxes == nullptr ||
62 word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
63 return 0;
64
65 DocQualCallbacks cb(word);
67 *word->rebuild_word,
69 return cb.match_count;
70}
71
73 int16_t i = 0;
74 int16_t err_count = 0;
75
76 if (word->rebuild_word != nullptr) {
77 for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
78 TBLOB* blob = word->rebuild_word->blobs[b];
79 err_count += count_outline_errs(word->best_choice->unichar_string()[i],
80 blob->NumOutlines());
81 i++;
82 }
83 }
84 return err_count;
85}
86
87/*************************************************************************
88 * word_char_quality()
89 * Combination of blob quality and outline quality: how many good chars are
90 * there, i.e. chars which pass both the blob AND the outline tests?
91 *************************************************************************/
93 ROW *row,
94 int16_t *match_count,
95 int16_t *accepted_match_count) {
96 if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
97 word->rebuild_word->blobs.empty()) {
98 *match_count = 0;
99 *accepted_match_count = 0;
100 return;
101 }
102
103 DocQualCallbacks cb(word);
105 *word->rebuild_word,
107 *match_count = cb.match_count;
108 *accepted_match_count = cb.accepted_match_count;
109}
110
111/*************************************************************************
112 * unrej_good_chs()
113 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
114 *************************************************************************/
116 if (word->bln_boxes == nullptr ||
117 word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
118 return;
119
120 DocQualCallbacks cb(word);
122 *word->rebuild_word,
124}
125
126int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
127 int expected_outline_count;
128
129 if (STRING (outlines_odd).contains (c))
130 return 0; // Don't use this char
131 else if (STRING (outlines_2).contains (c))
132 expected_outline_count = 2;
133 else
134 expected_outline_count = 1;
135 return abs (outline_count - expected_outline_count);
136}
137
139 bool good_quality_doc) {
140 if ((tessedit_good_quality_unrej && good_quality_doc))
141 unrej_good_quality_words(page_res_it);
142 doc_and_block_rejection(page_res_it, good_quality_doc);
144 tilde_crunch(page_res_it);
145 tilde_delete(page_res_it);
146 }
147}
148
149/*************************************************************************
150 * unrej_good_quality_words()
151 * Accept potential rejects in words which pass the following checks:
152 * - Contains a potential reject
153 * - Word looks like a sensible alpha word.
154 * - Word segmentation is the same as the original image
155 * - All characters have the expected number of outlines
156 * NOTE - the rejection counts are recalculated after unrejection
157 * - CAN'T do it in a single pass without a bit of fiddling
158 * - keep it simple but inefficient
159 *************************************************************************/
// Two passes over the page held by page_res_it:
//   1. walk every word and accept potential rejects, either unconditionally
//      (bland_unrej) or only on rows whose reject ratio is low enough;
//   2. zero the page, block and row counters and call rej_stat_word() for
//      every word so the statistics reflect the unrejection.
// page_res_it is restarted before each pass and is left past the last word.
160void Tesseract::unrej_good_quality_words( //unreject potential
161 PAGE_RES_IT &page_res_it) {
162 WERD_RES *word;
163 ROW_RES *current_row;
164 BLOCK_RES *current_block;
165 int i;
166
// Pass 1: unreject.
167 page_res_it.restart_page ();
168 while (page_res_it.word () != nullptr) {
169 check_debug_pt (page_res_it.word (), 100);
170 if (bland_unrej) {
// Blanket mode: accept every reject-map entry that reports
// accept_if_good_quality(), with no row or word level screening.
171 word = page_res_it.word ();
172 for (i = 0; i < word->reject_map.length (); i++) {
173 if (word->reject_map[i].accept_if_good_quality ())
174 word->reject_map[i].setrej_quality_accept ();
175 }
176 page_res_it.forward ();
177 }
// Row screening: the char_count > 0 test also protects the division below.
178 else if ((page_res_it.row ()->char_count > 0) &&
179 ((page_res_it.row ()->rej_count /
180 static_cast<float>(page_res_it.row ()->char_count)) <=
// NOTE(review): listing lines 181 and 183-187 are missing here (the ratio
// threshold and most of the per-word acceptance test that ends in
// "!= AC_UNACCEPTABLE"). Restore them from the upstream file before building.
182 word = page_res_it.word ();
188 != AC_UNACCEPTABLE)) {
189 unrej_good_chs(word, page_res_it.row ()->row);
190 }
191 page_res_it.forward ();
192 }
193 else {
194 /* Skip to end of dodgy row */
195 current_row = page_res_it.row ();
196 while ((page_res_it.word () != nullptr) &&
197 (page_res_it.row () == current_row))
198 page_res_it.forward ();
199 }
200 check_debug_pt (page_res_it.word (), 110);
201 }
// Pass 2: the counters are stale after unrejection, so reset them and
// rebuild them word by word.
202 page_res_it.restart_page ();
203 page_res_it.page_res->char_count = 0;
204 page_res_it.page_res->rej_count = 0;
205 current_block = nullptr;
206 current_row = nullptr;
207 while (page_res_it.word () != nullptr) {
// Zero a block's counters the first time one of its words is reached.
208 if (current_block != page_res_it.block ()) {
209 current_block = page_res_it.block ();
210 current_block->char_count = 0;
211 current_block->rej_count = 0;
212 }
// Likewise for each new row.
213 if (current_row != page_res_it.row ()) {
214 current_row = page_res_it.row ();
215 current_row->char_count = 0;
216 current_row->rej_count = 0;
217 current_row->whole_word_rej_count = 0;
218 }
// Presumably adds this word's counts into the page, block and row totals
// that were just zeroed - confirm against PAGE_RES_IT::rej_stat_word().
219 page_res_it.rej_stat_word ();
220 page_res_it.forward ();
221 }
222}
223
224
225/*************************************************************************
226 * doc_and_block_rejection()
227 *
228 * If the page has too many rejects - reject all of it.
229 * If any block has too many rejects - reject all words in the block
230 *************************************************************************/
231
232void Tesseract::doc_and_block_rejection( //reject big chunks
233 PAGE_RES_IT &page_res_it,
234 bool good_quality_doc) {
235 int16_t block_no = 0;
236 int16_t row_no = 0;
237 BLOCK_RES *current_block;
238 ROW_RES *current_row;
239
240 bool rej_word;
241 bool prev_word_rejected;
242 int16_t char_quality = 0;
243 int16_t accepted_char_quality;
244
245 if (page_res_it.page_res->rej_count * 100.0 /
247 reject_whole_page(page_res_it);
249 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
250 page_res_it.page_res->char_count,
251 page_res_it.page_res->rej_count);
252 }
253 } else {
255 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
256 page_res_it.page_res->char_count,
257 page_res_it.page_res->rej_count);
258 }
259
260 /* Walk blocks testing for block rejection */
261
262 page_res_it.restart_page();
263 WERD_RES* word;
264 while ((word = page_res_it.word()) != nullptr) {
265 current_block = page_res_it.block();
266 block_no = current_block->block->pdblk.index();
267 if (current_block->char_count > 0 &&
268 (current_block->rej_count * 100.0 / current_block->char_count) >
271 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
272 block_no, current_block->char_count,
273 current_block->rej_count);
274 }
275 prev_word_rejected = false;
276 while ((word = page_res_it.word()) != nullptr &&
277 (page_res_it.block() == current_block)) {
279 rej_word = word->reject_map.reject_count() > 0 ||
281 if (rej_word && tessedit_dont_blkrej_good_wds &&
284 *word->uch_set,
286 word->best_choice->unichar_lengths().string()) !=
288 word_char_quality(word, page_res_it.row()->row,
289 &char_quality,
290 &accepted_char_quality);
291 rej_word = char_quality != word->reject_map.length();
292 }
293 } else {
294 rej_word = true;
295 }
296 if (rej_word) {
297 /*
298 Reject spacing if both current and prev words are rejected.
299 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
300 generated more space errors.
301 */
303 prev_word_rejected &&
304 page_res_it.prev_row() == page_res_it.row() &&
305 word->word->space() == 1)
306 word->reject_spaces = true;
308 }
309 prev_word_rejected = rej_word;
310 page_res_it.forward();
311 }
312 } else {
314 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
315 block_no, page_res_it.block()->char_count,
316 page_res_it.block()->rej_count);
317 }
318
319 /* Walk rows in block testing for row rejection */
320 row_no = 0;
321 while (page_res_it.word() != nullptr &&
322 page_res_it.block() == current_block) {
323 current_row = page_res_it.row();
324 row_no++;
325 /* Reject whole row if:
326 fraction of chars on row which are rejected exceed a limit AND
327 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
328 limit
329 */
330 if (current_row->char_count > 0 &&
331 (current_row->rej_count * 100.0 / current_row->char_count) >
333 (current_row->whole_word_rej_count * 100.0 /
334 current_row->rej_count) <
337 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
338 row_no, current_row->char_count,
339 current_row->rej_count);
340 }
341 prev_word_rejected = false;
342 while ((word = page_res_it.word()) != nullptr &&
343 page_res_it.row () == current_row) {
344 /* Preserve words on good docs unless they are mostly rejected*/
345 if (!tessedit_row_rej_good_docs && good_quality_doc) {
346 rej_word = word->reject_map.reject_count() /
347 static_cast<float>(word->reject_map.length()) >
350 /* Preserve perfect words anyway */
351 rej_word = word->reject_map.reject_count() > 0 ||
353 if (rej_word && tessedit_dont_rowrej_good_wds &&
357 word->best_choice->unichar_lengths().string()) !=
359 word_char_quality(word, page_res_it.row()->row,
360 &char_quality,
361 &accepted_char_quality);
362 rej_word = char_quality != word->reject_map.length();
363 }
364 } else {
365 rej_word = true;
366 }
367 if (rej_word) {
368 /*
369 Reject spacing if both current and prev words are rejected.
370 NOTE - this is NOT restricted to FUZZY spaces. - When tried
371 this generated more space errors.
372 */
374 prev_word_rejected &&
375 page_res_it.prev_row() == page_res_it.row() &&
376 word->word->space () == 1)
377 word->reject_spaces = true;
379 }
380 prev_word_rejected = rej_word;
381 page_res_it.forward();
382 }
383 } else {
385 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
386 row_no, current_row->char_count, current_row->rej_count);
387 }
388 while (page_res_it.word() != nullptr &&
389 page_res_it.row() == current_row)
390 page_res_it.forward();
391 }
392 }
393 }
394 }
395 }
396}
397
398} // namespace tesseract
399
400/*************************************************************************
401 * reject_whole_page()
402 * Don't believe any of it - set the reject map to 00..00 in all words
403 *
404 *************************************************************************/
405
406void reject_whole_page(PAGE_RES_IT &page_res_it) {
407 page_res_it.restart_page ();
408 while (page_res_it.word () != nullptr) {
409 page_res_it.word ()->reject_map.rej_word_doc_rej ();
410 page_res_it.forward ();
411 }
412 //whole page is rejected
413 page_res_it.page_res->rejected = true;
414}
415
416namespace tesseract {
418 WERD_RES *word;
419 GARBAGE_LEVEL garbage_level;
420 PAGE_RES_IT copy_it;
421 bool prev_potential_marked = false;
422 bool found_terrible_word = false;
423 bool ok_dict_word;
424
425 page_res_it.restart_page();
426 while (page_res_it.word() != nullptr) {
427 POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
428 if (pb != nullptr && !pb->IsText()) {
429 page_res_it.forward();
430 continue;
431 }
432 word = page_res_it.word();
433
436
438 word->merge_tess_fails();
439
440 if (word->reject_map.accept_count () != 0) {
441 found_terrible_word = false;
442 //Forget earlier potential crunches
443 prev_potential_marked = false;
444 }
445 else {
446 ok_dict_word = safe_dict_word(word);
447 garbage_level = garbage_word(word, ok_dict_word);
448
449 if ((garbage_level != G_NEVER_CRUNCH) &&
450 (terrible_word_crunch (word, garbage_level))) {
451 if (crunch_debug > 0) {
452 tprintf ("T CRUNCHING: \"%s\"\n",
454 }
456 if (prev_potential_marked) {
457 while (copy_it.word () != word) {
458 if (crunch_debug > 0) {
459 tprintf ("P1 CRUNCHING: \"%s\"\n",
460 copy_it.word()->best_choice->unichar_string().string());
461 }
462 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
463 copy_it.forward ();
464 }
465 prev_potential_marked = false;
466 }
467 found_terrible_word = true;
468 }
469 else if ((garbage_level != G_NEVER_CRUNCH) &&
471 garbage_level, ok_dict_word))) {
472 if (found_terrible_word) {
473 if (crunch_debug > 0) {
474 tprintf ("P2 CRUNCHING: \"%s\"\n",
476 }
478 }
479 else if (!prev_potential_marked) {
480 copy_it = page_res_it;
481 prev_potential_marked = true;
482 if (crunch_debug > 1) {
483 tprintf ("P3 CRUNCHING: \"%s\"\n",
485 }
486 }
487 }
488 else {
489 found_terrible_word = false;
490 //Forget earlier potential crunches
491 prev_potential_marked = false;
492 if (crunch_debug > 2) {
493 tprintf ("NO CRUNCH: \"%s\"\n",
495 }
496 }
497 }
498 page_res_it.forward ();
499 }
500}
501
502
504 GARBAGE_LEVEL garbage_level) {
505 float rating_per_ch;
506 int adjusted_len;
507 int crunch_mode = 0;
508
509 if ((word->best_choice->unichar_string().length() == 0) ||
510 (strspn(word->best_choice->unichar_string().string(), " ") ==
512 crunch_mode = 1;
513 else {
514 adjusted_len = word->reject_map.length ();
515 if (adjusted_len > crunch_rating_max)
516 adjusted_len = crunch_rating_max;
517 rating_per_ch = word->best_choice->rating () / adjusted_len;
518
519 if (rating_per_ch > crunch_terrible_rating)
520 crunch_mode = 2;
521 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
522 crunch_mode = 3;
523 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
524 (garbage_level != G_OK))
525 crunch_mode = 4;
526 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
527 (garbage_level != G_OK))
528 crunch_mode = 5;
529 }
530 if (crunch_mode > 0) {
531 if (crunch_debug > 2) {
532 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
533 crunch_mode, word->best_choice->unichar_string().string());
534 }
535 return true;
536 }
537 else
538 return false;
539}
540
542 GARBAGE_LEVEL garbage_level,
543 bool ok_dict_word) {
544 float rating_per_ch;
545 int adjusted_len;
546 const char *str = word->best_choice->unichar_string().string();
547 const char *lengths = word->best_choice->unichar_lengths().string();
548 bool word_crunchable;
549 int poor_indicator_count = 0;
550
551 word_crunchable = !crunch_leave_accept_strings ||
552 word->reject_map.length() < 3 ||
554 str, lengths) == AC_UNACCEPTABLE &&
555 !ok_dict_word);
556
557 adjusted_len = word->reject_map.length();
558 if (adjusted_len > 10)
559 adjusted_len = 10;
560 rating_per_ch = word->best_choice->rating() / adjusted_len;
561
562 if (rating_per_ch > crunch_pot_poor_rate) {
563 if (crunch_debug > 2) {
564 tprintf("Potential poor rating on \"%s\"\n",
566 }
567 poor_indicator_count++;
568 }
569
570 if (word_crunchable &&
572 if (crunch_debug > 2) {
573 tprintf("Potential poor cert on \"%s\"\n",
575 }
576 poor_indicator_count++;
577 }
578
579 if (garbage_level != G_OK) {
580 if (crunch_debug > 2) {
581 tprintf("Potential garbage on \"%s\"\n",
583 }
584 poor_indicator_count++;
585 }
586 return poor_indicator_count >= crunch_pot_indicators;
587}
588
590 WERD_RES *word;
591 PAGE_RES_IT copy_it;
592 bool deleting_from_bol = false;
593 bool marked_delete_point = false;
594 int16_t debug_delete_mode;
595 CRUNCH_MODE delete_mode;
596 int16_t x_debug_delete_mode;
597 CRUNCH_MODE x_delete_mode;
598
599 page_res_it.restart_page();
600 while (page_res_it.word() != nullptr) {
601 word = page_res_it.word();
602
603 delete_mode = word_deletable (word, debug_delete_mode);
604 if (delete_mode != CR_NONE) {
605 if (word->word->flag (W_BOL) || deleting_from_bol) {
606 if (crunch_debug > 0) {
607 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
608 debug_delete_mode,
610 }
611 word->unlv_crunch_mode = delete_mode;
612 deleting_from_bol = true;
613 } else if (word->word->flag(W_EOL)) {
614 if (marked_delete_point) {
615 while (copy_it.word() != word) {
616 x_delete_mode = word_deletable (copy_it.word (),
617 x_debug_delete_mode);
618 if (crunch_debug > 0) {
619 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
620 x_debug_delete_mode,
621 copy_it.word()->best_choice->unichar_string().string());
622 }
623 copy_it.word ()->unlv_crunch_mode = x_delete_mode;
624 copy_it.forward ();
625 }
626 }
627 if (crunch_debug > 0) {
628 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
629 debug_delete_mode,
631 }
632 word->unlv_crunch_mode = delete_mode;
633 deleting_from_bol = false;
634 marked_delete_point = false;
635 }
636 else {
637 if (!marked_delete_point) {
638 copy_it = page_res_it;
639 marked_delete_point = true;
640 }
641 }
642 }
643 else {
644 deleting_from_bol = false;
645 //Forget earlier potential crunches
646 marked_delete_point = false;
647 }
648 /*
649 The following step has been left till now as the tess fails are used to
650 determine if the word is deletable.
651 */
653 word->merge_tess_fails();
654 page_res_it.forward ();
655 }
656}
657
658
660 int i;
661 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
662 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
663 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
664 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
665 for (i = 0; i < word_res->reject_map.length(); ++i) {
666 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
667 word_res->best_choice->set_unichar_id(unichar_dash, i);
668 if (word_res->reject_map[i].accepted ())
669 word_res->reject_map[i].setrej_unlv_rej ();
670 }
671 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
672 word_res->best_choice->set_unichar_id(unichar_space, i);
673 if (word_res->reject_map[i].accepted ())
674 word_res->reject_map[i].setrej_unlv_rej ();
675 }
676 }
677}
678
680 enum STATES
681 {
682 JUNK,
683 FIRST_UPPER,
684 FIRST_LOWER,
685 FIRST_NUM,
686 SUBSEQUENT_UPPER,
687 SUBSEQUENT_LOWER,
688 SUBSEQUENT_NUM
689 };
690 const char *str = word->best_choice->unichar_string().string();
691 const char *lengths = word->best_choice->unichar_lengths().string();
692 STATES state = JUNK;
693 int len = 0;
694 int isolated_digits = 0;
695 int isolated_alphas = 0;
696 int bad_char_count = 0;
697 int tess_rejs = 0;
698 int dodgy_chars = 0;
699 int ok_chars;
700 UNICHAR_ID last_char = -1;
701 int alpha_repetition_count = 0;
702 int longest_alpha_repetition_count = 0;
703 int longest_lower_run_len = 0;
704 int lower_string_count = 0;
705 int longest_upper_run_len = 0;
706 int upper_string_count = 0;
707 int total_alpha_count = 0;
708 int total_digit_count = 0;
709
710 for (; *str != '\0'; str += *(lengths++)) {
711 len++;
712 if (word->uch_set->get_isupper (str, *lengths)) {
713 total_alpha_count++;
714 switch (state) {
715 case SUBSEQUENT_UPPER:
716 case FIRST_UPPER:
717 state = SUBSEQUENT_UPPER;
718 upper_string_count++;
719 if (longest_upper_run_len < upper_string_count)
720 longest_upper_run_len = upper_string_count;
721 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
722 alpha_repetition_count++;
723 if (longest_alpha_repetition_count < alpha_repetition_count) {
724 longest_alpha_repetition_count = alpha_repetition_count;
725 }
726 }
727 else {
728 last_char = word->uch_set->unichar_to_id(str, *lengths);
729 alpha_repetition_count = 1;
730 }
731 break;
732 case FIRST_NUM:
733 isolated_digits++;
734 // Fall through.
735 default:
736 state = FIRST_UPPER;
737 last_char = word->uch_set->unichar_to_id(str, *lengths);
738 alpha_repetition_count = 1;
739 upper_string_count = 1;
740 break;
741 }
742 }
743 else if (word->uch_set->get_islower (str, *lengths)) {
744 total_alpha_count++;
745 switch (state) {
746 case SUBSEQUENT_LOWER:
747 case FIRST_LOWER:
748 state = SUBSEQUENT_LOWER;
749 lower_string_count++;
750 if (longest_lower_run_len < lower_string_count)
751 longest_lower_run_len = lower_string_count;
752 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753 alpha_repetition_count++;
754 if (longest_alpha_repetition_count < alpha_repetition_count) {
755 longest_alpha_repetition_count = alpha_repetition_count;
756 }
757 }
758 else {
759 last_char = word->uch_set->unichar_to_id(str, *lengths);
760 alpha_repetition_count = 1;
761 }
762 break;
763 case FIRST_NUM:
764 isolated_digits++;
765 // Fall through.
766 default:
767 state = FIRST_LOWER;
768 last_char = word->uch_set->unichar_to_id(str, *lengths);
769 alpha_repetition_count = 1;
770 lower_string_count = 1;
771 break;
772 }
773 }
774 else if (word->uch_set->get_isdigit (str, *lengths)) {
775 total_digit_count++;
776 switch (state) {
777 case FIRST_NUM:
778 state = SUBSEQUENT_NUM;
779 case SUBSEQUENT_NUM:
780 break;
781 case FIRST_UPPER:
782 case FIRST_LOWER:
783 isolated_alphas++;
784 // Fall through.
785 default:
786 state = FIRST_NUM;
787 break;
788 }
789 }
790 else {
791 if (*lengths == 1 && *str == ' ')
792 tess_rejs++;
793 else
794 bad_char_count++;
795 switch (state) {
796 case FIRST_NUM:
797 isolated_digits++;
798 break;
799 case FIRST_UPPER:
800 case FIRST_LOWER:
801 isolated_alphas++;
802 default:
803 break;
804 }
805 state = JUNK;
806 }
807 }
808
809 switch (state) {
810 case FIRST_NUM:
811 isolated_digits++;
812 break;
813 case FIRST_UPPER:
814 case FIRST_LOWER:
815 isolated_alphas++;
816 default:
817 break;
818 }
819
821 total_alpha_count += total_digit_count - isolated_digits;
822 }
823
824 if (crunch_leave_ok_strings && len >= 4 &&
825 2 * (total_alpha_count - isolated_alphas) > len &&
826 longest_alpha_repetition_count < crunch_long_repetitions) {
827 if ((crunch_accept_ok &&
828 acceptable_word_string(*word->uch_set, str, lengths) !=
830 longest_lower_run_len > crunch_leave_lc_strings ||
831 longest_upper_run_len > crunch_leave_uc_strings)
832 return G_NEVER_CRUNCH;
833 }
834 if (word->reject_map.length() > 1 &&
835 strpbrk(str, " ") == nullptr &&
837 word->best_choice->permuter() == FREQ_DAWG_PERM ||
838 word->best_choice->permuter() == USER_DAWG_PERM ||
839 word->best_choice->permuter() == NUMBER_PERM ||
840 acceptable_word_string(*word->uch_set, str, lengths) !=
841 AC_UNACCEPTABLE || ok_dict_word))
842 return G_OK;
843
844 ok_chars = len - bad_char_count - isolated_digits -
845 isolated_alphas - tess_rejs;
846
847 if (crunch_debug > 3) {
848 tprintf("garbage_word: \"%s\"\n",
850 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
851 len,
852 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
853 }
854 if (bad_char_count == 0 &&
855 tess_rejs == 0 &&
856 (len > isolated_digits + isolated_alphas || len <= 2))
857 return G_OK;
858
859 if (tess_rejs > ok_chars ||
860 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
861 return G_TERRIBLE;
862
863 if (len > 4) {
864 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
865 isolated_alphas;
866 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
867 return G_DODGY;
868 else
869 return G_OK;
870 } else {
871 dodgy_chars = 2 * tess_rejs + bad_char_count;
872 if ((len == 4 && dodgy_chars > 2) ||
873 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
874 return G_DODGY;
875 else
876 return G_OK;
877 }
878}
879
880
881/*************************************************************************
882 * word_deletable()
883 * DELETE WERDS AT ENDS OF ROWS IF
884 * Word is crunched &&
885 * ( string length = 0 OR
886 * > 50% of chars are "|" (before merging) OR
887 * certainty < -10 OR
888 * rating /char > 60 OR
889 * TOP of word is more than 0.5 xht BELOW baseline OR
890 * BOTTOM of word is more than 0.5 xht ABOVE xht OR
891 * length of word < 3xht OR
892 * height of word < 0.7 xht OR
893 * height of word > 3.0 xht OR
894 * >75% of the outline BBs have longest dimension < 0.5xht
895 *************************************************************************/
896
// Decides whether a crunched word may be deleted.
//   word        - word to examine; its reject_map, unlv_crunch_mode,
//                 rebuild_word and best_choice are read.
//   delete_mode - output: numeric code of the test that fired (1-5, 7-11),
//                 or 0 when the function returns CR_NONE.
// Returns CR_NONE, CR_DELETE or CR_LOOSE_SPACE. The tests run in a fixed
// order and the first one that fires determines the result.
897CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
898 int word_len = word->reject_map.length ();
899 float rating_per_ch;
900 TBOX box; // Bounding box of the word; assigned only if rebuild_word exists.
901
// Words that have not been crunched are never deletable.
902 if (word->unlv_crunch_mode == CR_NONE) {
903 delete_mode = 0;
904 return CR_NONE;
905 }
906
// Empty word. Returning here also guarantees word_len != 0 for the
// rating-per-character division further down.
907 if (word_len == 0) {
908 delete_mode = 1;
909 return CR_DELETE;
910 }
911
912 if (word->rebuild_word != nullptr) {
913 // Cube leaves rebuild_word nullptr.
914 box = word->rebuild_word->bounding_box();
// Word is too short relative to the normalised x-height.
915 if (box.height () < crunch_del_min_ht * kBlnXHeight) {
916 delete_mode = 4;
917 return CR_DELETE;
918 }
919
920 if (noise_outlines(word->rebuild_word)) {
921 delete_mode = 5;
922 return CR_DELETE;
923 }
924 }
// NOTE(review): when rebuild_word is null, box is never assigned, so the
// box.bottom()/height()/width() tests below run on a default-constructed
// TBOX - confirm that this is intended.
925
// Fires when failure_count(word) exceeds two thirds of the word length.
926 if ((failure_count (word) * 1.5) > word_len) {
927 delete_mode = 2;
928 return CR_LOOSE_SPACE;
929 }
930
931 if (word->best_choice->certainty () < crunch_del_cert) {
932 delete_mode = 7;
933 return CR_LOOSE_SPACE;
934 }
935
936 rating_per_ch = word->best_choice->rating () / word_len;
937
938 if (rating_per_ch > crunch_del_rating) {
939 delete_mode = 8;
940 return CR_LOOSE_SPACE;
941 }
942
// NOTE(review): listing line 943 (the condition guarding delete_mode 9) is
// missing here; restore it from the upstream file before building.
944 delete_mode = 9;
945 return CR_LOOSE_SPACE;
946 }
947
948 if (box.bottom () >
// NOTE(review): listing line 949 (the right-hand side of this comparison)
// is missing here; restore it from the upstream file before building.
950 delete_mode = 10;
951 return CR_LOOSE_SPACE;
952 }
953
// Word is too tall relative to the normalised x-height.
954 if (box.height () > crunch_del_max_ht * kBlnXHeight) {
955 delete_mode = 11;
956 return CR_LOOSE_SPACE;
957 }
958
// Word is too narrow relative to the normalised x-height.
959 if (box.width () < crunch_del_min_width * kBlnXHeight) {
960 delete_mode = 3;
961 return CR_LOOSE_SPACE;
962 }
963
// No test fired: keep the word.
964 delete_mode = 0;
965 return CR_NONE;
966}
967
969 const char *str = word->best_choice->unichar_string().string();
970 int tess_rejs = 0;
971
972 for (; *str != '\0'; str++) {
973 if (*str == ' ')
974 tess_rejs++;
975 }
976 return tess_rejs;
977}
978
979
981 TBOX box; // BB of outline
982 int16_t outline_count = 0;
983 int16_t small_outline_count = 0;
984 int16_t max_dimension;
985 float small_limit = kBlnXHeight * crunch_small_outlines_size;
986
987 for (int b = 0; b < word->NumBlobs(); ++b) {
988 TBLOB* blob = word->blobs[b];
989 for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
990 outline_count++;
991 box = ol->bounding_box();
992 if (box.height() > box.width())
993 max_dimension = box.height();
994 else
995 max_dimension = box.width();
996 if (max_dimension < small_limit)
997 small_outline_count++;
998 }
999 }
1000 return small_outline_count >= outline_count;
1001}
1002
1003} // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:406
GARBAGE_LEVEL
Definition: docqual.h:30
@ G_DODGY
Definition: docqual.h:33
@ G_TERRIBLE
Definition: docqual.h:34
@ G_OK
Definition: docqual.h:32
@ G_NEVER_CRUNCH
Definition: docqual.h:31
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
CRUNCH_MODE
Definition: pageres.h:157
@ CR_DELETE
Definition: pageres.h:161
@ CR_NONE
Definition: pageres.h:158
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:159
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NUMBER_PERM
Definition: ratngs.h:239
@ W_EOL
end of line
Definition: werd.h:33
@ W_BOL
start of line
Definition: werd.h:32
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
bool empty() const
Definition: genericvector.h:91
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:44
void CountMatchingBlobs(int index)
Definition: docqual.cpp:34
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:38
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:31
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:589
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:679
double tessedit_reject_doc_percent
bool tessedit_preserve_row_rej_perfect_wds
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:417
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:232
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
bool crunch_early_convert_bad_unlv_chs
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:980
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:138
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
double tessedit_whole_wd_rej_row_percent
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:968
double tessedit_reject_row_percent
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:659
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:115
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:541
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:503
bool tessedit_preserve_blk_rej_perfect_wds
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:897
double tessedit_reject_block_percent
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:126
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:160
double tessedit_good_doc_still_rowrej_wd
TESSLINE * next
Definition: blobs.h:281
Definition: blobs.h:284
TESSLINE * outlines
Definition: blobs.h:400
int NumOutlines() const
Definition: blobs.cpp:454
Definition: blobs.h:418
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
TBOX bounding_box() const
Definition: blobs.cpp:861
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
Definition: ocrrow.h:37
bool rejected
Definition: pageres.h:81
int32_t rej_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:78
int32_t rej_count
Definition: pageres.h:118
int32_t char_count
Definition: pageres.h:117
BLOCK * block
Definition: pageres.h:116
int32_t whole_word_rej_count
Definition: pageres.h:143
int32_t rej_count
Definition: pageres.h:142
ROW * row
Definition: pageres.h:140
int32_t char_count
Definition: pageres.h:141
const UNICHARSET * uch_set
Definition: pageres.h:203
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE * best_choice
Definition: pageres.h:241
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
bool reject_spaces
Definition: pageres.h:341
REJMAP reject_map
Definition: pageres.h:294
void merge_tess_fails()
Definition: pageres.cpp:1067
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195
WERD * word
Definition: pageres.h:186
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
void rej_stat_word()
Definition: pageres.cpp:1667
ROW_RES * prev_row() const
Definition: pageres.h:748
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * forward()
Definition: pageres.h:734
int index() const
Definition: pdblock.h:67
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
bool IsText() const
Definition: polyblk.h:49
const STRING & unichar_string() const
Definition: ratngs.h:531
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
float certainty() const
Definition: ratngs.h:320
const STRING & unichar_lengths() const
Definition: ratngs.h:538
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
int16_t bottom() const
Definition: rect.h:65
void rej_word_row_rej()
Definition: rejctmap.cpp:442
int16_t reject_count()
Definition: rejctmap.h:229
int16_t accept_count()
Definition: rejctmap.cpp:279
void rej_word_doc_rej()
Definition: rejctmap.cpp:424
void rej_word_block_rej()
Definition: rejctmap.cpp:433
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:300
int32_t length() const
Definition: rejctmap.h:223
uint8_t space()
Definition: werd.h:99
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
uint32_t unsigned_size() const
Definition: strngs.h:72
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498