tesseract 4.1.1
Loading...
Searching...
No Matches
reject.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: reject.cpp (Formerly reject.c)
3 * Description: Rejection functions used in tessedit
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19// Include automatically generated configuration file if running autoconf.
20#ifdef HAVE_CONFIG_H
21#include "config_auto.h"
22#endif
23
24#ifdef DISABLED_LEGACY_ENGINE
25
26#include "tesseractclass.h"
27
28namespace tesseract {
29
30int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
31 const WERD_CHOICE &word = *werd_res->best_choice;
32 int dict_word_type = werd_res->tesseract->dict_word(word);
33 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
34}
35} // namespace tesseract
36
37#else
38
39#include "tessvars.h"
40#include <cctype>
41#include <cerrno>
42#include <cstring>
43#include "genericvector.h"
44#include "reject.h"
45#include "control.h"
46#include "docqual.h"
47#include "helpers.h"
48
49#include "tesseractclass.h"
50
52
53/*************************************************************************
54 * set_done()
55 *
56 * Set the done flag based on the word acceptability criteria
57 *************************************************************************/
58
59namespace tesseract {
60void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61 word->done = word->tess_accepted &&
62 (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
63 bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68 one_ell_conflict(word, false)) {
69 if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70 word->done = false;
71 }
72 if (word->done && ((!word_from_dict &&
73 word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74 if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75 word->done = false;
76 }
78 tprintf("set_done(): done=%d\n", word->done);
79 word->best_choice->print("");
80 }
81}
82
83
84/*************************************************************************
85 * make_reject_map()
86 *
87 * Sets the done flag to indicate whether the result is acceptable.
88 *
89 * Sets a reject map for the word.
90 *************************************************************************/
91void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92 int i;
93 int offset;
94
95 flip_0O(word);
96 check_debug_pt(word, -1); // For trap only
97 set_done(word, pass); // Set acceptance
99 reject_blanks(word);
100 /*
101 0: Ray's original heuristic - the baseline
102 */
103 if (tessedit_reject_mode == 0) {
104 if (!word->done)
106 } else if (tessedit_reject_mode == 5) {
107 /*
108 5: Reject I/1/l from words where there is no strong contextual confirmation;
109 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110 and the whole of any words which are very small
111 */
114 } else {
115 one_ell_conflict(word, true);
116 /*
117 Originally the code here just used the done flag. Now I have duplicated
118 and unpacked the conditions for setting the done flag so that each
119 mechanism can be turned on or off independently. This works WITHOUT
120 affecting the done flag setting.
121 */
124
126 (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
128
129 WERD_CHOICE* best_choice = word->best_choice;
130 if (rej_use_good_perm) {
131 if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132 best_choice->permuter() == FREQ_DAWG_PERM ||
133 best_choice->permuter() == USER_DAWG_PERM) &&
136 best_choice->unichar_string().string(),
137 best_choice->unichar_lengths().string()) !=
139 // PASSED TEST
140 } else if (best_choice->permuter() == NUMBER_PERM) {
142 for (i = 0, offset = 0;
143 best_choice->unichar_string()[offset] != '\0';
144 offset += best_choice->unichar_lengths()[i++]) {
145 if (word->reject_map[i].accepted() &&
146 word->uch_set->get_isalpha(
147 best_choice->unichar_string().string() + offset,
148 best_choice->unichar_lengths()[i]))
149 word->reject_map[i].setrej_bad_permuter();
150 // rej alpha
151 }
152 }
153 } else {
155 }
156 }
157 /* Ambig word rejection was here once !!*/
158 }
159 } else {
160 tprintf("BAD tessedit_reject_mode\n");
161 ASSERT_HOST("Fatal error encountered!" == nullptr);
162 }
163
164 if (tessedit_image_border > -1)
165 reject_edge_blobs(word);
166
167 check_debug_pt (word, 10);
169 tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170 tprintf("Certainty: %f Rating: %f\n",
171 word->best_choice->certainty (), word->best_choice->rating ());
172 tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173 }
174
175 flip_hyphens(word);
176 check_debug_pt(word, 20);
177}
178} // namespace tesseract
179
180
182 int16_t i;
183 int16_t offset;
184
185 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186 offset += word->best_choice->unichar_lengths()[i], i += 1) {
187 if (word->best_choice->unichar_string()[offset] == ' ')
188 //rej unrecognised blobs
189 word->reject_map[i].setrej_tess_failure ();
190 }
191}
192
193namespace tesseract {
195 int16_t i;
196 int16_t offset;
197
198 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199 offset += word->best_choice->unichar_lengths()[i], i += 1) {
201 contains (word->best_choice->unichar_string()[offset])) {
202 //rej 1Il conflict
203 word->reject_map[i].setrej_1Il_conflict ();
204 }
205 }
206}
207} // namespace tesseract
208
209
211 float threshold = compute_reject_threshold(word->best_choice);
212 for (int i = 0; i < word->best_choice->length(); ++i) {
213 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214 word->reject_map[i].setrej_tess_failure();
215 else if (word->best_choice->certainty(i) < threshold)
216 word->reject_map[i].setrej_poor_match();
217 }
218}
219
220
221/**********************************************************************
222 * compute_reject_threshold
223 *
224 * Set a rejection threshold for this word.
225 * Initially this is a trivial function which looks for the largest
226 * gap in the certainty value.
227 **********************************************************************/
228
230 float threshold; // rejection threshold
231 float bestgap = 0.0f; // biggest gap
232 float gapstart; // bottom of gap
233
234 int blob_count = word->length();
235 GenericVector<float> ratings;
236 ratings.resize_no_init(blob_count);
237 for (int i = 0; i < blob_count; ++i) {
238 ratings[i] = word->certainty(i);
239 }
240 ratings.sort();
241 gapstart = ratings[0] - 1; // all reject if none better
242 if (blob_count >= 3) {
243 for (int index = 0; index < blob_count - 1; index++) {
244 if (ratings[index + 1] - ratings[index] > bestgap) {
245 bestgap = ratings[index + 1] - ratings[index];
246 // find biggest
247 gapstart = ratings[index];
248 }
249 }
250 }
251 threshold = gapstart + bestgap / 2;
252
253 return threshold;
254}
255
256
257/*************************************************************************
258 * reject_edge_blobs()
259 *
260 * If the word is perilously close to the edge of the image, reject those blobs
261 * in the word which are too close to the edge as they could be clipped.
262 *************************************************************************/
263namespace tesseract {
265 TBOX word_box = word->word->bounding_box();
266 // Use the box_word as it is already denormed back to image coordinates.
267 int blobcount = word->box_word->length();
268
269 if (word_box.left() < tessedit_image_border ||
270 word_box.bottom() < tessedit_image_border ||
271 word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272 word_box.top() + tessedit_image_border > ImageHeight() - 1) {
273 ASSERT_HOST(word->reject_map.length() == blobcount);
274 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275 TBOX blob_box = word->box_word->BlobBox(blobindex);
276 if (blob_box.left() < tessedit_image_border ||
277 blob_box.bottom() < tessedit_image_border ||
278 blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279 blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280 word->reject_map[blobindex].setrej_edge_char();
281 // Close to edge
282 }
283 }
284 }
285}
286
287/**********************************************************************
288 * one_ell_conflict()
289 *
290 * Identify words where there is a potential I/l/1 error.
291 * - A bundle of contextual heuristics!
292 **********************************************************************/
293bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
294 const char *word;
295 const char *lengths;
296 int16_t word_len; //its length
297 int16_t first_alphanum_index_;
298 int16_t first_alphanum_offset_;
299 int16_t i;
300 int16_t offset;
301 bool non_conflict_set_char; //non conf set a/n?
302 bool conflict = false;
303 bool allow_1s;
304 ACCEPTABLE_WERD_TYPE word_type;
305 bool dict_perm_type;
306 bool dict_word_ok;
307 int dict_word_type;
308
309 word = word_res->best_choice->unichar_string().string ();
310 lengths = word_res->best_choice->unichar_lengths().string();
311 word_len = strlen(lengths);
312 /*
313 If there are no occurrences of the conflict set characters then the word
314 is OK.
315 */
316 if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
317 return false;
318
319 /*
320 There is a conflict if there are NO other (confirmed) alphanumerics apart
321 from those in the conflict set.
322 */
323
324 for (i = 0, offset = 0, non_conflict_set_char = false;
325 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326 non_conflict_set_char =
327 (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328 word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329 !STRING (conflict_set_I_l_1).contains (word[offset]);
330 if (!non_conflict_set_char) {
331 if (update_map)
332 reject_I_1_L(word_res);
333 return true;
334 }
335
336 /*
337 If the word is accepted by a dawg permuter, and the first alpha character
338 is "I" or "l", check to see if the alternative is also a dawg word. If it
339 is, then there is a potential error otherwise the word is ok.
340 */
341
342 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343 (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346 (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347 dict_word_type = dict_word(*(word_res->best_choice));
348 dict_word_ok = (dict_word_type > 0) &&
349 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350
351 if ((rej_1Il_use_dict_word && dict_word_ok) ||
352 (rej_1Il_trust_permuter_type && dict_perm_type) ||
353 (dict_perm_type && dict_word_ok)) {
354 first_alphanum_index_ = first_alphanum_index (word, lengths);
355 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
356 if (lengths[first_alphanum_index_] == 1 &&
357 word[first_alphanum_offset_] == 'I') {
358 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359 if (safe_dict_word(word_res) > 0) {
360 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361 if (update_map)
362 word_res->reject_map[first_alphanum_index_].
363 setrej_1Il_conflict();
364 return true;
365 }
366 else {
367 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368 return false;
369 }
370 }
371
372 if (lengths[first_alphanum_index_] == 1 &&
373 word[first_alphanum_offset_] == 'l') {
374 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375 if (safe_dict_word(word_res) > 0) {
376 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377 if (update_map)
378 word_res->reject_map[first_alphanum_index_].
379 setrej_1Il_conflict();
380 return true;
381 }
382 else {
383 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384 return false;
385 }
386 }
387 return false;
388 }
389
390 /*
391 NEW 1Il code. The old code relied on permuter types too much. In fact,
392 tess will use TOP_CHOICE permute for good things like "palette".
393 In this code the string is examined independently to see if it looks like
394 a well formed word.
395 */
396
397 /*
398 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399 dictionary word.
400 */
401 first_alphanum_index_ = first_alphanum_index (word, lengths);
402 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403 if (lengths[first_alphanum_index_] == 1 &&
404 word[first_alphanum_offset_] == 'l') {
405 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406 if (safe_dict_word(word_res) > 0)
407 return false;
408 else
409 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410 }
411 else if (lengths[first_alphanum_index_] == 1 &&
412 word[first_alphanum_offset_] == 'I') {
413 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414 if (safe_dict_word(word_res) > 0)
415 return false;
416 else
417 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418 }
419 /*
420 For strings containing digits:
421 If there are no alphas OR the numeric permuter liked the word,
422 reject any non 1 conflict chs
423 Else reject all conflict chs
424 */
425 if (word_contains_non_1_digit (word, lengths)) {
426 allow_1s = (alpha_count (word, lengths) == 0) ||
427 (word_res->best_choice->permuter () == NUMBER_PERM);
428
429 int16_t offset;
430 conflict = false;
431 for (i = 0, offset = 0; word[offset] != '\0';
432 offset += word_res->best_choice->unichar_lengths()[i++]) {
433 if ((!allow_1s || (word[offset] != '1')) &&
434 STRING (conflict_set_I_l_1).contains (word[offset])) {
435 if (update_map)
436 word_res->reject_map[i].setrej_1Il_conflict ();
437 conflict = true;
438 }
439 }
440 return conflict;
441 }
442 /*
443 For anything else. See if it conforms to an acceptable word type. If so,
444 treat accordingly.
445 */
446 word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
448 first_alphanum_index_ = first_alphanum_index (word, lengths);
449 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451 if (update_map)
452 word_res->reject_map[first_alphanum_index_].
453 setrej_1Il_conflict ();
454 return true;
455 }
456 else
457 return false;
458 }
459 else if (word_type == AC_UPPER_CASE) {
460 return false;
461 }
462 else {
463 if (update_map)
464 reject_I_1_L(word_res);
465 return true;
466 }
467}
468
469
470int16_t Tesseract::first_alphanum_index(const char *word,
471 const char *word_lengths) {
472 int16_t i;
473 int16_t offset;
474
475 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477 unicharset.get_isdigit(word + offset, word_lengths[i]))
478 return i;
479 }
480 return -1;
481}
482
483int16_t Tesseract::first_alphanum_offset(const char *word,
484 const char *word_lengths) {
485 int16_t i;
486 int16_t offset;
487
488 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490 unicharset.get_isdigit(word + offset, word_lengths[i]))
491 return offset;
492 }
493 return -1;
494}
495
496int16_t Tesseract::alpha_count(const char *word,
497 const char *word_lengths) {
498 int16_t i;
499 int16_t offset;
500 int16_t count = 0;
501
502 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503 if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504 count++;
505 }
506 return count;
507}
508
509
511 const char* word_lengths) {
512 int16_t i;
513 int16_t offset;
514
515 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516 if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517 (word_lengths[i] != 1 || word[offset] != '1'))
518 return true;
519 }
520 return false;
521}
522
523/*************************************************************************
524 * dont_allow_1Il()
525 * Don't unreject LONE accepted 1Il conflict set chars
526 *************************************************************************/
528 int i = 0;
529 int offset;
530 int word_len = word->reject_map.length();
531 const char *s = word->best_choice->unichar_string().string();
532 const char *lengths = word->best_choice->unichar_lengths().string();
533 bool accepted_1Il = false;
534
535 for (i = 0, offset = 0; i < word_len;
536 offset += word->best_choice->unichar_lengths()[i++]) {
537 if (word->reject_map[i].accepted()) {
538 if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539 accepted_1Il = true;
540 } else {
541 if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542 word->uch_set->get_isdigit(s + offset, lengths[i]))
543 return; // >=1 non 1Il ch accepted
544 }
545 }
546 }
547 if (!accepted_1Il)
548 return; //Nothing to worry about
549
550 for (i = 0, offset = 0; i < word_len;
551 offset += word->best_choice->unichar_lengths()[i++]) {
552 if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553 word->reject_map[i].accepted())
554 word->reject_map[i].setrej_postNN_1Il();
555 }
556}
557
558
560 int count = 0;
561 const WERD_CHOICE *best_choice = word_res->best_choice;
562 for (int i = 0; i < word_res->reject_map.length(); ++i) {
563 if ((word_res->reject_map[i].accepted()) &&
564 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566 count++;
567 }
568 }
569 return count;
570}
571
572
573// reject all if most rejected.
575 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576
577 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
580}
581
582
584 int16_t char_quality;
585 int16_t accepted_char_quality;
586
587 if (word->best_choice->unichar_lengths().length() <= 1)
588 return false;
589
591 contains(word->best_choice->unichar_string()[0]))
592 return false;
593
594 UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595 for (int i = 1; i < word->best_choice->length(); ++i) {
596 if (word->best_choice->unichar_id(i) != uch_id) return false;
597 }
598
599 word_char_quality(word, row, &char_quality, &accepted_char_quality);
600
601 if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602 (char_quality == accepted_char_quality))
603 return true;
604 else
605 return false;
606}
607
608int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
609 const WERD_CHOICE &word = *werd_res->best_choice;
610 int dict_word_type = werd_res->tesseract->dict_word(word);
611 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
612}
613
614// Note: After running this function word_res->ratings
615// might not contain the right BLOB_CHOICE corresponding to each character
616// in word_res->best_choice.
618 WERD_CHOICE *best_choice = word_res->best_choice;
619 int i;
620 int prev_right = -9999;
621 int next_left;
622 TBOX out_box;
623 float aspect_ratio;
624
626 return;
627
628 int num_blobs = word_res->rebuild_word->NumBlobs();
629 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
630 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631 TBLOB* blob = word_res->rebuild_word->blobs[i];
632 out_box = blob->bounding_box();
633 if (i + 1 == num_blobs)
634 next_left = 9999;
635 else
636 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637 // Don't touch small or touching blobs - it is too dangerous.
638 if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639 (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640 aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
642 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643 word_res->uch_set->contains_unichar_id(unichar_dash) &&
644 word_res->uch_set->get_enabled(unichar_dash)) {
645 /* Certain HYPHEN */
646 best_choice->set_unichar_id(unichar_dash, i);
647 if (word_res->reject_map[i].rejected())
648 word_res->reject_map[i].setrej_hyphen_accept();
649 }
650 if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651 word_res->reject_map[i].accepted())
652 //Suspected HYPHEN
653 word_res->reject_map[i].setrej_hyphen ();
654 }
655 else if (best_choice->unichar_id(i) == unichar_dash) {
656 if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657 (word_res->reject_map[i].rejected()))
658 word_res->reject_map[i].setrej_hyphen_accept();
659 //Certain HYPHEN
660
661 if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662 (word_res->reject_map[i].accepted()))
663 //Suspected HYPHEN
664 word_res->reject_map[i].setrej_hyphen();
665 }
666 }
667 prev_right = out_box.right();
668 }
669}
670
671// Note: After running this function word_res->ratings
672// might not contain the right BLOB_CHOICE corresponding to each character
673// in word_res->best_choice.
675 WERD_CHOICE *best_choice = word_res->best_choice;
676 int i;
677 TBOX out_box;
678
679 if (!tessedit_flip_0O)
680 return;
681
682 int num_blobs = word_res->rebuild_word->NumBlobs();
683 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684 TBLOB* blob = word_res->rebuild_word->blobs[i];
685 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687 out_box = blob->bounding_box();
688 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690 return; //Beware words with sub/superscripts
691 }
692 }
693 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695 if (unichar_0 == INVALID_UNICHAR_ID ||
696 !word_res->uch_set->get_enabled(unichar_0) ||
697 unichar_O == INVALID_UNICHAR_ID ||
698 !word_res->uch_set->get_enabled(unichar_O)) {
699 return; // 0 or O are not present/enabled in unicharset
700 }
701 for (i = 1; i < best_choice->length(); ++i) {
702 if (best_choice->unichar_id(i) == unichar_0 ||
703 best_choice->unichar_id(i) == unichar_O) {
704 /* A0A */
705 if ((i+1) < best_choice->length() &&
706 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708 best_choice->set_unichar_id(unichar_O, i);
709 }
710 /* A00A */
711 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712 (i+1) < best_choice->length() &&
713 (best_choice->unichar_id(i+1) == unichar_0 ||
714 best_choice->unichar_id(i+1) == unichar_O) &&
715 (i+2) < best_choice->length() &&
716 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717 best_choice->set_unichar_id(unichar_O, i);
718 i++;
719 }
720 /* AA0<non digit or end of word> */
721 if ((i > 1) &&
722 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724 (((i+1) < best_choice->length() &&
725 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728 (i == best_choice->length() - 1))) {
729 best_choice->set_unichar_id(unichar_O, i);
730 }
731 /* 9O9 */
732 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733 (i+1) < best_choice->length() &&
734 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735 best_choice->set_unichar_id(unichar_0, i);
736 }
737 /* 9OOO */
738 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739 (i+2) < best_choice->length() &&
740 (best_choice->unichar_id(i+1) == unichar_0 ||
741 best_choice->unichar_id(i+1) == unichar_O) &&
742 (best_choice->unichar_id(i+2) == unichar_0 ||
743 best_choice->unichar_id(i+2) == unichar_O)) {
744 best_choice->set_unichar_id(unichar_0, i);
745 best_choice->set_unichar_id(unichar_0, i+1);
746 best_choice->set_unichar_id(unichar_0, i+2);
747 i += 2;
748 }
749 /* 9OO<non upper> */
750 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751 (i+2) < best_choice->length() &&
752 (best_choice->unichar_id(i+1) == unichar_0 ||
753 best_choice->unichar_id(i+1) == unichar_O) &&
754 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755 best_choice->set_unichar_id(unichar_0, i);
756 best_choice->set_unichar_id(unichar_0, i+1);
757 i++;
758 }
759 /* 9O<non upper> */
760 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761 (i+1) < best_choice->length() &&
762 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763 best_choice->set_unichar_id(unichar_0, i);
764 }
765 /* 9[.,]OOO.. */
766 if ((i > 1) &&
767 (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768 word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770 best_choice->unichar_id(i-2) == unichar_O)) {
771 if (best_choice->unichar_id(i-2) == unichar_O) {
772 best_choice->set_unichar_id(unichar_0, i-2);
773 }
774 while (i < best_choice->length() &&
775 (best_choice->unichar_id(i) == unichar_O ||
776 best_choice->unichar_id(i) == unichar_0)) {
777 best_choice->set_unichar_id(unichar_0, i);
778 i++;
779 }
780 i--;
781 }
782 }
783 }
784}
785
786bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
787 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
788}
789
790bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
791 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
792}
793} // namespace tesseract
794
795#endif // def DISABLED_LEGACY_ENGINE
ACCEPTABLE_WERD_TYPE
Definition: control.h:29
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:33
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:32
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:31
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:181
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:229
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:210
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kBlnXHeight
Definition: normalis.h:24
@ DOC_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NUMBER_PERM
Definition: ratngs.h:239
#define CLISTIZEH(CLASSNAME)
Definition: clst.h:879
#define CLISTIZE(CLASSNAME)
Definition: clst.h:891
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
@ UNICHAR_SPACE
Definition: unicharset.h:34
int count(LIST var_list)
Definition: oldlist.cpp:95
void resize_no_init(int size)
Definition: genericvector.h:66
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:470
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:264
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:483
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:786
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:527
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:376
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:293
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
void set_done(WERD_RES *word, int16_t pass)
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
double rej_whole_of_mostly_reject_word_fract
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:617
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:194
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:574
char * ok_repeated_ch_non_alphanum_wds
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:510
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:583
void flip_0O(WERD_RES *word)
Definition: reject.cpp:674
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int length() const
Definition: boxword.h:83
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
float x_scale() const
Definition: normalis.h:267
float y_scale() const
Definition: normalis.h:270
Definition: ocrrow.h:37
const UNICHARSET * uch_set
Definition: pageres.h:203
DENORM denorm
Definition: pageres.h:201
TWERD * rebuild_word
Definition: pageres.h:266
bool done
Definition: pageres.h:305
tesseract::Tesseract * tesseract
Definition: pageres.h:280
tesseract::BoxWord * box_word
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool tess_accepted
Definition: pageres.h:303
REJMAP reject_map
Definition: pageres.h:294
WERD * word
Definition: pageres.h:186
const STRING & unichar_string() const
Definition: ratngs.h:531
bool dangerous_ambig_found() const
Definition: ratngs.h:353
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void print() const
Definition: ratngs.h:570
const STRING & unichar_lengths() const
Definition: ratngs.h:538
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t height() const
Definition: rect.h:108
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
void rej_word_mostly_rej()
Definition: rejctmap.cpp:406
int16_t reject_count()
Definition: rejctmap.h:229
void rej_word_bad_permuter()
Definition: rejctmap.cpp:379
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:361
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void rej_word_small_xht()
Definition: rejctmap.cpp:343
void rej_word_contains_blanks()
Definition: rejctmap.cpp:370
int32_t length() const
Definition: rejctmap.h:223
TBOX bounding_box() const
Definition: werd.cpp:148
UNICHARSET unicharset
Definition: ccutil.h:73
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
bool contains(char c) const
Definition: strngs.cpp:185
const char * string() const
Definition: strngs.cpp:194
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89