tesseract 4.1.1
Loading...
Searching...
No Matches
context.cpp
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: context.cpp (Formerly context.c)
5 * Description: Context checking functions
6 * Author: Mark Seaman, OCR Technology
7 *
8 * (c) Copyright 1990, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20
21#include "dict.h"
22#include "unicharset.h"
23
24namespace tesseract {
25
// Words shorter than this many unichars are never reported as garbage by
// absolute_garbage().
static const int kMinAbsoluteGarbageWordLength = 10;
// absolute_garbage() reports a (sufficiently long) word as garbage when the
// fraction of its unichars that are alphabetic or digits is below this value.
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
28
// Transition table for the capitalization state machine walked by
// Dict::case_ok(). The row is the current state, the column is the class of
// the next character, and the entry is the next state.
//
// Columns:
//   [0] P - any character that is not upper case, lower case, or a digit
//   [1] U - upper case
//   [2] L - lower case
//   [3] D - digit
//
// An entry of -1 means "error on case": the pattern is rejected.
const int case_state_table[6][4] = {
    //  P   U   L   D
    {0, 1, 5, 4},    // 0. Beginning of word
    {0, 3, 2, 4},    // 1. After initial capital
    {0, -1, 2, -1},  // 2. After lower case
    {0, 3, -1, 4},   // 3. After upper case
    {0, -1, -1, 4},  // 4. After a digit
    {5, -1, 2, -1},  // 5. After initial lower case
};
45
46int Dict::case_ok(const WERD_CHOICE &word) const {
47 int state = 0;
48 int x;
49 const UNICHARSET* unicharset = word.unicharset();
50 for (x = 0; x < word.length(); ++x) {
51 UNICHAR_ID ch_id = word.unichar_id(x);
52 if (unicharset->get_isupper(ch_id))
53 state = case_state_table[state][1];
54 else if (unicharset->get_islower(ch_id))
55 state = case_state_table[state][2];
56 else if (unicharset->get_isdigit(ch_id))
57 state = case_state_table[state][3];
58 else
59 state = case_state_table[state][0];
60 if (state == -1) return false;
61 }
62 return state != 5; // single lower is bad
63}
64
66 const UNICHARSET &unicharset) {
67 if (word.length() < kMinAbsoluteGarbageWordLength) return false;
68 int num_alphanum = 0;
69 for (int x = 0; x < word.length(); ++x) {
70 num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
71 unicharset.get_isdigit(word.unichar_id(x)));
72 }
73 return (static_cast<float>(num_alphanum) /
74 static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
75}
76
77} // namespace tesseract
int UNICHAR_ID
Definition: unichar.h:34
const int case_state_table[6][4]
Definition: context.cpp:29
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
int length() const
Definition: ratngs.h:293
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:65