tesseract 4.1.1
Loading...
Searching...
No Matches
ambigs.h
Go to the documentation of this file.
1
2// File: ambigs.h
3// Description: Constants, flags, functions for dealing with
4// ambiguities (training and recognition).
5// Author: Daria Antonova
6//
7// (C) Copyright 2008, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#ifndef TESSERACT_CCUTIL_AMBIGS_H_
21#define TESSERACT_CCUTIL_AMBIGS_H_
22
23#if !defined(DISABLED_LEGACY_ENGINE)
24
25#include "elst.h"
26#include "tprintf.h"
27#include "unichar.h"
28#include "unicharset.h"
29#include "genericvector.h"
30
31#define MAX_AMBIG_SIZE 10
32
33namespace tesseract {
34
36
38 NOT_AMBIG, // the ngram pair is not ambiguous
39 REPLACE_AMBIG, // ocred ngram should always be substituted with correct
40 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
41 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
42 CASE_AMBIG, // this is a case ambiguity (1-1)
43
44 AMBIG_TYPE_COUNT // number of enum entries
45};
46
47// A collection of utility functions for arrays of UNICHAR_IDs that are
48// terminated by INVALID_UNICHAR_ID.
50 public:
51 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
52 // less than length of array2, if any array1[i] is less than array2[i].
53 // Returns 0 if the arrays are equal, 1 otherwise.
54 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
55 static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
56 for (;;) {
57 const UNICHAR_ID val1 = *ptr1++;
58 const UNICHAR_ID val2 = *ptr2++;
59 if (val1 != val2) {
60 if (val1 == INVALID_UNICHAR_ID) return -1;
61 if (val2 == INVALID_UNICHAR_ID) return 1;
62 if (val1 < val2) return -1;
63 return 1;
64 }
65 if (val1 == INVALID_UNICHAR_ID) return 0;
66 }
67 }
68
69 // Look uid in the vector of uids. If found, the index of the matched
70 // element is returned. Otherwise, it returns -1.
71 static inline int find_in(const UnicharIdVector& uid_vec,
72 const UNICHAR_ID uid) {
73 for (int i = 0; i < uid_vec.size(); ++i)
74 if (uid_vec[i] == uid) return i;
75 return -1;
76 }
77
78 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
79 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
80 // and that dst has enough space for all the elements from src.
81 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
82 int i = 0;
83 do {
84 dst[i] = src[i];
85 } while (dst[i++] != INVALID_UNICHAR_ID);
86 return i - 1;
87 }
88
89 // Prints unichars corresponding to the unichar_ids in the given array.
90 // The function assumes that array is terminated by INVALID_UNICHAR_ID.
91 static inline void print(const UNICHAR_ID array[],
92 const UNICHARSET &unicharset) {
93 const UNICHAR_ID *ptr = array;
94 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
95 while (*ptr != INVALID_UNICHAR_ID) {
96 tprintf("%s ", unicharset.id_to_unichar(*ptr++));
97 }
98 tprintf("( ");
99 ptr = array;
100 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
101 tprintf(")\n");
102 }
103};
104
105// AMBIG_SPEC_LIST stores a list of dangerous ambigs that
106// start with the same unichar (e.g. r->t rn->m rr1->m).
107class AmbigSpec : public ELIST_LINK {
108 public:
109 AmbigSpec();
110 ~AmbigSpec() = default;
111
112 // Comparator function for sorting AmbigSpec_LISTs. The lists will
113 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
114 // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
115 static int compare_ambig_specs(const void *spec1, const void *spec2) {
116 const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1);
117 const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2);
119 if (result != 0) return result;
122 }
123
129};
131
132// AMBIG_TABLE[i] stores a set of ambiguities whose
133// wrong ngram starts with unichar id i.
134using UnicharAmbigsVector = GenericVector<AmbigSpec_LIST *>;
135
137 public:
138 UnicharAmbigs() = default;
140 replace_ambigs_.delete_data_pointers();
141 dang_ambigs_.delete_data_pointers();
142 one_to_one_definite_ambigs_.delete_data_pointers();
143 }
144
145 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
146 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
147
148 // Initializes the ambigs by adding a nullptr to each table.
149 void InitUnicharAmbigs(const UNICHARSET& unicharset,
150 bool use_ambigs_for_adaption);
151
152 // Loads the universal ambigs that are useful for any language.
153 void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
154
155 // Fills in two ambiguity tables (replaceable and dangerous) with information
156 // read from the ambigs file. An ambiguity table is an array of lists.
157 // The array is indexed by a class id. Each entry in the table provides
158 // a list of potential ambiguities which can start with the corresponding
159 // character. For example the ambiguity "rn -> m", would be located in the
160 // table at index of unicharset.unichar_to_id('r').
161 // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
162 // one_to_one_definite_ambigs_. This vector is also indexed by the class id
163 // of the wrong part of the ambiguity and each entry contains a vector of
164 // unichar ids that are ambiguous to it.
165 // encoder_set is used to encode the ambiguity strings, undisturbed by new
166 // unichar_ids that may be created by adding the ambigs.
167 void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
168 TFile *ambigs_file, int debug_level,
169 bool use_ambigs_for_adaption, UNICHARSET *unicharset);
170
171 // Returns definite 1-1 ambigs for the given unichar id.
173 UNICHAR_ID unichar_id) const {
174 if (one_to_one_definite_ambigs_.empty()) return nullptr;
175 return one_to_one_definite_ambigs_[unichar_id];
176 }
177
178 // Returns a pointer to the vector with all unichar ids that appear in the
179 // 'correct' part of the ambiguity pair when the given unichar id appears
180 // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consists of
181 // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
182 // m will return a pointer to a vector with unichar ids of r,n,i.
184 UNICHAR_ID unichar_id) const {
185 if (ambigs_for_adaption_.empty()) return nullptr;
186 return ambigs_for_adaption_[unichar_id];
187 }
188
189 // Similar to the above, but return the vector of unichar ids for which
190 // the given unichar_id is an ambiguity (appears in the 'wrong' part of
191 // some ambiguity pair).
193 UNICHAR_ID unichar_id) const {
194 if (reverse_ambigs_for_adaption_.empty()) return nullptr;
195 return reverse_ambigs_for_adaption_[unichar_id];
196 }
197
198 private:
199 bool ParseAmbiguityLine(int line_num, int version, int debug_level,
200 const UNICHARSET &unicharset, char *buffer,
201 int *test_ambig_part_size,
202 UNICHAR_ID *test_unichar_ids,
203 int *replacement_ambig_part_size,
204 char *replacement_string, int *type);
205 bool InsertIntoTable(UnicharAmbigsVector &table,
206 int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
207 int replacement_ambig_part_size,
208 const char *replacement_string, int type,
209 AmbigSpec *ambig_spec, UNICHARSET *unicharset);
210
211 UnicharAmbigsVector dang_ambigs_;
212 UnicharAmbigsVector replace_ambigs_;
213 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
214 GenericVector<UnicharIdVector *> ambigs_for_adaption_;
215 GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
216};
217
218} // namespace tesseract
219
220#endif // !defined(DISABLED_LEGACY_ENGINE)
221
222#endif // TESSERACT_CCUTIL_AMBIGS_H_
#define MAX_AMBIG_SIZE
Definition: ambigs.h:31
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:918
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
AmbigType
Definition: ambigs.h:37
@ CASE_AMBIG
Definition: ambigs.h:42
@ DEFINITE_AMBIG
Definition: ambigs.h:40
@ REPLACE_AMBIG
Definition: ambigs.h:39
@ AMBIG_TYPE_COUNT
Definition: ambigs.h:44
@ SIMILAR_AMBIG
Definition: ambigs.h:41
@ NOT_AMBIG
Definition: ambigs.h:38
int size() const
Definition: genericvector.h:72
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:91
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:55
static int find_in(const UnicharIdVector &uid_vec, const UNICHAR_ID uid)
Definition: ambigs.h:71
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
Definition: ambigs.h:81
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:126
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:124
static int compare_ambig_specs(const void *spec1, const void *spec2)
Definition: ambigs.h:115
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:125
AmbigType type
Definition: ambigs.h:127
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:146
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:183
const UnicharIdVector * OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const
Definition: ambigs.h:172
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:192
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291