tesseract 4.1.1
Loading...
Searching...
No Matches
normmatch.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: normmatch.cpp
3 ** Purpose: Simple matcher based on character normalization features.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17/*----------------------------------------------------------------------------
18 Include Files and Type Defines
19----------------------------------------------------------------------------*/
20#include "normmatch.h"
21
22#include <cstdio>
23#include <cmath>
24#include <sstream> // for std::istringstream
25
26#include "classify.h"
27#include "clusttool.h"
28#include "emalloc.h"
29#include "helpers.h"
30#include "normfeat.h"
31#include "unicharset.h"
32#include "params.h"
33
35{
40};
41
42/*----------------------------------------------------------------------------
43 Private Code
44----------------------------------------------------------------------------*/
45
53static double NormEvidenceOf(double NormAdj) {
55
56 if (classify_norm_adj_curl == 3) {
57 NormAdj = NormAdj * NormAdj * NormAdj;
58 } else if (classify_norm_adj_curl == 2) {
59 NormAdj = NormAdj * NormAdj;
60 } else {
61 NormAdj = pow(NormAdj, classify_norm_adj_curl);
62 }
63 return (1.0 / (1.0 + NormAdj));
64}
65
66/*----------------------------------------------------------------------------
67 Variables
68----------------------------------------------------------------------------*/
69
// Tunable parameters for the normalization adjustment, declared through the
// double_VAR macro as (name, default value, description).
// classify_norm_adj_midpoint (default 32.0): divisor applied to the total
// distance when ComputeNormMatch prints its "scaled" debug value.
71double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// classify_norm_adj_curl (default 2.0): exponent applied to the distance in
// NormEvidenceOf.
72double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// Factor applied to the width (CharNormRy) term before it is added to the
// match distance in ComputeNormMatch.
74const double kWidthErrorWeighting = 0.125;
75
76/*----------------------------------------------------------------------------
77 Public Code
78----------------------------------------------------------------------------*/
79/*---------------------------------------------------------------------------*/
80namespace tesseract {
95 const FEATURE_STRUCT& feature,
96 bool DebugMatch) {
97 LIST Protos;
98 float BestMatch;
99 float Match;
100 float Delta;
101 PROTOTYPE *Proto;
102 int ProtoId;
103
104 if (ClassId >= NormProtos->NumProtos) {
105 ClassId = NO_CLASS;
106 }
107
108 /* handle requests for classification as noise */
109 if (ClassId == NO_CLASS) {
110 /* kludge - clean up constants and make into control knobs later */
111 Match = (feature.Params[CharNormLength] *
112 feature.Params[CharNormLength] * 500.0 +
113 feature.Params[CharNormRx] *
114 feature.Params[CharNormRx] * 8000.0 +
115 feature.Params[CharNormRy] *
116 feature.Params[CharNormRy] * 8000.0);
117 return (1.0 - NormEvidenceOf(Match));
118 }
119
120 BestMatch = FLT_MAX;
121 Protos = NormProtos->Protos[ClassId];
122
123 if (DebugMatch) {
124 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
125 }
126
127 ProtoId = 0;
128 iterate(Protos) {
129 Proto = reinterpret_cast<PROTOTYPE *>first_node (Protos);
130 Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
131 Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
132 if (DebugMatch) {
133 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
134 Proto->Mean[CharNormY], Delta,
135 Proto->Weight.Elliptical[CharNormY], Match);
136 }
137 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
138 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
139 if (DebugMatch) {
140 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
141 Proto->Mean[CharNormRx], Delta,
142 Proto->Weight.Elliptical[CharNormRx], Match);
143 }
144 // Ry is width! See intfx.cpp.
145 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
146 if (DebugMatch) {
147 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
148 Proto->Mean[CharNormRy], Delta,
150 }
151 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
152 Delta *= kWidthErrorWeighting;
153 Match += Delta;
154 if (DebugMatch) {
155 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
156 Match, Match / classify_norm_adj_midpoint,
157 NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
158 }
159
160 if (Match < BestMatch)
161 BestMatch = Match;
162
163 ProtoId++;
164 }
165 return 1.0 - NormEvidenceOf(BestMatch);
166} /* ComputeNormMatch */
167
169 if (NormProtos != nullptr) {
170 for (int i = 0; i < NormProtos->NumProtos; i++)
175 NormProtos = nullptr;
176 }
177}
178} // namespace tesseract
179
180/*---------------------------------------------------------------------------*/
181namespace tesseract {
192 int i;
193 char unichar[2 * UNICHAR_LEN + 1];
194 UNICHAR_ID unichar_id;
195 LIST Protos;
196 int NumProtos;
197
198 /* allocate and initialization data structure */
199 NormProtos = static_cast<NORM_PROTOS *>(Emalloc (sizeof (NORM_PROTOS)));
201 NormProtos->Protos = static_cast<LIST *>(Emalloc (NormProtos->NumProtos * sizeof(LIST)));
202 for (i = 0; i < NormProtos->NumProtos; i++)
204
205 /* read file header and save in data structure */
208
209 /* read protos for each class into a separate list */
210 const int kMaxLineSize = 100;
211 char line[kMaxLineSize];
212 while (fp->FGets(line, kMaxLineSize) != nullptr) {
213 std::istringstream stream(line);
214 stream >> unichar >> NumProtos;
215 if (stream.fail()) {
216 continue;
217 }
218 if (unicharset.contains_unichar(unichar)) {
219 unichar_id = unicharset.unichar_to_id(unichar);
220 Protos = NormProtos->Protos[unichar_id];
221 for (i = 0; i < NumProtos; i++)
222 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
223 NormProtos->Protos[unichar_id] = Protos;
224 } else {
225 tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
226 unichar);
227 for (i = 0; i < NumProtos; i++)
229 }
230 }
231 return (NormProtos);
232} /* ReadNormProtos */
233} // namespace tesseract
#define double_VAR(name, val, comment)
Definition: params.h:312
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define UNICHAR_LEN
Definition: unichar.h:30
int UNICHAR_ID
Definition: unichar.h:34
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
void FreePrototype(void *arg)
Definition: cluster.cpp:549
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:120
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:140
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:176
@ CharNormRx
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormLength
Definition: normfeat.h:30
const double kWidthErrorWeighting
Definition: normmatch.cpp:74
double classify_norm_adj_midpoint
Definition: normmatch.cpp:71
double classify_norm_adj_curl
Definition: normmatch.cpp:72
void * Emalloc(int Size)
Definition: emalloc.cpp:31
void Efree(void *ptr)
Definition: emalloc.cpp:45
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:227
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NIL_LIST
Definition: oldlist.h:76
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
#define NO_CLASS
Definition: matchdefs.h:35
UNICHARSET unicharset
Definition: ccutil.h:73
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:249
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:190
NORM_PROTOS * NormProtos
Definition: classify.h:527
float * Elliptical
Definition: cluster.h:60
float * Mean
Definition: cluster.h:74
FLOATUNION Weight
Definition: cluster.h:79
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:37
LIST * Protos
Definition: normmatch.cpp:38
float Params[1]
Definition: ocrfeatures.h:61