tesseract 4.1.1
Loading...
Searching...
No Matches
cntraining.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revised by: Christy Russon
6 **
7 ** (c) Copyright Hewlett-Packard Company, 1988.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17******************************************************************************/
18
19/*----------------------------------------------------------------------------
20 Include Files and Type Defines
21----------------------------------------------------------------------------*/
22#include "oldlist.h"
23#include "featdefs.h"
24#include "tessopt.h"
25#include "ocrfeatures.h"
26#include "clusttool.h"
27#include "cluster.h"
28#include <cstring>
29#include <cstdio>
30#include <cmath>
31#include "unichar.h"
32#include "commontraining.h"
33
34#define PROGRAM_FEATURE_TYPE "cn"
35
36/*----------------------------------------------------------------------------
37 Private Function Prototypes
38----------------------------------------------------------------------------*/
39
40static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
41 const FEATURE_DESC_STRUCT *feature_desc);
42
43static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
44 bool WriteSigProtos, bool WriteInsigProtos);
45
46/*----------------------------------------------------------------------------
47 Global Data Definitions and Declarations
48----------------------------------------------------------------------------*/
49/* global variable to hold configuration parameters to control clustering */
50//-M 0.025 -B 0.05 -I 0.8 -C 1e-3
51static const CLUSTERCONFIG CNConfig = {
52 elliptical, 0.025, 0.05, 0.8, 1e-3, 0
53};
54
55/*----------------------------------------------------------------------------
56 Public Code
57----------------------------------------------------------------------------*/
58
104int main(int argc, char *argv[]) {
105 tesseract::CheckSharedLibraryVersion();
106
107 // Set the global Config parameters before parsing the command line.
108 Config = CNConfig;
109
110 const char *PageName;
111 LIST CharList = NIL_LIST;
112 CLUSTERER *Clusterer = nullptr;
113 LIST ProtoList = NIL_LIST;
114 LIST NormProtoList = NIL_LIST;
115 LIST pCharList;
116 LABELEDLIST CharSample;
117 FEATURE_DEFS_STRUCT FeatureDefs;
118 InitFeatureDefs(&FeatureDefs);
119
120 ParseArguments(&argc, &argv);
121 int num_fonts = 0;
122 while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
123 printf("Reading %s ...\n", PageName);
124 FILE *TrainingPage = fopen(PageName, "rb");
125 ASSERT_HOST(TrainingPage);
126 if (TrainingPage) {
127 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
128 TrainingPage, &CharList);
129 fclose(TrainingPage);
130 ++num_fonts;
131 }
132 }
133 printf("Clustering ...\n");
134 // To allow an individual font to form a separate cluster,
135 // reduce the min samples:
136 // Config.MinSamples = 0.5 / num_fonts;
137 pCharList = CharList;
138 // The norm protos will count the source protos, so we keep them here in
139 // freeable_protos, so they can be freed later.
140 GenericVector<LIST> freeable_protos;
141 iterate(pCharList) {
142 //Cluster
143 CharSample = reinterpret_cast<LABELEDLIST>first_node(pCharList);
144 Clusterer =
145 SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
146 if (Clusterer == nullptr) { // To avoid a SIGSEGV
147 fprintf(stderr, "Error: nullptr clusterer!\n");
148 return 1;
149 }
150 float SavedMinSamples = Config.MinSamples;
151 // To disable the tendency to produce a single cluster for all fonts,
152 // make MagicSamples an impossible to achieve number:
153 // Config.MagicSamples = CharSample->SampleCount * 10;
154 Config.MagicSamples = CharSample->SampleCount;
155 while (Config.MinSamples > 0.001) {
156 ProtoList = ClusterSamples(Clusterer, &Config);
157 if (NumberOfProtos(ProtoList, true, false) > 0) {
158 break;
159 } else {
160 Config.MinSamples *= 0.95;
161 printf("0 significant protos for %s."
162 " Retrying clustering with MinSamples = %f%%\n",
163 CharSample->Label, Config.MinSamples);
164 }
165 }
166 Config.MinSamples = SavedMinSamples;
167 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
168 freeable_protos.push_back(ProtoList);
169 FreeClusterer(Clusterer);
170 }
171 FreeTrainingSamples(CharList);
172 int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
173 WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
174 FeatureDefs.FeatureDesc[desc_index]);
175 FreeNormProtoList(NormProtoList);
176 for (int i = 0; i < freeable_protos.size(); ++i) {
177 FreeProtoList(&freeable_protos[i]);
178 }
179 printf ("\n");
180 return 0;
181} // main
182
183/*----------------------------------------------------------------------------
184 Private Code
185----------------------------------------------------------------------------*/
186
187/*----------------------------------------------------------------------------*/
196static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
197 const FEATURE_DESC_STRUCT *feature_desc) {
198 FILE *File;
199 STRING Filename;
200 LABELEDLIST LabeledProto;
201 int N;
202
203 Filename = "";
204 if (Directory != nullptr && Directory[0] != '\0') {
205 Filename += Directory;
206 Filename += "/";
207 }
208 Filename += "normproto";
209 printf ("\nWriting %s ...", Filename.string());
210 File = fopen(Filename.string(), "wb");
211 ASSERT_HOST(File);
212 fprintf(File, "%0d\n", feature_desc->NumParams);
213 WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
214 iterate(LabeledProtoList)
215 {
216 LabeledProto = reinterpret_cast<LABELEDLIST>first_node (LabeledProtoList);
217 N = NumberOfProtos(LabeledProto->List, true, false);
218 if (N < 1) {
219 printf ("\nError! Not enough protos for %s: %d protos"
220 " (%d significant protos"
221 ", %d insignificant protos)\n",
222 LabeledProto->Label, N,
223 NumberOfProtos(LabeledProto->List, true, false),
224 NumberOfProtos(LabeledProto->List, false, true));
225 exit(1);
226 }
227 fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
228 WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
229 }
230 fclose (File);
231
232} // WriteNormProtos
233
234/*-------------------------------------------------------------------------*/
235
236static void WriteProtos(FILE* File, uint16_t N, LIST ProtoList,
237 bool WriteSigProtos, bool WriteInsigProtos)
238{
239 PROTOTYPE *Proto;
240
241 // write prototypes
242 iterate(ProtoList)
243 {
244 Proto = reinterpret_cast<PROTOTYPE*>first_node(ProtoList);
245 if ((Proto->Significant && WriteSigProtos) ||
246 (! Proto->Significant && WriteInsigProtos))
247 WritePrototype(File, N, Proto);
248 }
249} // WriteProtos
#define ASSERT_HOST(x)
Definition: errcode.h:88
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:514
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:483
@ elliptical
Definition: cluster.h:44
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:280
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:255
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NIL_LIST
Definition: oldlist.h:76
int main(int argc, char *argv[])
Definition: cntraining.cpp:104
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:34
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
void FreeTrainingSamples(LIST CharList)
const char * GetNextFilename(int argc, const char *const *argv)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int push_back(T object)
int size() const
Definition: genericvector.h:72
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
int MagicSamples
Definition: cluster.h:53
float MinSamples
Definition: cluster.h:48
bool Significant
Definition: cluster.h:64
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:47
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:55