tesseract 4.1.1
Loading...
Searching...
No Matches
mftraining.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: mftraining.cpp
3 ** Purpose: Separates training pages into files for each character.
4 ** Strips from files only the features and their parameters of
5 ** the feature type mf.
6 ** Author: Dan Johnson
7 ** Revisment: Christy Russon
8 **
9 ** (c) Copyright Hewlett-Packard Company, 1988.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19******************************************************************************/
20/*----------------------------------------------------------------------------
21 Include Files and Type Defines
22----------------------------------------------------------------------------*/
23
24#define _USE_MATH_DEFINES // for M_PI
25#ifdef HAVE_CONFIG_H
26#include "config_auto.h"
27#endif
28
29#include <cmath> // for M_PI
30#include <cstring>
31#include <cstdio>
32
33#include "classify.h"
34#include "cluster.h"
35#include "clusttool.h"
36#include "commontraining.h"
37#include "featdefs.h"
38#include "fontinfo.h"
39#include "genericvector.h"
40#include "indexmapbidi.h"
41#include "intproto.h"
42#include "mastertrainer.h"
43#include "mergenf.h"
44#include "mf.h"
45#include "ocrfeatures.h"
46#include "oldlist.h"
47#include "protos.h"
48#include "shapetable.h"
49#include "tessopt.h"
50#include "tprintf.h"
51#include "unicity_table.h"
52
57
58// Max length of a fake shape label.
59const int kMaxShapeLabelLength = 10;
60
61/*----------------------------------------------------------------------------
62 Public Code
63-----------------------------------------------------------------------------*/
64#ifndef GRAPHICS_DISABLED
65static void DisplayProtoList(const char* ch, LIST protolist) {
66 void* window = c_create_window("Char samples", 50, 200,
67 520, 520, -130.0, 130.0, -130.0, 130.0);
68 LIST proto = protolist;
69 iterate(proto) {
70 PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
71 if (prototype->Significant)
73 else if (prototype->NumSamples == 0)
74 c_line_color_index(window, Blue);
75 else if (prototype->Merged)
77 else
78 c_line_color_index(window, Red);
79 float x = CenterX(prototype->Mean);
80 float y = CenterY(prototype->Mean);
81 double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
82 float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
83 float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
84 c_move(window, (x - dx) * 256, (y - dy) * 256);
85 c_draw(window, (x + dx) * 256, (y + dy) * 256);
86 if (prototype->Significant)
87 tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
88 x, y, dx, dy, prototype->NumSamples);
89 else if (prototype->NumSamples > 0 && !prototype->Merged)
90 tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
91 x, y, dx, dy, prototype->NumSamples);
92 }
93 c_make_current(window);
94}
95#endif // GRAPHICS_DISABLED
96
97// Helper to run clustering on a single config.
98// Mostly copied from the old mftraining, but with renamed variables.
99static LIST ClusterOneConfig(int shape_id, const char* class_label,
100 LIST mf_classes,
101 const ShapeTable& shape_table,
102 MasterTrainer* trainer) {
103 int num_samples;
104 CLUSTERER *clusterer = trainer->SetupForClustering(shape_table,
106 shape_id,
107 &num_samples);
108 Config.MagicSamples = num_samples;
109 LIST proto_list = ClusterSamples(clusterer, &Config);
110 CleanUpUnusedData(proto_list);
111
112 // Merge protos where reasonable to make more of them significant by
113 // representing almost all samples of the class/font.
114 MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
115 #ifndef GRAPHICS_DISABLED
116 if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0)
117 DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
118 #endif // GRAPHICS_DISABLED
119 // Delete the protos that will not be used in the inttemp output file.
120 proto_list = RemoveInsignificantProtos(proto_list, true,
121 false,
122 clusterer->SampleSize);
123 FreeClusterer(clusterer);
124 MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
125 if (merge_class == nullptr) {
126 merge_class = NewLabeledClass(class_label);
127 mf_classes = push(mf_classes, merge_class);
128 }
129 int config_id = AddConfigToClass(merge_class->Class);
130 merge_class->Class->font_set.push_back(shape_id);
131 LIST proto_it = proto_list;
132 iterate(proto_it) {
133 PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE*>(first_node(proto_it));
134 // See if proto can be approximated by existing proto.
135 int p_id = FindClosestExistingProto(merge_class->Class,
136 merge_class->NumMerged, prototype);
137 if (p_id == NO_PROTO) {
138 // Need to make a new proto, as it doesn't match anything.
139 p_id = AddProtoToClass(merge_class->Class);
140 MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
141 merge_class->NumMerged[p_id] = 1;
142 } else {
143 PROTO_STRUCT dummy_proto;
144 MakeNewFromOld(&dummy_proto, prototype);
145 // Merge with the similar proto.
146 ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
147 static_cast<float>(merge_class->NumMerged[p_id]),
148 1.0,
149 ProtoIn(merge_class->Class, p_id));
150 merge_class->NumMerged[p_id]++;
151 }
152 AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
153 }
154 FreeProtoList(&proto_list);
155 return mf_classes;
156}
157
158// Helper to setup the config map.
159// Setup an index mapping from the shapes in the shape table to the classes
160// that will be trained. In keeping with the original design, each shape
161// with the same list of unichars becomes a different class and the configs
162// represent the different combinations of fonts.
163static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) {
164 int num_configs = shape_table->NumShapes();
165 config_map->Init(num_configs, true);
166 config_map->Setup();
167 for (int c1 = 0; c1 < num_configs; ++c1) {
168 // Only process ids that are not already merged.
169 if (config_map->SparseToCompact(c1) == c1) {
170 Shape* shape1 = shape_table->MutableShape(c1);
171 // Find all the subsequent shapes that are equal.
172 for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
173 if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
174 config_map->Merge(c1, c2);
175 }
176 }
177 }
178 }
179 config_map->CompleteMerges();
180}
181
209int main (int argc, char **argv) {
210 tesseract::CheckSharedLibraryVersion();
211
212 ParseArguments(&argc, &argv);
213
214 ShapeTable* shape_table = nullptr;
215 STRING file_prefix;
216 // Load the training data.
217 MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
218 false,
219 &shape_table,
220 &file_prefix);
221 if (trainer == nullptr) return 1; // Failed.
222
223 // Setup an index mapping from the shapes in the shape table to the classes
224 // that will be trained. In keeping with the original design, each shape
225 // with the same list of unichars becomes a different class and the configs
226 // represent the different combinations of fonts.
227 IndexMapBiDi config_map;
228 SetupConfigMap(shape_table, &config_map);
229
230 WriteShapeTable(file_prefix, *shape_table);
231 // If the shape_table is flat, then either we didn't run shape clustering, or
232 // it did nothing, so we just output the trainer's unicharset.
233 // Otherwise shape_set will hold a fake unicharset with an entry for each
234 // shape in the shape table, and we will output that instead.
235 UNICHARSET shape_set;
236 const UNICHARSET* unicharset = &trainer->unicharset();
237 // If we ran shapeclustering (and it worked) then at least one shape will
238 // have multiple unichars, so we have to build a fake unicharset.
239 if (shape_table->AnyMultipleUnichars()) {
240 unicharset = &shape_set;
241 // Now build a fake unicharset for the compact shape space to keep the
242 // output modules happy that we are doing things correctly.
243 int num_shapes = config_map.CompactSize();
244 for (int s = 0; s < num_shapes; ++s) {
245 char shape_label[kMaxShapeLabelLength + 1];
246 snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
247 shape_set.unichar_insert(shape_label);
248 }
249 }
250
251 // Now train each config separately.
252 int num_configs = shape_table->NumShapes();
253 LIST mf_classes = NIL_LIST;
254 for (int s = 0; s < num_configs; ++s) {
255 int unichar_id, font_id;
256 if (unicharset == &shape_set) {
257 // Using fake unichar_ids from the config_map/shape_set.
258 unichar_id = config_map.SparseToCompact(s);
259 } else {
260 // Get the real unichar_id from the shape table/unicharset.
261 shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
262 }
263 const char* class_label = unicharset->id_to_unichar(unichar_id);
264 mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
265 trainer);
266 }
267 STRING inttemp_file = file_prefix;
268 inttemp_file += "inttemp";
269 STRING pffmtable_file = file_prefix;
270 pffmtable_file += "pffmtable";
271 CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
272 // Now write the inttemp and pffmtable.
273 trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
274 *shape_table, float_classes,
275 inttemp_file.string(),
276 pffmtable_file.string());
277 for (int c = 0; c < unicharset->size(); ++c) {
278 FreeClassFields(&float_classes[c]);
279 }
280 delete [] float_classes;
281 FreeLabeledClassList(mf_classes);
282 delete trainer;
283 delete shape_table;
284 printf("Done!\n");
285 if (!FLAGS_test_ch.empty()) {
286 // If we are displaying debug window(s), wait for the user to look at them.
287 printf("Hit return to exit...\n");
288 while (getchar() != '\n');
289 }
290 return 0;
291} /* main */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:514
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:483
int AddProtoToClass(CLASS_TYPE Class)
Definition: protos.cpp:84
void FreeClassFields(CLASS_TYPE Class)
Definition: protos.cpp:138
int AddConfigToClass(CLASS_TYPE Class)
Definition: protos.cpp:47
#define AddProtoToConfig(Pid, Config)
Definition: protos.h:75
#define ProtoIn(Class, Pid)
Definition: protos.h:84
void c_draw(void *win, double x, double y)
Definition: callcpp.cpp:80
void c_move(void *win, double x, double y)
Definition: callcpp.cpp:71
void c_line_color_index(void *win, C_COL index)
Definition: callcpp.cpp:62
void c_make_current(void *win)
Definition: callcpp.cpp:89
ScrollView * c_create_window(const char *name, int16_t xpos, int16_t ypos, int16_t xsize, int16_t ysize, double xmin, double xmax, double ymin, double ymax)
Definition: callcpp.cpp:47
@ Magenta
Definition: callcpp.h:35
@ Green
Definition: callcpp.h:32
@ Red
Definition: callcpp.h:30
@ Blue
Definition: callcpp.h:34
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NIL_LIST
Definition: oldlist.h:76
#define NO_PROTO
Definition: matchdefs.h:41
void FreeLabeledClassList(LIST ClassList)
CLUSTERCONFIG Config
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void ParseArguments(int *argc, char ***argv)
void CleanUpUnusedData(LIST ProtoList)
FEATURE_DEFS_STRUCT feature_defs
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
MERGE_CLASS FindClass(LIST List, const char *Label)
MERGE_CLASS NewLabeledClass(const char *Label)
void ComputeMergedProto(PROTO p1, PROTO p2, float w1, float w2, PROTO MergedProto)
Definition: mergenf.cpp:123
void MakeNewFromOld(PROTO New, PROTOTYPE *Old)
Definition: mergenf.cpp:193
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype)
Definition: mergenf.cpp:155
#define CenterX(M)
Definition: mergenf.h:50
#define CenterY(M)
Definition: mergenf.h:51
#define LengthOf(M)
Definition: mergenf.h:52
#define OrientationOf(M)
Definition: mergenf.h:53
int main(int argc, char **argv)
Definition: mftraining.cpp:209
const int kMaxShapeLabelLength
Definition: mftraining.cpp:59
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
int push_back(T object)
Add an element in the table.
int CompactSize() const
Definition: indexmapbidi.h:61
void Init(int size, bool all_mapped)
bool Merge(int compact_index1, int compact_index2)
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
int size() const
Definition: unicharset.h:341
int MagicSamples
Definition: cluster.h:53
unsigned NumSamples
Definition: cluster.h:71
float * Mean
Definition: cluster.h:74
bool Significant
Definition: cluster.h:64
bool Merged
Definition: cluster.h:65
int16_t SampleSize
Definition: cluster.h:83
UnicityTableEqEq< int > font_set
Definition: protos.h:61
CONFIGS Configurations
Definition: protos.h:60
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:217
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:444
int NumShapes() const
Definition: shapetable.h:274
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
Shape * MutableShape(int shape_id)
Definition: shapetable.h:322
CLASS_TYPE Class
int NumMerged[MAX_NUM_PROTOS]
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
const UNICHARSET & unicharset() const
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)