tesseract 4.1.1
Loading...
Searching...
No Matches
commontraining.cpp
Go to the documentation of this file.
1// Copyright 2008 Google Inc. All Rights Reserved.
2// Author: scharron@google.com (Samuel Charron)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14#define _USE_MATH_DEFINES // for M_PI
15#include "commontraining.h"
16#include <algorithm>
17#include <cmath> // for M_PI
18
19#ifdef DISABLED_LEGACY_ENGINE
20
21#include "params.h"
22#include "tessopt.h"
23#include "tprintf.h"
24
// Command-line flags available to the training tools when the legacy engine
// is compiled out (DISABLED_LEGACY_ENGINE). Each entry gives the flag name,
// its default value and the help text shown for it.
25INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
26INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
27STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
// Single-letter flags name input/output locations.
28STRING_PARAM_FLAG(D, "", "Directory to write output files to");
29STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
30STRING_PARAM_FLAG(X, "", "File listing font xheights");
31STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
32STRING_PARAM_FLAG(O, "", "File to write unicharset to");
33STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
34STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
35
47void ParseArguments(int* argc, char ***argv) {
48 STRING usage;
49 if (*argc) {
50 usage += (*argv)[0];
51 usage += " -v | --version | ";
52 usage += (*argv)[0];
53 }
54 usage += " [.tr files ...]";
55 tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
56}
57
58#else
59
60#include "allheaders.h"
61#include "ccutil.h"
62#include "classify.h"
63#include "cluster.h"
64#include "clusttool.h"
65#include "emalloc.h"
66#include "featdefs.h"
67#include "fontinfo.h"
68#include "intfeaturespace.h"
69#include "mastertrainer.h"
70#include "mf.h"
71#include "oldlist.h"
72#include "params.h"
73#include "shapetable.h"
74#include "tessdatamanager.h"
75#include "tessopt.h"
76#include "tprintf.h"
77#include "unicity_table.h"
78
83
84// Global Variables.
85
86// global variable to hold configuration parameters to control clustering
87// -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
// Default clustering configuration. The clusterconfig_* flags below take
// their default values from the members of this object.
88CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
// Supplies the parameter vectors (ccutil.params()) that ParseArguments passes
// along when a config file is given.
90static CCUtil ccutil;
91
// General trainer flags: name, default value, help text.
92INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
93static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
94static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
// Single-letter flags name input/output locations.
95STRING_PARAM_FLAG(D, "", "Directory to write output files to");
96STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
97STRING_PARAM_FLAG(X, "", "File listing font xheights");
98STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
99STRING_PARAM_FLAG(O, "", "File to write unicharset to");
100STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
101STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
// Clustering flags; ParseArguments clamps each of these to [0, 1].
102static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
103 "Min number of samples per proto as % of total");
104static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
105 "Max percentage of samples in a cluster which have more"
106 " than 1 feature in that cluster");
107static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
108 "Desired independence between dimensions");
109static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
110 "Desired confidence in prototypes created");
111
122void ParseArguments(int* argc, char ***argv) {
123 STRING usage;
124 if (*argc) {
125 usage += (*argv)[0];
126 usage += " -v | --version | ";
127 usage += (*argv)[0];
128 }
129 usage += " [.tr files ...]";
130 tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
131 // Record the index of the first non-flag argument to 1, since we set
132 // remove_flags to true when parsing the flags.
133 tessoptind = 1;
134 // Set some global values based on the flags.
136 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
138 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
140 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
142 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
143 // Set additional parameters from config file if specified.
144 if (!FLAGS_configfile.empty()) {
146 FLAGS_configfile.c_str(),
148 ccutil.params());
149 }
150}
151
152namespace tesseract {
153// Helper loads shape table from the given file.
154ShapeTable* LoadShapeTable(const STRING& file_prefix) {
155 ShapeTable* shape_table = nullptr;
156 STRING shape_table_file = file_prefix;
157 shape_table_file += kShapeTableFileSuffix;
158 TFile shape_fp;
159 if (shape_fp.Open(shape_table_file.string(), nullptr)) {
160 shape_table = new ShapeTable;
161 if (!shape_table->DeSerialize(&shape_fp)) {
162 delete shape_table;
163 shape_table = nullptr;
164 tprintf("Error: Failed to read shape table %s\n",
165 shape_table_file.string());
166 } else {
167 int num_shapes = shape_table->NumShapes();
168 tprintf("Read shape table %s of %d shapes\n",
169 shape_table_file.string(), num_shapes);
170 }
171 } else {
172 tprintf("Warning: No shape table file present: %s\n",
173 shape_table_file.string());
174 }
175 return shape_table;
176}
177
178// Helper to write the shape_table.
179void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
180 STRING shape_table_file = file_prefix;
181 shape_table_file += kShapeTableFileSuffix;
182 FILE* fp = fopen(shape_table_file.string(), "wb");
183 if (fp != nullptr) {
184 if (!shape_table.Serialize(fp)) {
185 fprintf(stderr, "Error writing shape table: %s\n",
186 shape_table_file.string());
187 }
188 fclose(fp);
189 } else {
190 fprintf(stderr, "Error creating shape table: %s\n",
191 shape_table_file.string());
192 }
193}
194
211MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
212 bool replication,
213 ShapeTable** shape_table,
214 STRING* file_prefix) {
217 *file_prefix = "";
218 if (!FLAGS_D.empty()) {
219 *file_prefix += FLAGS_D.c_str();
220 *file_prefix += "/";
221 }
222 // If we are shape clustering (nullptr shape_table) or we successfully load
223 // a shape_table written by a previous shape clustering, then
224 // shape_analysis will be true, meaning that the MasterTrainer will replace
225 // some members of the unicharset with their fragments.
226 bool shape_analysis = false;
227 if (shape_table != nullptr) {
228 *shape_table = LoadShapeTable(*file_prefix);
229 if (*shape_table != nullptr) shape_analysis = true;
230 } else {
231 shape_analysis = true;
232 }
234 shape_analysis,
235 replication,
236 FLAGS_debug_level);
239 trainer->LoadUnicharset(FLAGS_U.c_str());
240 // Get basic font information from font_properties.
241 if (!FLAGS_F.empty()) {
242 if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
243 delete trainer;
244 return nullptr;
245 }
246 }
247 if (!FLAGS_X.empty()) {
248 if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
249 delete trainer;
250 return nullptr;
251 }
252 }
253 trainer->SetFeatureSpace(fs);
254 const char* page_name;
255 // Load training data from .tr files on the command line.
256 while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
257 tprintf("Reading %s ...\n", page_name);
258 trainer->ReadTrainingSamples(page_name, feature_defs, false);
259
260 // If there is a file with [lang].[fontname].exp[num].fontinfo present,
261 // read font spacing information in to fontinfo_table.
262 int pagename_len = strlen(page_name);
263 char* fontinfo_file_name = new char[pagename_len + 7];
264 strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
265 strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
266 trainer->AddSpacingInfo(fontinfo_file_name);
267 delete[] fontinfo_file_name;
268
269 // Load the images into memory if required by the classifier.
270 if (FLAGS_load_images) {
271 STRING image_name = page_name;
272 // Chop off the tr and replace with tif. Extension must be tif!
273 image_name.truncate_at(image_name.length() - 2);
274 image_name += "tif";
275 trainer->LoadPageImages(image_name.string());
276 }
277 }
278 trainer->PostLoadCleanup();
279 // Write the master trainer if required.
280 if (!FLAGS_output_trainer.empty()) {
281 FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
282 if (fp == nullptr) {
283 tprintf("Can't create saved trainer data!\n");
284 } else {
285 trainer->Serialize(fp);
286 fclose(fp);
287 }
288 }
289 trainer->PreTrainingSetup();
290 if (!FLAGS_O.empty() &&
291 !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
292 fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
293 delete trainer;
294 return nullptr;
295 }
296 if (shape_table != nullptr) {
297 // If we previously failed to load a shapetable, then shape clustering
298 // wasn't run so make a flat one now.
299 if (*shape_table == nullptr) {
300 *shape_table = new ShapeTable;
301 trainer->SetupFlatShapeTable(*shape_table);
302 tprintf("Flat shape table summary: %s\n",
303 (*shape_table)->SummaryStr().string());
304 }
305 (*shape_table)->set_unicharset(trainer->unicharset());
306 }
307 return trainer;
308}
309
310} // namespace tesseract.
311
312/*---------------------------------------------------------------------------*/
323const char *GetNextFilename(int argc, const char* const * argv) {
324 if (tessoptind < argc)
325 return argv[tessoptind++];
326 else
327 return nullptr;
328} /* GetNextFilename */
329
330/*---------------------------------------------------------------------------*/
340LABELEDLIST FindList(LIST List, char* Label) {
341 LABELEDLIST LabeledList;
342
343 iterate (List)
344 {
345 LabeledList = reinterpret_cast<LABELEDLIST>first_node (List);
346 if (strcmp (LabeledList->Label, Label) == 0)
347 return (LabeledList);
348 }
349 return (nullptr);
350
351} /* FindList */
352
353/*---------------------------------------------------------------------------*/
361LABELEDLIST NewLabeledList(const char* Label) {
362 LABELEDLIST LabeledList;
363
364 LabeledList = static_cast<LABELEDLIST>(Emalloc (sizeof (LABELEDLISTNODE)));
365 LabeledList->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
366 strcpy (LabeledList->Label, Label);
367 LabeledList->List = NIL_LIST;
368 LabeledList->SampleCount = 0;
369 LabeledList->font_sample_count = 0;
370 return (LabeledList);
371
372} /* NewLabeledList */
373
374/*---------------------------------------------------------------------------*/
375// TODO(rays) This is now used only by cntraining. Convert cntraining to use
376// the new method or get rid of it entirely.
389void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_definitions,
390 const char *feature_name, int max_samples,
391 UNICHARSET* unicharset,
392 FILE* file, LIST* training_samples) {
393 char buffer[2048];
394 char unichar[UNICHAR_LEN + 1];
395 LABELEDLIST char_sample;
396 FEATURE_SET feature_samples;
397 CHAR_DESC char_desc;
398 uint32_t feature_type =
399 ShortNameToFeatureType(feature_definitions, feature_name);
400
401 // Zero out the font_sample_count for all the classes.
402 LIST it = *training_samples;
403 iterate(it) {
404 char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
405 char_sample->font_sample_count = 0;
406 }
407
408 while (fgets(buffer, 2048, file) != nullptr) {
409 if (buffer[0] == '\n')
410 continue;
411
412 sscanf(buffer, "%*s %s", unichar);
413 if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
414 unicharset->unichar_insert(unichar);
415 if (unicharset->size() > MAX_NUM_CLASSES) {
416 tprintf("Error: Size of unicharset in training is "
417 "greater than MAX_NUM_CLASSES\n");
418 exit(1);
419 }
420 }
421 char_sample = FindList(*training_samples, unichar);
422 if (char_sample == nullptr) {
423 char_sample = NewLabeledList(unichar);
424 *training_samples = push(*training_samples, char_sample);
425 }
426 char_desc = ReadCharDescription(feature_definitions, file);
427 feature_samples = char_desc->FeatureSets[feature_type];
428 if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
429 char_sample->List = push(char_sample->List, feature_samples);
430 char_sample->SampleCount++;
431 char_sample->font_sample_count++;
432 } else {
433 FreeFeatureSet(feature_samples);
434 }
435 for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
436 if (feature_type != i)
437 FreeFeatureSet(char_desc->FeatureSets[i]);
438 }
439 free(char_desc);
440 }
441} // ReadTrainingSamples
442
443
444/*---------------------------------------------------------------------------*/
450void FreeTrainingSamples(LIST CharList) {
451 LABELEDLIST char_sample;
452 FEATURE_SET FeatureSet;
453 LIST FeatureList;
454
455 LIST nodes = CharList;
456 iterate(CharList) { /* iterate through all of the fonts */
457 char_sample = reinterpret_cast<LABELEDLIST>first_node(CharList);
458 FeatureList = char_sample->List;
459 iterate(FeatureList) { /* iterate through all of the classes */
460 FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
461 FreeFeatureSet(FeatureSet);
462 }
463 FreeLabeledList(char_sample);
464 }
465 destroy(nodes);
466} /* FreeTrainingSamples */
467
468/*---------------------------------------------------------------------------*/
476void FreeLabeledList(LABELEDLIST LabeledList) {
477 destroy(LabeledList->List);
478 free(LabeledList->Label);
479 free(LabeledList);
480} /* FreeLabeledList */
481
482/*---------------------------------------------------------------------------*/
495 LABELEDLIST char_sample,
496 const char* program_feature_type) {
497 uint16_t N;
498 int i, j;
499 float* Sample = nullptr;
500 CLUSTERER *Clusterer;
501 int32_t CharID;
502 LIST FeatureList = nullptr;
503 FEATURE_SET FeatureSet = nullptr;
504
505 int32_t desc_index =
506 ShortNameToFeatureType(FeatureDefs, program_feature_type);
507 N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
508 Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
509
510 FeatureList = char_sample->List;
511 CharID = 0;
512 iterate(FeatureList) {
513 FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
514 for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
515 if (Sample == nullptr) Sample = static_cast<float*>(Emalloc(N * sizeof(float)));
516 for (j = 0; j < N; j++)
517 Sample[j] = FeatureSet->Features[i]->Params[j];
518 MakeSample (Clusterer, Sample, CharID);
519 }
520 CharID++;
521 }
522 free(Sample);
523 return Clusterer;
524
525} /* SetUpForClustering */
526
527/*------------------------------------------------------------------------*/
528void MergeInsignificantProtos(LIST ProtoList, const char* label,
529 CLUSTERER* Clusterer,
530 CLUSTERCONFIG* clusterconfig) {
531 PROTOTYPE* Prototype;
532 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
533
534 LIST pProtoList = ProtoList;
535 iterate(pProtoList) {
536 Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
537 if (Prototype->Significant || Prototype->Merged)
538 continue;
539 float best_dist = 0.125;
540 PROTOTYPE* best_match = nullptr;
541 // Find the nearest alive prototype.
542 LIST list_it = ProtoList;
543 iterate(list_it) {
544 PROTOTYPE* test_p = reinterpret_cast<PROTOTYPE *>first_node (list_it);
545 if (test_p != Prototype && !test_p->Merged) {
546 float dist = ComputeDistance(Clusterer->SampleSize,
547 Clusterer->ParamDesc,
548 Prototype->Mean, test_p->Mean);
549 if (dist < best_dist) {
550 best_match = test_p;
551 best_dist = dist;
552 }
553 }
554 }
555 if (best_match != nullptr && !best_match->Significant) {
556 if (debug)
557 tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
558 best_match->NumSamples, Prototype->NumSamples,
559 best_match->Mean[0], best_match->Mean[1],
560 Prototype->Mean[0], Prototype->Mean[1]);
561 best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
562 Clusterer->ParamDesc,
563 best_match->NumSamples,
564 Prototype->NumSamples,
565 best_match->Mean,
566 best_match->Mean, Prototype->Mean);
567 Prototype->NumSamples = 0;
568 Prototype->Merged = true;
569 } else if (best_match != nullptr) {
570 if (debug)
571 tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
572 Prototype->Mean[0], Prototype->Mean[1],
573 best_match->Mean[0], best_match->Mean[1]);
574 Prototype->Merged = true;
575 }
576 }
577 // Mark significant those that now have enough samples.
578 int min_samples =
579 static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
580 pProtoList = ProtoList;
581 iterate(pProtoList) {
582 Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
583 // Process insignificant protos that do not match a green one
584 if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
585 !Prototype->Merged) {
586 if (debug)
587 tprintf("Red proto at %g,%g becoming green\n",
588 Prototype->Mean[0], Prototype->Mean[1]);
589 Prototype->Significant = true;
590 }
591 }
592} /* MergeInsignificantProtos */
593
594/*-----------------------------------------------------------------------------*/
596 LIST ProtoList)
597{
598 PROTOTYPE* Prototype;
599
600 iterate(ProtoList)
601 {
602 Prototype = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
603 free(Prototype->Variance.Elliptical);
604 Prototype->Variance.Elliptical = nullptr;
605 free(Prototype->Magnitude.Elliptical);
606 Prototype->Magnitude.Elliptical = nullptr;
607 free(Prototype->Weight.Elliptical);
608 Prototype->Weight.Elliptical = nullptr;
609 }
610}
611
612/*------------------------------------------------------------------------*/
614 LIST ProtoList,
615 bool KeepSigProtos,
616 bool KeepInsigProtos,
617 int N)
618
619{
620 LIST NewProtoList = NIL_LIST;
621 LIST pProtoList;
622 PROTOTYPE* Proto;
623 PROTOTYPE* NewProto;
624 int i;
625
626 pProtoList = ProtoList;
627 iterate(pProtoList)
628 {
629 Proto = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
630 if ((Proto->Significant && KeepSigProtos) ||
631 (!Proto->Significant && KeepInsigProtos))
632 {
633 NewProto = static_cast<PROTOTYPE *>(Emalloc(sizeof(PROTOTYPE)));
634
635 NewProto->Mean = static_cast<float *>(Emalloc(N * sizeof(float)));
636 NewProto->Significant = Proto->Significant;
637 NewProto->Style = Proto->Style;
638 NewProto->NumSamples = Proto->NumSamples;
639 NewProto->Cluster = nullptr;
640 NewProto->Distrib = nullptr;
641
642 for (i=0; i < N; i++)
643 NewProto->Mean[i] = Proto->Mean[i];
644 if (Proto->Variance.Elliptical != nullptr) {
645 NewProto->Variance.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
646 for (i=0; i < N; i++)
647 NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
648 }
649 else
650 NewProto->Variance.Elliptical = nullptr;
651 //---------------------------------------------
652 if (Proto->Magnitude.Elliptical != nullptr) {
653 NewProto->Magnitude.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
654 for (i=0; i < N; i++)
655 NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
656 }
657 else
658 NewProto->Magnitude.Elliptical = nullptr;
659 //------------------------------------------------
660 if (Proto->Weight.Elliptical != nullptr) {
661 NewProto->Weight.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
662 for (i=0; i < N; i++)
663 NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
664 }
665 else
666 NewProto->Weight.Elliptical = nullptr;
667
668 NewProto->TotalMagnitude = Proto->TotalMagnitude;
669 NewProto->LogMagnitude = Proto->LogMagnitude;
670 NewProtoList = push_last(NewProtoList, NewProto);
671 }
672 }
673 FreeProtoList(&ProtoList);
674 return (NewProtoList);
675} /* RemoveInsignificantProtos */
676
677/*----------------------------------------------------------------------------*/
678MERGE_CLASS FindClass(LIST List, const char* Label) {
679 MERGE_CLASS MergeClass;
680
681 iterate (List)
682 {
683 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (List);
684 if (strcmp (MergeClass->Label, Label) == 0)
685 return (MergeClass);
686 }
687 return (nullptr);
688
689} /* FindClass */
690
691/*---------------------------------------------------------------------------*/
692MERGE_CLASS NewLabeledClass(const char* Label) {
693 MERGE_CLASS MergeClass;
694
695 MergeClass = new MERGE_CLASS_NODE;
696 MergeClass->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
697 strcpy (MergeClass->Label, Label);
699 return (MergeClass);
700
701} /* NewLabeledClass */
702
703/*-----------------------------------------------------------------------------*/
709void FreeLabeledClassList(LIST ClassList) {
710 MERGE_CLASS MergeClass;
711
712 LIST nodes = ClassList;
713 iterate(ClassList) /* iterate through all of the fonts */
714 {
715 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (ClassList);
716 free (MergeClass->Label);
717 FreeClass(MergeClass->Class);
718 delete MergeClass;
719 }
720 destroy(nodes);
721
722} /* FreeLabeledClassList */
723
724/* SetUpForFloat2Int */
726 LIST LabeledClassList) {
727 MERGE_CLASS MergeClass;
728 CLASS_TYPE Class;
729 int NumProtos;
730 int NumConfigs;
731 int NumWords;
732 int i, j;
733 float Values[3];
734 PROTO NewProto;
735 PROTO OldProto;
736 BIT_VECTOR NewConfig;
737 BIT_VECTOR OldConfig;
738
739 // printf("Float2Int ...\n");
740
741 CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
742 iterate(LabeledClassList)
743 {
744 UnicityTableEqEq<int> font_set;
745 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (LabeledClassList);
746 Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
747 NumProtos = MergeClass->Class->NumProtos;
748 NumConfigs = MergeClass->Class->NumConfigs;
749 font_set.move(&MergeClass->Class->font_set);
750 Class->NumProtos = NumProtos;
751 Class->MaxNumProtos = NumProtos;
752 Class->Prototypes = static_cast<PROTO>(Emalloc (sizeof(PROTO_STRUCT) * NumProtos));
753 for(i=0; i < NumProtos; i++)
754 {
755 NewProto = ProtoIn(Class, i);
756 OldProto = ProtoIn(MergeClass->Class, i);
757 Values[0] = OldProto->X;
758 Values[1] = OldProto->Y;
759 Values[2] = OldProto->Angle;
760 Normalize(Values);
761 NewProto->X = OldProto->X;
762 NewProto->Y = OldProto->Y;
763 NewProto->Length = OldProto->Length;
764 NewProto->Angle = OldProto->Angle;
765 NewProto->A = Values[0];
766 NewProto->B = Values[1];
767 NewProto->C = Values[2];
768 }
769
770 Class->NumConfigs = NumConfigs;
771 Class->MaxNumConfigs = NumConfigs;
772 Class->font_set.move(&font_set);
773 Class->Configurations = static_cast<BIT_VECTOR*>(Emalloc (sizeof(BIT_VECTOR) * NumConfigs));
774 NumWords = WordsInVectorOfSize(NumProtos);
775 for(i=0; i < NumConfigs; i++)
776 {
777 NewConfig = NewBitVector(NumProtos);
778 OldConfig = MergeClass->Class->Configurations[i];
779 for(j=0; j < NumWords; j++)
780 NewConfig[j] = OldConfig[j];
781 Class->Configurations[i] = NewConfig;
782 }
783 }
784 return float_classes;
785} // SetUpForFloat2Int
786
787/*--------------------------------------------------------------------------*/
// Converts a line given as a point and an angle into normalized line
// coefficients, in place.
//
// On entry:  Values[0] = x, Values[1] = y, Values[2] = angle as a fraction
//            of a full turn (it is multiplied by 2 * pi before use).
// On return: Values[0] = slope / norm, Values[1] = -1 / norm,
//            Values[2] = intercept / norm, where slope = tan(angle),
//            intercept = y - slope * x and norm = sqrt(slope^2 + 1).
void Normalize(float *Values) {
  const float Slope = std::tan(Values[2] * 2 * M_PI);
  const float Intercept = Values[1] - Slope * Values[0];
  const float Normalizer = 1 / std::sqrt(Slope * Slope + 1.0);

  Values[0] = Slope * Normalizer;
  Values[1] = -Normalizer;
  Values[2] = Intercept * Normalizer;
}  // Normalize
803
804/*-------------------------------------------------------------------------*/
806
807{
808 LABELEDLIST char_sample;
809
810 LIST nodes = CharList;
811 iterate(CharList) /* iterate through all of the fonts */
812 {
813 char_sample = reinterpret_cast<LABELEDLIST>first_node (CharList);
814 FreeLabeledList (char_sample);
815 }
816 destroy(nodes);
817
818} // FreeNormProtoList
819
820/*---------------------------------------------------------------------------*/
822 LIST* NormProtoList,
823 LIST ProtoList,
824 char* CharName)
825{
826 PROTOTYPE* Proto;
827 LABELEDLIST LabeledProtoList;
828
829 LabeledProtoList = NewLabeledList(CharName);
830 iterate(ProtoList)
831 {
832 Proto = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
833 LabeledProtoList->List = push(LabeledProtoList->List, Proto);
834 }
835 *NormProtoList = push(*NormProtoList, LabeledProtoList);
836}
837
838/*---------------------------------------------------------------------------*/
839int NumberOfProtos(LIST ProtoList, bool CountSigProtos,
840 bool CountInsigProtos) {
841 int N = 0;
842 iterate(ProtoList)
843 {
844 PROTOTYPE* Proto = reinterpret_cast<PROTOTYPE*>first_node(ProtoList);
845 if ((Proto->Significant && CountSigProtos) ||
846 (!Proto->Significant && CountInsigProtos))
847 N++;
848 }
849 return(N);
850}
851
852#endif // def DISABLED_LEGACY_ENGINE
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define UNICHAR_LEN
Definition: unichar.h:30
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:824
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:429
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:376
@ elliptical
Definition: cluster.h:44
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:236
const int kBoostXYBuckets
const int kBoostDirBuckets
void InitIntegerFX()
Definition: intfx.cpp:49
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:448
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:125
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:157
#define ProtoIn(Class, Pid)
Definition: protos.h:84
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
void * Emalloc(int Size)
Definition: emalloc.cpp:31
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:227
LIST destroy(LIST list)
Definition: oldlist.cpp:141
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NIL_LIST
Definition: oldlist.h:76
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
#define DOUBLE_PARAM_FLAG(name, val, comment)
#define INT_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void FreeLabeledClassList(LIST ClassList)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
void FreeTrainingSamples(LIST CharList)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
const char * GetNextFilename(int argc, const char *const *argv)
void FreeLabeledList(LABELEDLIST LabeledList)
void ParseArguments(int *argc, char ***argv)
void CleanUpUnusedData(LIST ProtoList)
void FreeNormProtoList(LIST CharList)
LABELEDLIST FindList(LIST List, char *Label)
FEATURE_DEFS_STRUCT feature_defs
LABELEDLIST NewLabeledList(const char *Label)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void Normalize(float *Values)
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
MERGE_CLASS FindClass(LIST List, const char *Label)
MERGE_CLASS NewLabeledClass(const char *Label)
int tessoptind
Definition: tessopt.cpp:24
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:39
ShapeTable * LoadShapeTable(const STRING &file_prefix)
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
@ NM_CHAR_ANISOTROPIC
Definition: normalis.h:45
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
void move(UnicityTable< T > *from)
ParamsVectors * params()
Definition: ccutil.h:67
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:197
Definition: strngs.h:45
void truncate_at(int32_t index)
Definition: strngs.cpp:265
const char * c_str() const
Definition: strngs.cpp:205
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
float Independence
Definition: cluster.h:51
float MinSamples
Definition: cluster.h:48
float MaxIllegal
Definition: cluster.h:49
double Confidence
Definition: cluster.h:52
float * Elliptical
Definition: cluster.h:60
FLOATUNION Magnitude
Definition: cluster.h:78
unsigned NumSamples
Definition: cluster.h:71
FLOATUNION Variance
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:70
float * Mean
Definition: cluster.h:74
float LogMagnitude
Definition: cluster.h:76
bool Significant
Definition: cluster.h:64
bool Merged
Definition: cluster.h:65
float TotalMagnitude
Definition: cluster.h:75
DISTRIBUTION * Distrib
Definition: cluster.h:73
FLOATUNION Weight
Definition: cluster.h:79
CLUSTER * Cluster
Definition: cluster.h:72
int16_t SampleSize
Definition: cluster.h:83
int32_t NumChar
Definition: cluster.h:89
PARAM_DESC * ParamDesc
Definition: cluster.h:84
uint32_t NumFeatureSets
Definition: featdefs.h:40
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:41
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:47
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:55
float Params[1]
Definition: ocrfeatures.h:61
FEATURE Features[1]
Definition: ocrfeatures.h:68
uint16_t MaxNumFeatures
Definition: ocrfeatures.h:67
float Angle
Definition: protos.h:42
float Length
Definition: protos.h:43
float Y
Definition: protos.h:41
float B
Definition: protos.h:38
float A
Definition: protos.h:37
float C
Definition: protos.h:39
float X
Definition: protos.h:40
int16_t NumConfigs
Definition: protos.h:58
int16_t NumProtos
Definition: protos.h:55
UnicityTableEqEq< int > font_set
Definition: protos.h:61
CONFIGS Configurations
Definition: protos.h:60
int16_t MaxNumProtos
Definition: protos.h:56
int16_t MaxNumConfigs
Definition: protos.h:59
PROTO Prototypes
Definition: protos.h:57
int NumShapes() const
Definition: shapetable.h:274
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
CLASS_TYPE Class
bool LoadFontInfo(const char *filename)
void LoadUnicharset(const char *filename)
void LoadPageImages(const char *filename)
bool Serialize(FILE *fp) const
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:82
bool LoadXHeights(const char *filename)
const UNICHARSET & unicharset() const
void SetupFlatShapeTable(ShapeTable *shape_table)
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
bool AddSpacingInfo(const char *filename)