tesseract 4.1.1
Loading...
Searching...
No Matches
commontraining.cpp File Reference
#include "commontraining.h"
#include <algorithm>
#include <cmath>
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "mf.h"
#include "oldlist.h"
#include "params.h"
#include "shapetable.h"
#include "tessdatamanager.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"

Go to the source code of this file.

Namespaces

namespace  tesseract
 

Macros

#define _USE_MATH_DEFINES
 

Functions

 INT_PARAM_FLAG (debug_level, 0, "Level of Trainer debugging")
 
 STRING_PARAM_FLAG (D, "", "Directory to write output files to")
 
 STRING_PARAM_FLAG (F, "font_properties", "File listing font properties")
 
 STRING_PARAM_FLAG (X, "", "File listing font xheights")
 
 STRING_PARAM_FLAG (U, "unicharset", "File to load unicharset from")
 
 STRING_PARAM_FLAG (O, "", "File to write unicharset to")
 
 STRING_PARAM_FLAG (output_trainer, "", "File to write trainer to")
 
 STRING_PARAM_FLAG (test_ch, "", "UTF8 test character string")
 
void ParseArguments (int *argc, char ***argv)
 
ShapeTable * tesseract::LoadShapeTable (const STRING &file_prefix)
 
void tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
const char * GetNextFilename (int argc, const char *const *argv)
 
LABELEDLIST FindList (LIST List, char *Label)
 
LABELEDLIST NewLabeledList (const char *Label)
 
void ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
 
void FreeTrainingSamples (LIST CharList)
 
void FreeLabeledList (LABELEDLIST LabeledList)
 
CLUSTERER * SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
 
void MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
 
void CleanUpUnusedData (LIST ProtoList)
 
LIST RemoveInsignificantProtos (LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
 
MERGE_CLASS FindClass (LIST List, const char *Label)
 
MERGE_CLASS NewLabeledClass (const char *Label)
 
void FreeLabeledClassList (LIST ClassList)
 
CLASS_STRUCT * SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList)
 
void Normalize (float *Values)
 
void FreeNormProtoList (LIST CharList)
 
void AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName)
 
int NumberOfProtos (LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
 

Variables

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }
 
FEATURE_DEFS_STRUCT feature_defs
 

Macro Definition Documentation

◆ _USE_MATH_DEFINES

#define _USE_MATH_DEFINES

Definition at line 14 of file commontraining.cpp.

Function Documentation

◆ AddToNormProtosList()

void AddToNormProtosList ( LIST *  NormProtoList,
LIST  ProtoList,
char *  CharName 
)

Definition at line 821 of file commontraining.cpp.

825{
826 PROTOTYPE* Proto;
827 LABELEDLIST LabeledProtoList;
828
829 LabeledProtoList = NewLabeledList(CharName);
830 iterate(ProtoList)
831 {
832 Proto = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
833 LabeledProtoList->List = push(LabeledProtoList->List, Proto);
834 }
835 *NormProtoList = push(*NormProtoList, LabeledProtoList);
836}
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
LABELEDLIST NewLabeledList(const char *Label)

◆ CleanUpUnusedData()

void CleanUpUnusedData ( LIST  ProtoList)

Definition at line 595 of file commontraining.cpp.

597{
598 PROTOTYPE* Prototype;
599
600 iterate(ProtoList)
601 {
602 Prototype = reinterpret_cast<PROTOTYPE *>first_node (ProtoList);
603 free(Prototype->Variance.Elliptical);
604 Prototype->Variance.Elliptical = nullptr;
605 free(Prototype->Magnitude.Elliptical);
606 Prototype->Magnitude.Elliptical = nullptr;
607 free(Prototype->Weight.Elliptical);
608 Prototype->Weight.Elliptical = nullptr;
609 }
610}
float * Elliptical
Definition: cluster.h:60
FLOATUNION Magnitude
Definition: cluster.h:78
FLOATUNION Variance
Definition: cluster.h:77
FLOATUNION Weight
Definition: cluster.h:79

◆ FindClass()

MERGE_CLASS FindClass ( LIST  List,
const char *  Label 
)

Definition at line 678 of file commontraining.cpp.

678 {
679 MERGE_CLASS MergeClass;
680
681 iterate (List)
682 {
683 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (List);
684 if (strcmp (MergeClass->Label, Label) == 0)
685 return (MergeClass);
686 }
687 return (nullptr);
688
689} /* FindClass */

◆ FindList()

LABELEDLIST FindList ( LIST  List,
char *  Label 
)

This routine searches through a list of labeled lists to find a list with the specified label. If a matching labeled list cannot be found, nullptr is returned.

Parameters
List: list to search
Label: label to search for
Returns
Labeled list with the specified label or nullptr.
Note
Globals: none

Definition at line 340 of file commontraining.cpp.

340 {
341 LABELEDLIST LabeledList;
342
343 iterate (List)
344 {
345 LabeledList = reinterpret_cast<LABELEDLIST>first_node (List);
346 if (strcmp (LabeledList->Label, Label) == 0)
347 return (LabeledList);
348 }
349 return (nullptr);
350
351} /* FindList */

◆ FreeLabeledClassList()

void FreeLabeledClassList ( LIST  ClassList)

This routine deallocates all of the space allocated to the specified list of training samples.

Parameters
ClassList: list of all fonts in document

Definition at line 709 of file commontraining.cpp.

709 {
710 MERGE_CLASS MergeClass;
711
712 LIST nodes = ClassList;
713 iterate(ClassList) /* iterate through all of the fonts */
714 {
715 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (ClassList);
716 free (MergeClass->Label);
717 FreeClass(MergeClass->Class);
718 delete MergeClass;
719 }
720 destroy(nodes);
721
722} /* FreeLabeledClassList */
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:125
LIST destroy(LIST list)
Definition: oldlist.cpp:141
CLASS_TYPE Class

◆ FreeLabeledList()

void FreeLabeledList ( LABELEDLIST  LabeledList)

This routine deallocates all of the memory consumed by a labeled list. It does not free any memory which may be consumed by the items in the list.

Parameters
LabeledList: labeled list to be freed
Note
Globals: none

Definition at line 476 of file commontraining.cpp.

476 {
477 destroy(LabeledList->List);
478 free(LabeledList->Label);
479 free(LabeledList);
480} /* FreeLabeledList */

◆ FreeNormProtoList()

void FreeNormProtoList ( LIST  CharList)

Definition at line 805 of file commontraining.cpp.

807{
808 LABELEDLIST char_sample;
809
810 LIST nodes = CharList;
811 iterate(CharList) /* iterate through all of the fonts */
812 {
813 char_sample = reinterpret_cast<LABELEDLIST>first_node (CharList);
814 FreeLabeledList (char_sample);
815 }
816 destroy(nodes);
817
818} // FreeNormProtoList
void FreeLabeledList(LABELEDLIST LabeledList)

◆ FreeTrainingSamples()

void FreeTrainingSamples ( LIST  CharList)

This routine deallocates all of the space allocated to the specified list of training samples.

Parameters
CharList: list of all fonts in document

Definition at line 450 of file commontraining.cpp.

450 {
451 LABELEDLIST char_sample;
452 FEATURE_SET FeatureSet;
453 LIST FeatureList;
454
455 LIST nodes = CharList;
456 iterate(CharList) { /* iterate through all of the fonts */
457 char_sample = reinterpret_cast<LABELEDLIST>first_node(CharList);
458 FeatureList = char_sample->List;
459 iterate(FeatureList) { /* iterate through all of the classes */
460 FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
461 FreeFeatureSet(FeatureSet);
462 }
463 FreeLabeledList(char_sample);
464 }
465 destroy(nodes);
466} /* FreeTrainingSamples */
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62

◆ GetNextFilename()

const char * GetNextFilename ( int  argc,
const char *const *  argv 
)

This routine returns the next command line argument. If there are no remaining command line arguments, it returns nullptr. This routine should only be called after all option arguments have been parsed and removed with ParseArguments.

Globals:

  • tessoptind defined by tessopt sys call
    Returns
    Next command line argument or nullptr.

Definition at line 323 of file commontraining.cpp.

323 {
324 if (tessoptind < argc)
325 return argv[tessoptind++];
326 else
327 return nullptr;
328} /* GetNextFilename */
int tessoptind
Definition: tessopt.cpp:24

◆ INT_PARAM_FLAG()

INT_PARAM_FLAG ( debug_level  ,
0  ,
"Level of Trainer debugging"   
)

◆ MergeInsignificantProtos()

void MergeInsignificantProtos ( LIST  ProtoList,
const char *  label,
CLUSTERER *  Clusterer,
CLUSTERCONFIG *  clusterconfig 
)

Definition at line 528 of file commontraining.cpp.

530 {
531 PROTOTYPE* Prototype;
532 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
533
534 LIST pProtoList = ProtoList;
535 iterate(pProtoList) {
536 Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
537 if (Prototype->Significant || Prototype->Merged)
538 continue;
539 float best_dist = 0.125;
540 PROTOTYPE* best_match = nullptr;
541 // Find the nearest alive prototype.
542 LIST list_it = ProtoList;
543 iterate(list_it) {
544 PROTOTYPE* test_p = reinterpret_cast<PROTOTYPE *>first_node (list_it);
545 if (test_p != Prototype && !test_p->Merged) {
546 float dist = ComputeDistance(Clusterer->SampleSize,
547 Clusterer->ParamDesc,
548 Prototype->Mean, test_p->Mean);
549 if (dist < best_dist) {
550 best_match = test_p;
551 best_dist = dist;
552 }
553 }
554 }
555 if (best_match != nullptr && !best_match->Significant) {
556 if (debug)
557 tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
558 best_match->NumSamples, Prototype->NumSamples,
559 best_match->Mean[0], best_match->Mean[1],
560 Prototype->Mean[0], Prototype->Mean[1]);
561 best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
562 Clusterer->ParamDesc,
563 best_match->NumSamples,
564 Prototype->NumSamples,
565 best_match->Mean,
566 best_match->Mean, Prototype->Mean);
567 Prototype->NumSamples = 0;
568 Prototype->Merged = true;
569 } else if (best_match != nullptr) {
570 if (debug)
571 tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
572 Prototype->Mean[0], Prototype->Mean[1],
573 best_match->Mean[0], best_match->Mean[1]);
574 Prototype->Merged = true;
575 }
576 }
577 // Mark significant those that now have enough samples.
578 int min_samples =
579 static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
580 pProtoList = ProtoList;
581 iterate(pProtoList) {
582 Prototype = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
583 // Process insignificant protos that do not match a green one
584 if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
585 !Prototype->Merged) {
586 if (debug)
587 tprintf("Red proto at %g,%g becoming green\n",
588 Prototype->Mean[0], Prototype->Mean[1]);
589 Prototype->Significant = true;
590 }
591 }
592} /* MergeInsignificantProtos */
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:824
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:448
float MinSamples
Definition: cluster.h:48
unsigned NumSamples
Definition: cluster.h:71
float * Mean
Definition: cluster.h:74
bool Significant
Definition: cluster.h:64
bool Merged
Definition: cluster.h:65
int16_t SampleSize
Definition: cluster.h:83
int32_t NumChar
Definition: cluster.h:89
PARAM_DESC * ParamDesc
Definition: cluster.h:84

◆ NewLabeledClass()

MERGE_CLASS NewLabeledClass ( const char *  Label)

Definition at line 692 of file commontraining.cpp.

692 {
693 MERGE_CLASS MergeClass;
694
695 MergeClass = new MERGE_CLASS_NODE;
696 MergeClass->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
697 strcpy (MergeClass->Label, Label);
698 MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
699 return (MergeClass);
700
701} /* NewLabeledClass */
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:157
void * Emalloc(int Size)
Definition: emalloc.cpp:31

◆ NewLabeledList()

LABELEDLIST NewLabeledList ( const char *  Label)

This routine allocates a new, empty labeled list and gives it the specified label.

Parameters
Label: label for new list
Returns
New, empty labeled list.
Note
Globals: none

Definition at line 361 of file commontraining.cpp.

361 {
362 LABELEDLIST LabeledList;
363
364 LabeledList = static_cast<LABELEDLIST>(Emalloc (sizeof (LABELEDLISTNODE)));
365 LabeledList->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
366 strcpy (LabeledList->Label, Label);
367 LabeledList->List = NIL_LIST;
368 LabeledList->SampleCount = 0;
369 LabeledList->font_sample_count = 0;
370 return (LabeledList);
371
372} /* NewLabeledList */
#define NIL_LIST
Definition: oldlist.h:76

◆ Normalize()

void Normalize ( float *  Values)

Definition at line 788 of file commontraining.cpp.

790{
791 float Slope;
792 float Intercept;
793 float Normalizer;
794
795 Slope = tan(Values [2] * 2 * M_PI);
796 Intercept = Values [1] - Slope * Values [0];
797 Normalizer = 1 / sqrt (Slope * Slope + 1.0);
798
799 Values [0] = Slope * Normalizer;
800 Values [1] = - Normalizer;
801 Values [2] = Intercept * Normalizer;
802} // Normalize

◆ NumberOfProtos()

int NumberOfProtos ( LIST  ProtoList,
bool  CountSigProtos,
bool  CountInsigProtos 
)

Definition at line 839 of file commontraining.cpp.

840 {
841 int N = 0;
842 iterate(ProtoList)
843 {
844 PROTOTYPE* Proto = reinterpret_cast<PROTOTYPE*>first_node(ProtoList);
845 if ((Proto->Significant && CountSigProtos) ||
846 (!Proto->Significant && CountInsigProtos))
847 N++;
848 }
849 return(N);
850}

◆ ParseArguments()

void ParseArguments ( int *  argc,
char ***  argv 
)

This routine parses the command line arguments that were passed to the program and uses them to set relevant training-related global parameters.

Globals:

  • Config current clustering parameters
    Parameters
    argc: number of command line arguments to parse
    argv: command line arguments

Definition at line 122 of file commontraining.cpp.

122 {
123 STRING usage;
124 if (*argc) {
125 usage += (*argv)[0];
126 usage += " -v | --version | ";
127 usage += (*argv)[0];
128 }
129 usage += " [.tr files ...]";
130 tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
131 // Record the index of the first non-flag argument to 1, since we set
132 // remove_flags to true when parsing the flags.
133 tessoptind = 1;
134 // Set some global values based on the flags.
135 Config.MinSamples =
136 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
137 Config.MaxIllegal =
138 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
139 Config.Independence =
140 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
141 Config.Confidence =
142 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
143 // Set additional parameters from config file if specified.
144 if (!FLAGS_configfile.empty()) {
145 tesseract::ParamUtils::ReadParamsFile(
146 FLAGS_configfile.c_str(),
147 tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
148 ccutil.params());
149 }
150}
CLUSTERCONFIG Config
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:39
ParamsVectors * params()
Definition: ccutil.h:67
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
float Independence
Definition: cluster.h:51
float MaxIllegal
Definition: cluster.h:49
double Confidence
Definition: cluster.h:52

◆ ReadTrainingSamples()

void ReadTrainingSamples ( const FEATURE_DEFS_STRUCT &  feature_definitions,
const char *  feature_name,
int  max_samples,
UNICHARSET *  unicharset,
FILE *  file,
LIST *  training_samples 
)

This routine reads training samples from a file and places them into a data structure which organizes the samples by FontName and CharName. It then returns this data structure.

Parameters
file: open text file to read samples from
feature_definitions
feature_name
max_samples
unicharset
training_samples

Definition at line 389 of file commontraining.cpp.

392 {
393 char buffer[2048];
394 char unichar[UNICHAR_LEN + 1];
395 LABELEDLIST char_sample;
396 FEATURE_SET feature_samples;
397 CHAR_DESC char_desc;
398 uint32_t feature_type =
399 ShortNameToFeatureType(feature_definitions, feature_name);
400
401 // Zero out the font_sample_count for all the classes.
402 LIST it = *training_samples;
403 iterate(it) {
404 char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
405 char_sample->font_sample_count = 0;
406 }
407
408 while (fgets(buffer, 2048, file) != nullptr) {
409 if (buffer[0] == '\n')
410 continue;
411
412 sscanf(buffer, "%*s %s", unichar);
413 if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
414 unicharset->unichar_insert(unichar);
415 if (unicharset->size() > MAX_NUM_CLASSES) {
416 tprintf("Error: Size of unicharset in training is "
417 "greater than MAX_NUM_CLASSES\n");
418 exit(1);
419 }
420 }
421 char_sample = FindList(*training_samples, unichar);
422 if (char_sample == nullptr) {
423 char_sample = NewLabeledList(unichar);
424 *training_samples = push(*training_samples, char_sample);
425 }
426 char_desc = ReadCharDescription(feature_definitions, file);
427 feature_samples = char_desc->FeatureSets[feature_type];
428 if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
429 char_sample->List = push(char_sample->List, feature_samples);
430 char_sample->SampleCount++;
431 char_sample->font_sample_count++;
432 } else {
433 FreeFeatureSet(feature_samples);
434 }
435 for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
436 if (feature_type != i)
437 FreeFeatureSet(char_desc->FeatureSets[i]);
438 }
439 free(char_desc);
440 }
441} // ReadTrainingSamples
#define UNICHAR_LEN
Definition: unichar.h:30
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:236
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
LABELEDLIST FindList(LIST List, char *Label)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
int size() const
Definition: unicharset.h:341
uint32_t NumFeatureSets
Definition: featdefs.h:40
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:41

◆ RemoveInsignificantProtos()

LIST RemoveInsignificantProtos ( LIST  ProtoList,
bool  KeepSigProtos,
bool  KeepInsigProtos,
int  N 
)

Definition at line 613 of file commontraining.cpp.

619{
620 LIST NewProtoList = NIL_LIST;
621 LIST pProtoList;
622 PROTOTYPE* Proto;
623 PROTOTYPE* NewProto;
624 int i;
625
626 pProtoList = ProtoList;
627 iterate(pProtoList)
628 {
629 Proto = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
630 if ((Proto->Significant && KeepSigProtos) ||
631 (!Proto->Significant && KeepInsigProtos))
632 {
633 NewProto = static_cast<PROTOTYPE *>(Emalloc(sizeof(PROTOTYPE)));
634
635 NewProto->Mean = static_cast<float *>(Emalloc(N * sizeof(float)));
636 NewProto->Significant = Proto->Significant;
637 NewProto->Style = Proto->Style;
638 NewProto->NumSamples = Proto->NumSamples;
639 NewProto->Cluster = nullptr;
640 NewProto->Distrib = nullptr;
641
642 for (i=0; i < N; i++)
643 NewProto->Mean[i] = Proto->Mean[i];
644 if (Proto->Variance.Elliptical != nullptr) {
645 NewProto->Variance.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
646 for (i=0; i < N; i++)
647 NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
648 }
649 else
650 NewProto->Variance.Elliptical = nullptr;
651 //---------------------------------------------
652 if (Proto->Magnitude.Elliptical != nullptr) {
653 NewProto->Magnitude.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
654 for (i=0; i < N; i++)
655 NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
656 }
657 else
658 NewProto->Magnitude.Elliptical = nullptr;
659 //------------------------------------------------
660 if (Proto->Weight.Elliptical != nullptr) {
661 NewProto->Weight.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
662 for (i=0; i < N; i++)
663 NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
664 }
665 else
666 NewProto->Weight.Elliptical = nullptr;
667
668 NewProto->TotalMagnitude = Proto->TotalMagnitude;
669 NewProto->LogMagnitude = Proto->LogMagnitude;
670 NewProtoList = push_last(NewProtoList, NewProto);
671 }
672 }
673 FreeProtoList(&ProtoList);
674 return (NewProtoList);
675} /* RemoveInsignificantProtos */
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:227
unsigned Style
Definition: cluster.h:70
float LogMagnitude
Definition: cluster.h:76
float TotalMagnitude
Definition: cluster.h:75
DISTRIBUTION * Distrib
Definition: cluster.h:73
CLUSTER * Cluster
Definition: cluster.h:72

◆ SetUpForClustering()

CLUSTERER * SetUpForClustering ( const FEATURE_DEFS_STRUCT &  FeatureDefs,
LABELEDLIST  char_sample,
const char *  program_feature_type 
)

This routine reads samples from a LABELEDLIST and enters those samples into a clusterer data structure. This data structure is then returned to the caller.

Parameters
char_sample: LABELEDLIST that holds all the feature information for a given character.
FeatureDefs
program_feature_type
Returns
Pointer to new clusterer data structure.
Note
Globals: None

Definition at line 494 of file commontraining.cpp.

496 {
497 uint16_t N;
498 int i, j;
499 float* Sample = nullptr;
500 CLUSTERER *Clusterer;
501 int32_t CharID;
502 LIST FeatureList = nullptr;
503 FEATURE_SET FeatureSet = nullptr;
504
505 int32_t desc_index =
506 ShortNameToFeatureType(FeatureDefs, program_feature_type);
507 N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
508 Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
509
510 FeatureList = char_sample->List;
511 CharID = 0;
512 iterate(FeatureList) {
513 FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
514 for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
515 if (Sample == nullptr) Sample = static_cast<float*>(Emalloc(N * sizeof(float)));
516 for (j = 0; j < N; j++)
517 Sample[j] = FeatureSet->Features[i]->Params[j];
518 MakeSample (Clusterer, Sample, CharID);
519 }
520 CharID++;
521 }
522 free(Sample);
523 return Clusterer;
524
525} /* SetUpForClustering */
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:429
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:376
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:47
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:55
float Params[1]
Definition: ocrfeatures.h:61
FEATURE Features[1]
Definition: ocrfeatures.h:68
uint16_t MaxNumFeatures
Definition: ocrfeatures.h:67

◆ SetUpForFloat2Int()

CLASS_STRUCT * SetUpForFloat2Int ( const UNICHARSET &  unicharset,
LIST  LabeledClassList 
)

Definition at line 725 of file commontraining.cpp.

726 {
727 MERGE_CLASS MergeClass;
728 CLASS_TYPE Class;
729 int NumProtos;
730 int NumConfigs;
731 int NumWords;
732 int i, j;
733 float Values[3];
734 PROTO NewProto;
735 PROTO OldProto;
736 BIT_VECTOR NewConfig;
737 BIT_VECTOR OldConfig;
738
739 // printf("Float2Int ...\n");
740
741 CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
742 iterate(LabeledClassList)
743 {
744 UnicityTableEqEq<int> font_set;
745 MergeClass = reinterpret_cast<MERGE_CLASS>first_node (LabeledClassList);
746 Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
747 NumProtos = MergeClass->Class->NumProtos;
748 NumConfigs = MergeClass->Class->NumConfigs;
749 font_set.move(&MergeClass->Class->font_set);
750 Class->NumProtos = NumProtos;
751 Class->MaxNumProtos = NumProtos;
752 Class->Prototypes = static_cast<PROTO>(Emalloc (sizeof(PROTO_STRUCT) * NumProtos));
753 for(i=0; i < NumProtos; i++)
754 {
755 NewProto = ProtoIn(Class, i);
756 OldProto = ProtoIn(MergeClass->Class, i);
757 Values[0] = OldProto->X;
758 Values[1] = OldProto->Y;
759 Values[2] = OldProto->Angle;
760 Normalize(Values);
761 NewProto->X = OldProto->X;
762 NewProto->Y = OldProto->Y;
763 NewProto->Length = OldProto->Length;
764 NewProto->Angle = OldProto->Angle;
765 NewProto->A = Values[0];
766 NewProto->B = Values[1];
767 NewProto->C = Values[2];
768 }
769
770 Class->NumConfigs = NumConfigs;
771 Class->MaxNumConfigs = NumConfigs;
772 Class->font_set.move(&font_set);
773 Class->Configurations = static_cast<BIT_VECTOR*>(Emalloc (sizeof(BIT_VECTOR) * NumConfigs));
774 NumWords = WordsInVectorOfSize(NumProtos);
775 for(i=0; i < NumConfigs; i++)
776 {
777 NewConfig = NewBitVector(NumProtos);
778 OldConfig = MergeClass->Class->Configurations[i];
779 for(j=0; j < NumWords; j++)
780 NewConfig[j] = OldConfig[j];
781 Class->Configurations[i] = NewConfig;
782 }
783 }
784 return float_classes;
785} // SetUpForFloat2Int
#define ProtoIn(Class, Pid)
Definition: protos.h:84
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
void Normalize(float *Values)
void move(UnicityTable< T > *from)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
float Angle
Definition: protos.h:42
float Length
Definition: protos.h:43
float Y
Definition: protos.h:41
float B
Definition: protos.h:38
float A
Definition: protos.h:37
float C
Definition: protos.h:39
float X
Definition: protos.h:40
int16_t NumConfigs
Definition: protos.h:58
int16_t NumProtos
Definition: protos.h:55
UnicityTableEqEq< int > font_set
Definition: protos.h:61
CONFIGS Configurations
Definition: protos.h:60
int16_t MaxNumProtos
Definition: protos.h:56
int16_t MaxNumConfigs
Definition: protos.h:59
PROTO Prototypes
Definition: protos.h:57

◆ STRING_PARAM_FLAG() [1/7]

STRING_PARAM_FLAG ( D  ,
""  ,
"Directory to write output files to"   
)

◆ STRING_PARAM_FLAG() [2/7]

STRING_PARAM_FLAG ( F  ,
"font_properties"  ,
"File listing font properties"   
)

◆ STRING_PARAM_FLAG() [3/7]

STRING_PARAM_FLAG ( O  ,
""  ,
"File to write unicharset to"   
)

◆ STRING_PARAM_FLAG() [4/7]

STRING_PARAM_FLAG ( output_trainer  ,
""  ,
"File to write trainer to"   
)

◆ STRING_PARAM_FLAG() [5/7]

STRING_PARAM_FLAG ( test_ch  ,
""  ,
"UTF8 test character string"   
)

◆ STRING_PARAM_FLAG() [6/7]

STRING_PARAM_FLAG ( U  ,
"unicharset"  ,
"File to load unicharset from"   
)

◆ STRING_PARAM_FLAG() [7/7]

STRING_PARAM_FLAG ( X  ,
""  ,
"File listing font xheights"   
)

Variable Documentation

◆ Config

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }

Definition at line 88 of file commontraining.cpp.

◆ feature_defs

FEATURE_DEFS_STRUCT feature_defs

Definition at line 89 of file commontraining.cpp.