tesseract 4.1.1
Loading...
Searching...
No Matches
cluster.h File Reference
#include "kdtree.h"
#include "oldlist.h"

Go to the source code of this file.

Classes

struct  sample
 
struct  CLUSTERCONFIG
 
union  FLOATUNION
 
struct  PROTOTYPE
 
struct  CLUSTERER
 
struct  SAMPLELIST
 

Macros

#define MINBUCKETS   5
 
#define MAXBUCKETS   39
 
#define InitSampleSearch(S, C)    (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))
 

Typedefs

typedef struct sample CLUSTER
 
using SAMPLE = CLUSTER
 

Enumerations

enum  PROTOSTYLE { spherical , elliptical , mixed , automatic }
 
enum  DISTRIBUTION { normal , uniform , D_random , DISTRIBUTION_COUNT }
 

Functions

CLUSTERERMakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
float Mean (PROTOTYPE *Proto, uint16_t Dimension)
 
float StandardDeviation (PROTOTYPE *Proto, uint16_t Dimension)
 
int32_t MergeClusters (int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
 

Macro Definition Documentation

◆ InitSampleSearch

#define InitSampleSearch (   S,
 
)     (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))

Definition at line 101 of file cluster.h.

◆ MAXBUCKETS

#define MAXBUCKETS   39

Definition at line 27 of file cluster.h.

◆ MINBUCKETS

#define MINBUCKETS   5

Definition at line 26 of file cluster.h.

Typedef Documentation

◆ CLUSTER

typedef struct sample CLUSTER

◆ SAMPLE

using SAMPLE = CLUSTER

Definition at line 42 of file cluster.h.

Enumeration Type Documentation

◆ DISTRIBUTION

Enumerator
normal 
uniform 
D_random 
DISTRIBUTION_COUNT 

Definition at line 56 of file cluster.h.

DISTRIBUTION
Definition: cluster.h:56
@ DISTRIBUTION_COUNT
Definition: cluster.h:56
@ D_random
Definition: cluster.h:56
@ uniform
Definition: cluster.h:56
@ normal
Definition: cluster.h:56

◆ PROTOSTYLE

enum PROTOSTYLE
Enumerator
spherical 
elliptical 
mixed 
automatic 

Definition at line 44 of file cluster.h.

PROTOSTYLE
Definition: cluster.h:44
@ elliptical
Definition: cluster.h:44
@ spherical
Definition: cluster.h:44
@ automatic
Definition: cluster.h:44
@ mixed
Definition: cluster.h:44

Function Documentation

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes

Definition at line 483 of file cluster.cpp.

483 {
484 //only create cluster tree if samples have never been clustered before
485 if (Clusterer->Root == nullptr)
486 CreateClusterTree(Clusterer);
487
488 //deallocate the old prototype list if one exists
489 FreeProtoList (&Clusterer->ProtoList);
490 Clusterer->ProtoList = NIL_LIST;
491
492 //compute prototypes starting at the root node in the tree
493 ComputePrototypes(Clusterer, Config);
494 // We don't need the cluster pointers in the protos any more, so null them
495 // out, which makes it safe to delete the clusterer.
496 LIST proto_list = Clusterer->ProtoList;
497 iterate(proto_list) {
498 auto *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
499 proto->Cluster = nullptr;
500 }
501 return Clusterer->ProtoList;
502} // ClusterSamples
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NIL_LIST
Definition: oldlist.h:76
CLUSTERCONFIG Config
CLUSTER * Cluster
Definition: cluster.h:72
CLUSTER * Root
Definition: cluster.h:87
LIST ProtoList
Definition: cluster.h:88

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to nullptr to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed

Definition at line 514 of file cluster.cpp.

514 {
515 if (Clusterer != nullptr) {
516 free(Clusterer->ParamDesc);
517 if (Clusterer->KDTree != nullptr)
518 FreeKDTree (Clusterer->KDTree);
519 if (Clusterer->Root != nullptr)
520 FreeCluster (Clusterer->Root);
521 // Free up all used buckets structures.
522 for (auto & d : Clusterer->bucket_cache) {
523 for (auto & c : d)
524 if (c != nullptr)
525 FreeBuckets(c);
526 }
527
528 free(Clusterer);
529 }
530} // FreeClusterer
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:331
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:86
PARAM_DESC * ParamDesc
Definition: cluster.h:84

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed

Definition at line 538 of file cluster.cpp.

538 {
539 destroy_nodes(*ProtoList, FreePrototype);
540} // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:549
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:157

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated

Definition at line 549 of file cluster.cpp.

549 { //PROTOTYPE *Prototype)
550 auto *Prototype = static_cast<PROTOTYPE *>(arg);
551
552 // unmark the corresponding cluster (if there is one
553 if (Prototype->Cluster != nullptr)
554 Prototype->Cluster->Prototype = false;
555
556 // deallocate the prototype statistics and then the prototype itself
557 free(Prototype->Distrib);
558 free(Prototype->Mean);
559 if (Prototype->Style != spherical) {
560 free(Prototype->Variance.Elliptical);
561 free(Prototype->Magnitude.Elliptical);
562 free(Prototype->Weight.Elliptical);
563 }
564 free(Prototype);
565} // FreePrototype
bool Prototype
Definition: cluster.h:34

◆ MakeClusterer()

CLUSTERER * MakeClusterer ( int16_t  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure

Definition at line 376 of file cluster.cpp.

376 {
377 CLUSTERER *Clusterer;
378 int i;
379
380 // allocate main clusterer data structure and init simple fields
381 Clusterer = static_cast<CLUSTERER *>(Emalloc (sizeof (CLUSTERER)));
382 Clusterer->SampleSize = SampleSize;
383 Clusterer->NumberOfSamples = 0;
384 Clusterer->NumChar = 0;
385
386 // init fields which will not be used initially
387 Clusterer->Root = nullptr;
388 Clusterer->ProtoList = NIL_LIST;
389
390 // maintain a copy of param descriptors in the clusterer data structure
391 Clusterer->ParamDesc =
392 static_cast<PARAM_DESC *>(Emalloc (SampleSize * sizeof (PARAM_DESC)));
393 for (i = 0; i < SampleSize; i++) {
394 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
395 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
396 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
397 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
398 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
399 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
400 Clusterer->ParamDesc[i].MidRange =
401 (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
402 }
403
404 // allocate a kd tree to hold the samples
405 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
406
407 // Initialize cache of histogram buckets to minimize recomputing them.
408 for (auto & d : Clusterer->bucket_cache) {
409 for (auto & c : d)
410 c = nullptr;
411 }
412
413 return Clusterer;
414} // MakeClusterer
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:180
void * Emalloc(int Size)
Definition: emalloc.cpp:31
int32_t NumberOfSamples
Definition: cluster.h:85
int16_t SampleSize
Definition: cluster.h:83
int32_t NumChar
Definition: cluster.h:89
float HalfRange
Definition: ocrfeatures.h:48
float Range
Definition: ocrfeatures.h:47
bool Circular
Definition: ocrfeatures.h:43
float Max
Definition: ocrfeatures.h:46
float MidRange
Definition: ocrfeatures.h:49
bool NonEssential
Definition: ocrfeatures.h:44
float Min
Definition: ocrfeatures.h:45

◆ MakeSample()

SAMPLE * MakeSample ( CLUSTERER Clusterer,
const float *  Feature,
int32_t  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure

Definition at line 429 of file cluster.cpp.

430 {
431 SAMPLE *Sample;
432 int i;
433
434 // see if the samples have already been clustered - if so trap an error
435 // Can't add samples after they have been clustered.
436 ASSERT_HOST(Clusterer->Root == nullptr);
437
438 // allocate the new sample and initialize it
439 Sample = static_cast<SAMPLE *>(Emalloc (sizeof (SAMPLE) +
440 (Clusterer->SampleSize -
441 1) * sizeof (float)));
442 Sample->Clustered = false;
443 Sample->Prototype = false;
444 Sample->SampleCount = 1;
445 Sample->Left = nullptr;
446 Sample->Right = nullptr;
447 Sample->CharID = CharID;
448
449 for (i = 0; i < Clusterer->SampleSize; i++)
450 Sample->Mean[i] = Feature[i];
451
452 // add the sample to the KD tree - keep track of the total # of samples
453 Clusterer->NumberOfSamples++;
454 KDStore(Clusterer->KDTree, Sample->Mean, Sample);
455 if (CharID >= Clusterer->NumChar)
456 Clusterer->NumChar = CharID + 1;
457
458 // execute hook for monitoring clustering operation
459 // (*SampleCreationHook)(Sample);
460
461 return (Sample);
462} // MakeSample
#define ASSERT_HOST(x)
Definition: errcode.h:88
void KDStore(KDTREE *Tree, float *Key, void *Data)
Definition: kdtree.cpp:212
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
int32_t CharID
Definition: cluster.h:38
unsigned SampleCount
Definition: cluster.h:35
bool Clustered
Definition: cluster.h:33
float Mean[1]
Definition: cluster.h:39
struct sample * Left
Definition: cluster.h:36

◆ Mean()

float Mean ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension

Definition at line 602 of file cluster.cpp.

602 {
603 return (Proto->Mean[Dimension]);
604} // Mean
float * Mean
Definition: cluster.h:74

◆ MergeClusters()

int32_t MergeClusters ( int16_t  N,
PARAM_DESC  ParamDesc[],
int32_t  n1,
int32_t  n2,
float  m[],
float  m1[],
float  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.

Definition at line 824 of file cluster.cpp.

829 {
830 int32_t i, n;
831
832 n = n1 + n2;
833 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
834 if (ParamDesc->Circular) {
835 // if distance between means is greater than allowed
836 // reduce upper point by one "rotation" to compute mean
837 // then normalize the mean back into the accepted range
838 if ((*m2 - *m1) > ParamDesc->HalfRange) {
839 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
840 if (*m < ParamDesc->Min)
841 *m += ParamDesc->Range;
842 }
843 else if ((*m1 - *m2) > ParamDesc->HalfRange) {
844 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
845 if (*m < ParamDesc->Min)
846 *m += ParamDesc->Range;
847 }
848 else
849 *m = (n1 * *m1 + n2 * *m2) / n;
850 }
851 else
852 *m = (n1 * *m1 + n2 * *m2) / n;
853 }
854 return n;
855} // MergeClusters

◆ NextSample()

CLUSTER * NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, nullptr is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or nullptr.

Definition at line 580 of file cluster.cpp.

580 {
581 CLUSTER *Cluster;
582
583 if (*SearchState == NIL_LIST)
584 return (nullptr);
585 Cluster = reinterpret_cast<CLUSTER *>first_node (*SearchState);
586 *SearchState = pop (*SearchState);
587 for (;;) {
588 if (Cluster->Left == nullptr)
589 return (Cluster);
590 *SearchState = push (*SearchState, Cluster->Right);
591 Cluster = Cluster->Left;
592 }
593} // NextSample
LIST pop(LIST list)
Definition: oldlist.cpp:201
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213

◆ StandardDeviation()

float StandardDeviation ( PROTOTYPE Proto,
uint16_t  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension

Definition at line 613 of file cluster.cpp.

613 {
614 switch (Proto->Style) {
615 case spherical:
616 return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Spherical))));
617 case elliptical:
618 return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
619 case mixed:
620 switch (Proto->Distrib[Dimension]) {
621 case normal:
622 return (static_cast<float>(sqrt (static_cast<double>(Proto->Variance.Elliptical[Dimension]))));
623 case uniform:
624 case D_random:
625 return (Proto->Variance.Elliptical[Dimension]);
627 ASSERT_HOST(!"Distribution count not allowed!");
628 }
629 }
630 return 0.0f;
631} // StandardDeviation
float Spherical
Definition: cluster.h:59
float * Elliptical
Definition: cluster.h:60
FLOATUNION Variance
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:70
DISTRIBUTION * Distrib
Definition: cluster.h:73