tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::SampleIterator Class Reference

#include <sampleiterator.h>

Public Member Functions

 SampleIterator ()
 
 ~SampleIterator ()
 
void Clear ()
 
void Init (const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
 
void Begin ()
 
bool AtEnd () const
 
const TrainingSample & GetSample () const
 
TrainingSample * MutableSample () const
 
int GlobalSampleIndex () const
 
int GetCompactClassID () const
 
int GetSparseClassID () const
 
void Next ()
 
int CompactCharsetSize () const
 
int SparseCharsetSize () const
 
const IndexMapBiDi & charset_map () const
 
const ShapeTable * shape_table () const
 
const TrainingSampleSet * sample_set () const
 
void MapSampleFeatures (const IntFeatureMap &feature_map)
 
int UniformSamples ()
 
double NormalizeSamples ()
 

Detailed Description

Definition at line 92 of file sampleiterator.h.

Constructor & Destructor Documentation

◆ SampleIterator()

tesseract::SampleIterator::SampleIterator ( )

Definition at line 27 of file sampleiterator.cpp.

28 : charset_map_(nullptr),
29 shape_table_(nullptr),
30 sample_set_(nullptr),
31 randomize_(false),
32 owned_shape_table_(nullptr) {
33 num_shapes_ = 0;
34 Begin();
35}

◆ ~SampleIterator()

tesseract::SampleIterator::~SampleIterator ( )

Definition at line 37 of file sampleiterator.cpp.

37 {
38 Clear();
39}

Member Function Documentation

◆ AtEnd()

bool tesseract::SampleIterator::AtEnd ( ) const

Definition at line 99 of file sampleiterator.cpp.

99 {
100 return shape_index_ >= num_shapes_;
101}

◆ Begin()

void tesseract::SampleIterator::Begin ( )

Definition at line 87 of file sampleiterator.cpp.

87 {
88 shape_index_ = -1;
89 shape_char_index_ = 0;
90 num_shape_chars_ = 0;
91 shape_font_index_ = 0;
92 num_shape_fonts_ = 0;
93 sample_index_ = 0;
94 num_samples_ = 0;
95 // Find the first indexable sample.
96 Next();
97}

◆ charset_map()

const IndexMapBiDi & tesseract::SampleIterator::charset_map ( ) const
inline

Definition at line 137 of file sampleiterator.h.

137 {
138 return *charset_map_;
139 }

◆ Clear()

void tesseract::SampleIterator::Clear ( )

Definition at line 41 of file sampleiterator.cpp.

41 {
42 delete owned_shape_table_;
43 owned_shape_table_ = nullptr;
44}

◆ CompactCharsetSize()

int tesseract::SampleIterator::CompactCharsetSize ( ) const

Definition at line 196 of file sampleiterator.cpp.

196 {
197 return charset_map_ != nullptr ? charset_map_->CompactSize()
198 : SparseCharsetSize();
199}
int CompactSize() const
Definition: indexmapbidi.h:61

◆ GetCompactClassID()

int tesseract::SampleIterator::GetCompactClassID ( ) const

Definition at line 142 of file sampleiterator.cpp.

142 {
143 return charset_map_ != nullptr ? charset_map_->SparseToCompact(shape_index_)
144 : GetSparseClassID();
145}
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138

◆ GetSample()

const TrainingSample & tesseract::SampleIterator::GetSample ( ) const

Definition at line 103 of file sampleiterator.cpp.

103 {
104 if (shape_table_ != nullptr) {
105 const UnicharAndFonts* shape_entry = GetShapeEntry();
106 int char_id = shape_entry->unichar_id;
107 int font_id = shape_entry->font_ids[shape_font_index_];
108 return *sample_set_->GetSample(font_id, char_id, sample_index_);
109 } else {
110 return *sample_set_->GetSample(shape_index_);
111 }
112}
const TrainingSample * GetSample(int index) const

◆ GetSparseClassID()

int tesseract::SampleIterator::GetSparseClassID ( ) const

Definition at line 150 of file sampleiterator.cpp.

150 {
151 return shape_table_ != nullptr ? shape_index_ : GetSample().class_id();
152}
const TrainingSample & GetSample() const
UNICHAR_ID class_id() const

◆ GlobalSampleIndex()

int tesseract::SampleIterator::GlobalSampleIndex ( ) const

Definition at line 127 of file sampleiterator.cpp.

127 {
128 if (shape_table_ != nullptr) {
129 const UnicharAndFonts* shape_entry = GetShapeEntry();
130 int char_id = shape_entry->unichar_id;
131 int font_id = shape_entry->font_ids[shape_font_index_];
132 return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);
133 } else {
134 return shape_index_;
135 }
136}
int GlobalSampleIndex(int font_id, int class_id, int index) const

◆ Init()

void tesseract::SampleIterator::Init ( const IndexMapBiDi * charset_map,
const ShapeTable * shape_table,
bool  randomize,
TrainingSampleSet * sample_set 
)

Definition at line 47 of file sampleiterator.cpp.

50 {
51 Clear();
52 charset_map_ = charset_map;
53 shape_table_ = shape_table;
54 sample_set_ = sample_set;
55 randomize_ = randomize;
56 if (shape_table_ == nullptr && charset_map_ != nullptr) {
57 // The caller wishes to iterate by class. The easiest way to do this
58 // is to create a dummy shape_table_ that we will own.
59 int num_fonts = sample_set_->NumFonts();
60 owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
61 int charsetsize = sample_set_->unicharset().size();
62 for (int c = 0; c < charsetsize; ++c) {
63 // We always add a shape for each character to keep the index in sync
64 // with the unichar_id.
65 int shape_id = owned_shape_table_->AddShape(c, 0);
66 for (int f = 1; f < num_fonts; ++f) {
67 if (sample_set_->NumClassSamples(f, c, true) > 0) {
68 owned_shape_table_->AddToShape(shape_id, c, f);
69 }
70 }
71 }
72 shape_table_ = owned_shape_table_;
73 }
74 if (shape_table_ != nullptr) {
75 num_shapes_ = shape_table_->NumShapes();
76 } else {
77 num_shapes_ = randomize ? sample_set_->num_samples()
78 : sample_set_->num_raw_samples();
79 }
80 Begin();
81}
int size() const
Definition: unicharset.h:341
const TrainingSampleSet * sample_set() const
const IndexMapBiDi & charset_map() const
const ShapeTable * shape_table() const
int NumShapes() const
Definition: shapetable.h:274
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:369
int NumClassSamples(int font_id, int class_id, bool randomize) const
const UNICHARSET & unicharset() const

◆ MapSampleFeatures()

void tesseract::SampleIterator::MapSampleFeatures ( const IntFeatureMap & feature_map)

Definition at line 211 of file sampleiterator.cpp.

211 {
212 for (Begin(); !AtEnd(); Next()) {
213 TrainingSample* sample = MutableSample();
214 sample->MapFeatures(feature_map);
215 }
216}
Definition: cluster.h:32
TrainingSample * MutableSample() const

◆ MutableSample()

TrainingSample * tesseract::SampleIterator::MutableSample ( ) const

Definition at line 114 of file sampleiterator.cpp.

114 {
115 if (shape_table_ != nullptr) {
116 const UnicharAndFonts* shape_entry = GetShapeEntry();
117 int char_id = shape_entry->unichar_id;
118 int font_id = shape_entry->font_ids[shape_font_index_];
119 return sample_set_->MutableSample(font_id, char_id, sample_index_);
120 } else {
121 return sample_set_->mutable_sample(shape_index_);
122 }
123}
TrainingSample * mutable_sample(int index)
TrainingSample * MutableSample(int font_id, int class_id, int index)

◆ Next()

void tesseract::SampleIterator::Next ( )

Definition at line 156 of file sampleiterator.cpp.

156 {
157 if (shape_table_ != nullptr) {
158 // Next sample in this class/font combination.
159 ++sample_index_;
160 if (sample_index_ < num_samples_)
161 return;
162 // Next font in this class in this shape.
163 sample_index_ = 0;
164 do {
165 ++shape_font_index_;
166 if (shape_font_index_ >= num_shape_fonts_) {
167 // Next unichar in this shape.
168 shape_font_index_ = 0;
169 ++shape_char_index_;
170 if (shape_char_index_ >= num_shape_chars_) {
171 // Find the next shape that is mapped in the charset_map_.
172 shape_char_index_ = 0;
173 do {
174 ++shape_index_;
175 } while (shape_index_ < num_shapes_ &&
176 charset_map_ != nullptr &&
177 charset_map_->SparseToCompact(shape_index_) < 0);
178 if (shape_index_ >= num_shapes_)
179 return; // The end.
180 num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
181 }
182 }
183 const UnicharAndFonts* shape_entry = GetShapeEntry();
184 num_shape_fonts_ = shape_entry->font_ids.size();
185 int char_id = shape_entry->unichar_id;
186 int font_id = shape_entry->font_ids[shape_font_index_];
187 num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
188 } while (num_samples_ == 0);
189 } else {
190 // We are just iterating over the samples.
191 ++shape_index_;
192 }
193}
int size() const
Definition: shapetable.h:199
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319

◆ NormalizeSamples()

double tesseract::SampleIterator::NormalizeSamples ( )

Definition at line 233 of file sampleiterator.cpp.

233 {
234 double total_weight = 0.0;
235 int sample_count = 0;
236 for (Begin(); !AtEnd(); Next()) {
237 const TrainingSample& sample = GetSample();
238 total_weight += sample.weight();
239 ++sample_count;
240 }
241 // Normalize samples.
242 double min_assigned_sample_weight = 1.0;
243 if (total_weight > 0.0) {
244 for (Begin(); !AtEnd(); Next()) {
245 TrainingSample* sample = MutableSample();
246 double weight = sample->weight() / total_weight;
247 if (weight < min_assigned_sample_weight)
248 min_assigned_sample_weight = weight;
249 sample->set_weight(weight);
250 }
251 }
252 return min_assigned_sample_weight;
253}

◆ sample_set()

const TrainingSampleSet * tesseract::SampleIterator::sample_set ( ) const
inline

Definition at line 144 of file sampleiterator.h.

144 {
145 return sample_set_;
146 }

◆ shape_table()

const ShapeTable * tesseract::SampleIterator::shape_table ( ) const
inline

Definition at line 140 of file sampleiterator.h.

140 {
141 return shape_table_;
142 }

◆ SparseCharsetSize()

int tesseract::SampleIterator::SparseCharsetSize ( ) const

Definition at line 202 of file sampleiterator.cpp.

202 {
203 return charset_map_ != nullptr
204 ? charset_map_->SparseSize()
205 : (shape_table_ != nullptr ? shape_table_->NumShapes()
206 : sample_set_->charsetsize());
207}
int SparseSize() const override
Definition: indexmapbidi.h:142

◆ UniformSamples()

int tesseract::SampleIterator::UniformSamples ( )

Definition at line 220 of file sampleiterator.cpp.

220 {
221 int num_good_samples = 0;
222 for (Begin(); !AtEnd(); Next()) {
223 TrainingSample* sample = MutableSample();
224 sample->set_weight(1.0);
225 ++num_good_samples;
226 }
227 NormalizeSamples();
228 return num_good_samples;
229}

The documentation for this class was generated from the following files: