tesseract 4.1.1
Loading...
Searching...
No Matches
sampleiterator.h
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
15
16
17#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
18#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
19
20namespace tesseract {
21
22class IndexMapBiDi;
23class IntFeatureMap;
24class ShapeTable;
25class TrainingSample;
26class TrainingSampleSet;
27struct UnicharAndFonts;
28
29// Iterator class to encapsulate the complex iteration involved in getting
30// all samples of all shapes needed for a classification problem.
31//
32// =====INPUTS TO Init FUNCTION=====
33// The charset_map defines a subset of the sample_set classes (with a nullptr
34// shape_table, or the shape_table classes if not nullptr.)
35//
36// The shape_table (if not nullptr) defines the mapping from shapes to
37// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
38//
39// The sample_set holds the samples and provides indexed access to samples
40// of font_id/class_id pairs.
41//
42// If randomize is true, the samples are perturbed slightly, but the
43// perturbation is guaranteed to be the same for multiple identical
44// iterations.
45//
46// =====DIFFERENT COMBINATIONS OF INPUTS=====
47// nullptr shape_table:
48// Without a shape_table, everything works in UNICHAR_IDs.
49//
50// nullptr shape_table, nullptr charset_map:
51// Iterations simply run over the samples in the order the samples occur in the
52// input files.
53// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
54//
55// nullptr shape_table, non-nullptr charset_map:
56// When shape_table is nullptr, the charset_map indexes unichar_ids directly,
57// and an iteration returns all samples of all chars in the charset_map, which
58// is a subset of the full unicharset.
59// The iteration will be in groups of the same unichar_id, in the order
60// defined by the charset_map.
61// GetCompactClassID returns the charset_map index of a sample, and
62// GetSparseClassID returns the sample UNICHAR_ID.
63//
64// Non-nullptr shape_table:
65// With a shape_table, samples are grouped according to the shape_table, so
66// multiple UNICHAR_IDs and fonts may be grouped together, and everything
67// works in shape_ids.
68//
69// Non-nullptr shape_table, nullptr charset_map:
70// Iterations simply run over the samples in the order of shape_id.
71// GetCompactClassID and GetSparseClassID both return the shape_id.
72// (If you want the unichar_id or font_id, the sample still has them.)
73//
74// Non-nullptr shape_table, non-nullptr charset_map:
75// When shape_table is not nullptr, the charset_map indexes and subsets shapes in
76// the shape_table, and iterations will be in shape_table order, not
77// charset_map order.
78// GetCompactClassID returns the charset_map index of a shape, and
79// GetSparseClassID returns the shape_id.
80//
81// =====What is SampleIterator good for?=====
82// Inside a classifier training module, the SampleIterator has abstracted away
83// all the different modes above.
84// Use the following iteration to train your classifier:
85// for (it.Begin(); !it.AtEnd(); it.Next()) {
86// const TrainingSample& sample = it.GetSample();
87// int class_id = it.GetCompactClassID();
88// Your classifier may or may not be dealing with a shape_table, and may be
89// dealing with some subset of the character/shape set. It doesn't need to
90// know and shouldn't care. It is just learning shapes with compact class ids
91// in the range [0, it.CompactCharsetSize()).
92class SampleIterator {
93 public:
94 SampleIterator();
95 ~SampleIterator();
96
97 void Clear();
98
99 // See class comment for arguments.
// Either charset_map or shape_table (or both) may be nullptr; the class
// comment describes the iteration order for each combination.
100 void Init(const IndexMapBiDi* charset_map,
101 const ShapeTable* shape_table,
102 bool randomize,
103 TrainingSampleSet* sample_set);
104
105 // Iterator functions designed for use with a simple for loop:
106 // for (it.Begin(); !it.AtEnd(); it.Next()) {
107 // const TrainingSample& sample = it.GetSample();
108 // int class_id = it.GetCompactClassID();
109 // ...
110 // }
111 void Begin();
112 bool AtEnd() const;
113 const TrainingSample& GetSample() const;
114 TrainingSample* MutableSample() const;
115 // Returns the total index (from the original set of samples) of the current
116 // sample.
117 int GlobalSampleIndex() const;
118 // Returns the index of the current sample in compact charset space, so
119 // in a 2-class problem between x and y, the returned indices will all be
120 // 0 or 1, and have nothing to do with the unichar_ids.
121 // If the charset_map_ is nullptr, then this is equal to GetSparseClassID().
122 int GetCompactClassID() const;
123 // Returns the index of the current sample in sparse charset space, so
124 // in a 2-class problem between x and y, the returned indices will all be
125 // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
126 // with a shape_table_.
127 int GetSparseClassID() const;
128 // Moves on to the next indexable sample. If the end is reached, leaves
129 // the state such that AtEnd() is true.
130 void Next();
131
132 // Returns the size of the compact charset space.
133 int CompactCharsetSize() const;
134 // Returns the size of the sparse charset space.
135 int SparseCharsetSize() const;
136
// Returns the charset map by reference.
// NOTE: charset_map_ is dereferenced unconditionally here, while the class
// comment allows iterating with a nullptr charset_map, so only call this
// when a charset map is actually in use.
137 const IndexMapBiDi& charset_map() const {
138 return *charset_map_;
139 }
// Returns the stored shape_table_ pointer without dereferencing it.
// NOTE(review): presumably nullptr when iterating without a shape table --
// confirm against Init before dereferencing the result.
140 const ShapeTable* shape_table() const {
141 return shape_table_;
142 }
143 // Sample set operations.
// Returns the set of samples that this iterator runs over, as a pointer to
// const.
144 const TrainingSampleSet* sample_set() const {
145 return sample_set_;
146 }
147
148 // A set of functions that do something to all the samples accessed by the
149 // iterator, as it is currently setup.
150
151 // Apply the supplied feature_space/feature_map transform to all samples
152 // accessed by this iterator.
153 void MapSampleFeatures(const IntFeatureMap& feature_map);
154
155 // Adjust the weights of all the samples to be uniform in the given charset.
156 // Returns the number of samples in the iterator.
157 int UniformSamples();
158
159 // Normalize the weights of all the samples defined by the iterator so they
160 // sum to 1. Returns the minimum assigned sample weight.
161 double NormalizeSamples();
162
163 private:
164 // Helper returns the current UnicharAndFonts shape_entry.
165 const UnicharAndFonts* GetShapeEntry() const;
166
167 // Map to subset the actual charset space.
168 const IndexMapBiDi* charset_map_;
169 // Shape table to recombine character classes into shapes.
170 const ShapeTable* shape_table_;
171 // The samples to iterate over.
172 TrainingSampleSet* sample_set_;
173 // Flag to control randomizing the sample features.
174 bool randomize_;
175 // Shape table owned by this iterator, used to iterate character classes.
176 ShapeTable* owned_shape_table_;
177
178 // Top-level iteration. Shape index in sparse charset_map space.
179 int shape_index_;
180 int num_shapes_;
181 // Index to the character class within a shape.
182 int shape_char_index_;
183 int num_shape_chars_;
184 // Index to the font within a shape/class pair.
185 int shape_font_index_;
186 int num_shape_fonts_;
187 // The lowest level iteration. sample_index_/num_samples_ counts samples
188 // in the current shape/class/font combination.
189 int sample_index_;
190 int num_samples_;
191};
192
193} // namespace tesseract.
194
195#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
const TrainingSample & GetSample() const
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
const TrainingSampleSet * sample_set() const
void MapSampleFeatures(const IntFeatureMap &feature_map)
const IndexMapBiDi & charset_map() const
TrainingSample * MutableSample() const
const ShapeTable * shape_table() const