tesseract 4.1.1
Loading...
Searching...
No Matches
trainingsampleset.h
Go to the documentation of this file.
1// Copyright 2010 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
15
16#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H_
17#define TESSERACT_TRAINING_TRAININGSAMPLESET_H_
18
19#include "bitvector.h"
20#include "genericvector.h"
21#include "indexmapbidi.h"
22#include "matrix.h"
23#include "shapetable.h"
24#include "trainingsample.h"
25
26class UNICHARSET;
27
28namespace tesseract {
29
30struct FontInfo;
31class FontInfoTable;
32class IntFeatureMap;
33class IntFeatureSpace;
34class TrainingSample;
35struct UnicharAndFonts;
36
37// Collection of TrainingSample used for training or testing a classifier.
38// Provides several useful methods to operate on the collection as a whole,
39// including outlier detection and deletion, providing access by font and
40// class, finding the canonical sample, finding the "cloud" features (OR of
41// all features in all samples), replication of samples, caching of distance
42// metrics.
44 public:
47
48 // Writes to the given file. Returns false in case of error.
49 bool Serialize(FILE* fp) const;
50 // Reads from the given file. Returns false in case of error.
51 // If swap is true, assumes a big/little-endian swap is needed.
52 bool DeSerialize(bool swap, FILE* fp);
53
54 // Accessors
55 int num_samples() const {
56 return samples_.size();
57 }
58 int num_raw_samples() const {
59 return num_raw_samples_;
60 }
61 int NumFonts() const {
62 return font_id_map_.SparseSize();
63 }
64 const UNICHARSET& unicharset() const {
65 return unicharset_;
66 }
67 int charsetsize() const {
68 return unicharset_size_;
69 }
71 return fontinfo_table_;
72 }
73
74 // Loads an initial unicharset, or sets one up if the file cannot be read.
75 void LoadUnicharset(const char* filename);
76
77 // Adds a character sample to this sample set.
78 // If the unichar is not already in the local unicharset, it is added.
79 // Returns the unichar_id of the added sample, from the local unicharset.
80 int AddSample(const char* unichar, TrainingSample* sample);
81 // Adds a character sample to this sample set with the given unichar_id,
82 // which must correspond to the local unicharset (in this).
83 void AddSample(int unichar_id, TrainingSample* sample);
84
85 // Returns the number of samples for the given font,class pair.
86 // If randomize is true, returns the number of samples accessible
87 // with randomizing on. (Increases the number of samples if small.)
88 // OrganizeByFontAndClass must have been already called.
89 int NumClassSamples(int font_id, int class_id, bool randomize) const;
90
91 // Gets a sample by its index.
92 const TrainingSample* GetSample(int index) const;
93
94 // Gets a sample by its font, class, index.
95 // OrganizeByFontAndClass must have been already called.
96 const TrainingSample* GetSample(int font_id, int class_id, int index) const;
97
98 // Gets a sample by its font, class, index. Does not randomize.
99 // OrganizeByFontAndClass must have been already called.
100 TrainingSample* MutableSample(int font_id, int class_id, int index);
101
102 // Returns a string debug representation of the given sample:
103 // font, unichar_str, bounding box, page.
105
106 // Gets the combined set of features used by all the samples of the given
107 // font/class combination.
108 const BitVector& GetCloudFeatures(int font_id, int class_id) const;
109 // Gets the indexed features of the canonical sample of the given
110 // font/class combination.
111 const GenericVector<int>& GetCanonicalFeatures(int font_id,
112 int class_id) const;
113
114 // Returns the distance between the given UnicharAndFonts pair.
115 // If matched_fonts, only matching fonts are considered, unless that yields
116 // the empty set.
117 // OrganizeByFontAndClass must have been already called.
118 float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
119 bool matched_fonts, const IntFeatureMap& feature_map);
120
121 // Returns the distance between the given pair of font/class pairs.
122 // Finds in cache or computes and caches.
123 // OrganizeByFontAndClass must have been already called.
124 float ClusterDistance(int font_id1, int class_id1,
125 int font_id2, int class_id2,
126 const IntFeatureMap& feature_map);
127
128 // Computes the distance between the given pair of font/class pairs.
129 float ComputeClusterDistance(int font_id1, int class_id1,
130 int font_id2, int class_id2,
131 const IntFeatureMap& feature_map) const;
132
133 // Returns the number of canonical features of font/class 2 for which
134 // neither the feature nor any of its near neighbors occurs in the cloud
135 // of font/class 1. Each such feature is a reliable separation between
136 // the classes, ASSUMING that the canonical sample is sufficiently
137 // representative that every sample has a feature near that particular
138 // feature. To check that this is so on the fly would be prohibitively
139 // expensive, but it might be possible to pre-qualify the canonical features
140 // to include only those for which this assumption is true.
141 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
142 // first, or the results will be nonsense.
143 int ReliablySeparable(int font_id1, int class_id1,
144 int font_id2, int class_id2,
145 const IntFeatureMap& feature_map,
146 bool thorough) const;
147
148
149 // Returns the total index of the requested sample.
150 // OrganizeByFontAndClass must have been already called.
151 int GlobalSampleIndex(int font_id, int class_id, int index) const;
152
153 // Gets the canonical sample for the given font, class pair.
154 // ComputeCanonicalSamples must have been called first.
155 const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
156 // Gets the max distance for the given canonical sample.
157 // ComputeCanonicalSamples must have been called first.
158 float GetCanonicalDist(int font_id, int class_id) const;
159
160 // Returns a mutable pointer to the sample with the given index.
162 return samples_[index];
163 }
164 // Gets ownership of the sample with the given index, removing it from this.
166 TrainingSample* sample = samples_[index];
167 samples_[index] = nullptr;
168 return sample;
169 }
170
171 // Generates indexed features for all samples with the supplied feature_space.
172 void IndexFeatures(const IntFeatureSpace& feature_space);
173
174 // Marks the given sample for deletion.
175 // Deletion is actually completed by DeleteDeadSamples.
177
178 // Deletes all samples with a negative sample index marked by KillSample.
179 // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
180 // must be called after as the samples have been renumbered.
181 void DeleteDeadSamples();
182
183 // Callback function returns true if the given sample is to be deleted, due
184 // to having a negative classid.
186
187 // Construct an array to access the samples by font,class pair.
189
190 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
191 // index for the font_class_array_.
192 void SetupFontIdMap();
193
194 // Finds the sample for each font, class pair that has least maximum
195 // distance to all the other samples of the same font, class.
196 // OrganizeByFontAndClass must have been already called.
197 void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
198
199 // Replicates the samples to a minimum frequency defined by
200 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
201 // After replication, the replicated samples are perturbed slightly, but
202 // in a predictable and repeatable way.
203 // Use after OrganizeByFontAndClass().
205
206 // Caches the indexed features of the canonical samples.
207 // ComputeCanonicalSamples must have been already called.
209 // Computes the combined set of features used by all the samples of each
210 // font/class combination. Use after ReplicateAndRandomizeSamples.
211 void ComputeCloudFeatures(int feature_space_size);
212
213 // Adds all fonts of the given class to the shape.
214 void AddAllFontsForClass(int class_id, Shape* shape) const;
215
216 // Display the samples with the given indexed feature that also match
217 // the given shape.
218 void DisplaySamplesWithFeature(int f_index, const Shape& shape,
219 const IntFeatureSpace& feature_space,
220 ScrollView::Color color,
221 ScrollView* window) const;
222
223 private:
224 // Struct to store a triplet of unichar, font, distance in the distance cache.
225 struct FontClassDistance {
226 int unichar_id;
227 int font_id; // Real font id.
228 float distance;
229 };
230 // Simple struct to store information related to each font/class combination.
231 struct FontClassInfo {
232 FontClassInfo();
233
234 // Writes to the given file. Returns false in case of error.
235 bool Serialize(FILE* fp) const;
236 // Reads from the given file. Returns false in case of error.
237 // If swap is true, assumes a big/little-endian swap is needed.
238 bool DeSerialize(bool swap, FILE* fp);
239
240 // Number of raw samples.
241 int32_t num_raw_samples;
242 // Index of the canonical sample.
243 int32_t canonical_sample;
244 // Max distance of the canonical sample from any other.
245 float canonical_dist;
246 // Sample indices for the samples, including replicated.
248
249 // Non-serialized cache data.
250 // Indexed features of the canonical sample.
251 GenericVector<int> canonical_features;
252 // The mapped features of all the samples.
253 BitVector cloud_features;
254
255 // Caches for ClusterDistance.
256 // Caches for other fonts but matching this unichar. -1 indicates not set.
257 // Indexed by compact font index from font_id_map_.
258 GenericVector<float> font_distance_cache;
259 // Caches for other unichars but matching this font. -1 indicates not set.
260 GenericVector<float> unichar_distance_cache;
261 // Cache for the rest (non matching font and unichar.)
262 // A cache of distances computed by ReliablySeparable.
264 };
265
266 PointerVector<TrainingSample> samples_;
267 // Number of samples before replication/randomization.
268 int num_raw_samples_;
269 // Character set we are training for.
270 UNICHARSET unicharset_;
271 // Character set size to which the 2-d arrays below refer.
272 int unicharset_size_;
273 // Map to allow the font_class_array_ below to be compact.
274 // The sparse space is the real font_id, used in samples_ .
275 // The compact space is an index to font_class_array_
276 IndexMapBiDi font_id_map_;
277 // A 2-d array of FontClassInfo holding information related to each
278 // (font_id, class_id) pair.
279 GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
280
281 // Reference to the fontinfo_table_ in MasterTrainer. Provides names
282 // for font_ids in the samples. Not serialized!
283 const FontInfoTable& fontinfo_table_;
284};
285
286} // namespace tesseract.
287
288
289#endif // TESSERACT_TRAINING_TRAININGSAMPLESET_H_
int SparseSize() const override
Definition: indexmapbidi.h:142
Definition: strngs.h:45
Definition: cluster.h:32
STRING SampleToString(const TrainingSample &sample) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
int NumClassSamples(int font_id, int class_id, bool randomize) const
int AddSample(const char *unichar, TrainingSample *sample)
TrainingSample * extract_sample(int index)
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
bool Serialize(FILE *fp) const
const FontInfoTable & fontinfo_table() const
void AddAllFontsForClass(int class_id, Shape *shape) const
void IndexFeatures(const IntFeatureSpace &feature_space)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
void KillSample(TrainingSample *sample)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
void ComputeCloudFeatures(int feature_space_size)
bool DeSerialize(bool swap, FILE *fp)
void LoadUnicharset(const char *filename)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
const UNICHARSET & unicharset() const
int GlobalSampleIndex(int font_id, int class_id, int index) const
TrainingSample * mutable_sample(int index)
bool DeleteableSample(const TrainingSample *sample)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
float GetCanonicalDist(int font_id, int class_id) const
const TrainingSample * GetSample(int index) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
TrainingSample * MutableSample(int font_id, int class_id, int index)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)