tesseract 4.1.1
Loading...
Searching...
No Matches
trainingsampleset.cpp
Go to the documentation of this file.
1// Copyright 2010 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
15
16#include "trainingsampleset.h"
17#include "allheaders.h"
18#include "boxread.h"
19#include "fontinfo.h"
20#include "indexmapbidi.h"
21#include "intfeaturedist.h"
22#include "intfeaturemap.h"
23#include "intfeaturespace.h"
24#include "shapetable.h"
25#include "trainingsample.h"
26#include "unicity_table.h"
27
28#include <algorithm>
29
30namespace tesseract {
31
32const int kTestChar = -1; // 37;
33// Max number of distances to compute the squared way
34const int kSquareLimit = 25;
35// Prime numbers for subsampling distances.
36const int kPrime1 = 17;
37const int kPrime2 = 13;
38
39TrainingSampleSet::FontClassInfo::FontClassInfo()
40 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
41}
42
43// Writes to the given file. Returns false in case of error.
44bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
45 if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
46 return false;
47 if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
48 return false;
49 if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
50 if (!samples.Serialize(fp)) return false;
51 return true;
52}
53// Reads from the given file. Returns false in case of error.
54// If swap is true, assumes a big/little-endian swap is needed.
55bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
56 if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
57 return false;
58 if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
59 return false;
60 if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
61 if (!samples.DeSerialize(swap, fp)) return false;
62 if (swap) {
63 ReverseN(&num_raw_samples, sizeof(num_raw_samples));
64 ReverseN(&canonical_sample, sizeof(canonical_sample));
65 ReverseN(&canonical_dist, sizeof(canonical_dist));
66 }
67 return true;
68}
69
70TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table)
71 : num_raw_samples_(0), unicharset_size_(0),
72 font_class_array_(nullptr), fontinfo_table_(font_table) {
73}
74
76 delete font_class_array_;
77}
78
79// Writes to the given file. Returns false in case of error.
80bool TrainingSampleSet::Serialize(FILE* fp) const {
81 if (!samples_.Serialize(fp)) return false;
82 if (!unicharset_.save_to_file(fp)) return false;
83 if (!font_id_map_.Serialize(fp)) return false;
84 int8_t not_null = font_class_array_ != nullptr;
85 if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
86 if (not_null) {
87 if (!font_class_array_->SerializeClasses(fp)) return false;
88 }
89 return true;
90}
91
92// Reads from the given file. Returns false in case of error.
93// If swap is true, assumes a big/little-endian swap is needed.
94bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
95 if (!samples_.DeSerialize(swap, fp)) return false;
96 num_raw_samples_ = samples_.size();
97 if (!unicharset_.load_from_file(fp)) return false;
98 if (!font_id_map_.DeSerialize(swap, fp)) return false;
99 delete font_class_array_;
100 font_class_array_ = nullptr;
101 int8_t not_null;
102 if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
103 if (not_null) {
104 FontClassInfo empty;
105 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
106 if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
107 }
108 unicharset_size_ = unicharset_.size();
109 return true;
110}
111
112// Load an initial unicharset, or set one up if the file cannot be read.
113void TrainingSampleSet::LoadUnicharset(const char* filename) {
114 if (!unicharset_.load_from_file(filename)) {
115 tprintf("Failed to load unicharset from file %s\n"
116 "Building unicharset from scratch...\n",
117 filename);
118 unicharset_.clear();
119 // Add special characters as they were removed by the clear.
120 UNICHARSET empty;
121 unicharset_.AppendOtherUnicharset(empty);
122 }
123 unicharset_size_ = unicharset_.size();
124}
125
126// Adds a character sample to this sample set.
127// If the unichar is not already in the local unicharset, it is added.
128// Returns the unichar_id of the added sample, from the local unicharset.
130 if (!unicharset_.contains_unichar(unichar)) {
131 unicharset_.unichar_insert(unichar);
132 if (unicharset_.size() > MAX_NUM_CLASSES) {
133 tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
134 "greater than MAX_NUM_CLASSES\n");
135 return -1;
136 }
137 }
138 UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
139 AddSample(char_id, sample);
140 return char_id;
141}
142
143// Adds a character sample to this sample set with the given unichar_id,
144// which must correspond to the local unicharset (in this).
146 sample->set_class_id(unichar_id);
147 samples_.push_back(sample);
148 num_raw_samples_ = samples_.size();
149 unicharset_size_ = unicharset_.size();
150}
151
152// Returns the number of samples for the given font,class pair.
153// If randomize is true, returns the number of samples accessible
154// with randomizing on. (Increases the number of samples if small.)
155// OrganizeByFontAndClass must have been already called.
156int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
157 bool randomize) const {
158 ASSERT_HOST(font_class_array_ != nullptr);
159 if (font_id < 0 || class_id < 0 ||
160 font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
161 // There are no samples because the font or class doesn't exist.
162 return 0;
163 }
164 int font_index = font_id_map_.SparseToCompact(font_id);
165 if (font_index < 0)
166 return 0; // The font has no samples.
167 if (randomize)
168 return (*font_class_array_)(font_index, class_id).samples.size();
169 else
170 return (*font_class_array_)(font_index, class_id).num_raw_samples;
171}
172
173// Gets a sample by its index.
175 return samples_[index];
176}
177
178// Gets a sample by its font, class, index.
179// OrganizeByFontAndClass must have been already called.
180const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
181 int index) const {
182 ASSERT_HOST(font_class_array_ != nullptr);
183 int font_index = font_id_map_.SparseToCompact(font_id);
184 if (font_index < 0) return nullptr;
185 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
186 return samples_[sample_index];
187}
188
189// Get a sample by its font, class, index. Does not randomize.
190// OrganizeByFontAndClass must have been already called.
192 int index) {
193 ASSERT_HOST(font_class_array_ != nullptr);
194 int font_index = font_id_map_.SparseToCompact(font_id);
195 if (font_index < 0) return nullptr;
196 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
197 return samples_[sample_index];
198}
199
200// Returns a string debug representation of the given sample:
201// font, unichar_str, bounding box, page.
203 STRING boxfile_str;
204 MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
205 sample.bounding_box(), sample.page_num(), &boxfile_str);
206 return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
207}
208
209// Gets the combined set of features used by all the samples of the given
210// font/class combination.
212 int font_id, int class_id) const {
213 int font_index = font_id_map_.SparseToCompact(font_id);
214 ASSERT_HOST(font_index >= 0);
215 return (*font_class_array_)(font_index, class_id).cloud_features;
216}
217// Gets the indexed features of the canonical sample of the given
218// font/class combination.
220 int font_id, int class_id) const {
221 int font_index = font_id_map_.SparseToCompact(font_id);
222 ASSERT_HOST(font_index >= 0);
223 return (*font_class_array_)(font_index, class_id).canonical_features;
224}
225
226// Returns the distance between the given UniCharAndFonts pair.
227// If matched_fonts, only matching fonts, are considered, unless that yields
228// the empty set.
229// OrganizeByFontAndClass must have been already called.
231 const UnicharAndFonts& uf2,
232 bool matched_fonts,
233 const IntFeatureMap& feature_map) {
234 int num_fonts1 = uf1.font_ids.size();
235 int c1 = uf1.unichar_id;
236 int num_fonts2 = uf2.font_ids.size();
237 int c2 = uf2.unichar_id;
238 double dist_sum = 0.0;
239 int dist_count = 0;
240 const bool debug = false;
241 if (matched_fonts) {
242 // Compute distances only where fonts match.
243 for (int i = 0; i < num_fonts1; ++i) {
244 int f1 = uf1.font_ids[i];
245 for (int j = 0; j < num_fonts2; ++j) {
246 int f2 = uf2.font_ids[j];
247 if (f1 == f2) {
248 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
249 ++dist_count;
250 }
251 }
252 }
253 } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
254 // Small enough sets to compute all the distances.
255 for (int i = 0; i < num_fonts1; ++i) {
256 int f1 = uf1.font_ids[i];
257 for (int j = 0; j < num_fonts2; ++j) {
258 int f2 = uf2.font_ids[j];
259 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
260 if (debug) {
261 tprintf("Cluster dist %d %d %d %d = %g\n",
262 f1, c1, f2, c2,
263 ClusterDistance(f1, c1, f2, c2, feature_map));
264 }
265 ++dist_count;
266 }
267 }
268 } else {
269 // Subsample distances, using the largest set once, and stepping through
270 // the smaller set so as to ensure that all the pairs are different.
271 int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
272 int index = 0;
273 int num_samples = std::max(num_fonts1, num_fonts2);
274 for (int i = 0; i < num_samples; ++i, index += increment) {
275 int f1 = uf1.font_ids[i % num_fonts1];
276 int f2 = uf2.font_ids[index % num_fonts2];
277 if (debug) {
278 tprintf("Cluster dist %d %d %d %d = %g\n",
279 f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
280 }
281 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
282 ++dist_count;
283 }
284 }
285 if (dist_count == 0) {
286 if (matched_fonts)
287 return UnicharDistance(uf1, uf2, false, feature_map);
288 return 0.0f;
289 }
290 return dist_sum / dist_count;
291}
292
293// Returns the distance between the given pair of font/class pairs.
294// Finds in cache or computes and caches.
295// OrganizeByFontAndClass must have been already called.
296float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
297 int font_id2, int class_id2,
298 const IntFeatureMap& feature_map) {
299 ASSERT_HOST(font_class_array_ != nullptr);
300 int font_index1 = font_id_map_.SparseToCompact(font_id1);
301 int font_index2 = font_id_map_.SparseToCompact(font_id2);
302 if (font_index1 < 0 || font_index2 < 0)
303 return 0.0f;
304 FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
305 if (font_id1 == font_id2) {
306 // Special case cache for speed.
307 if (fc_info.unichar_distance_cache.size() == 0)
308 fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
309 if (fc_info.unichar_distance_cache[class_id2] < 0) {
310 // Distance has to be calculated.
311 float result = ComputeClusterDistance(font_id1, class_id1,
312 font_id2, class_id2,
313 feature_map);
314 fc_info.unichar_distance_cache[class_id2] = result;
315 // Copy to the symmetric cache entry.
316 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
317 if (fc_info2.unichar_distance_cache.size() == 0)
318 fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
319 fc_info2.unichar_distance_cache[class_id1] = result;
320 }
321 return fc_info.unichar_distance_cache[class_id2];
322 } else if (class_id1 == class_id2) {
323 // Another special-case cache for equal class-id.
324 if (fc_info.font_distance_cache.size() == 0)
325 fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
326 -1.0f);
327 if (fc_info.font_distance_cache[font_index2] < 0) {
328 // Distance has to be calculated.
329 float result = ComputeClusterDistance(font_id1, class_id1,
330 font_id2, class_id2,
331 feature_map);
332 fc_info.font_distance_cache[font_index2] = result;
333 // Copy to the symmetric cache entry.
334 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
335 if (fc_info2.font_distance_cache.size() == 0)
336 fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
337 -1.0f);
338 fc_info2.font_distance_cache[font_index1] = result;
339 }
340 return fc_info.font_distance_cache[font_index2];
341 }
342 // Both font and class are different. Linear search for class_id2/font_id2
343 // in what is a hopefully short list of distances.
344 int cache_index = 0;
345 while (cache_index < fc_info.distance_cache.size() &&
346 (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
347 fc_info.distance_cache[cache_index].font_id != font_id2))
348 ++cache_index;
349 if (cache_index == fc_info.distance_cache.size()) {
350 // Distance has to be calculated.
351 float result = ComputeClusterDistance(font_id1, class_id1,
352 font_id2, class_id2,
353 feature_map);
354 FontClassDistance fc_dist = { class_id2, font_id2, result };
355 fc_info.distance_cache.push_back(fc_dist);
356 // Copy to the symmetric cache entry. We know it isn't there already, as
357 // we always copy to the symmetric entry.
358 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
359 fc_dist.unichar_id = class_id1;
360 fc_dist.font_id = font_id1;
361 fc_info2.distance_cache.push_back(fc_dist);
362 }
363 return fc_info.distance_cache[cache_index].distance;
364}
365
366// Computes the distance between the given pair of font/class pairs.
368 int font_id1, int class_id1, int font_id2, int class_id2,
369 const IntFeatureMap& feature_map) const {
370 int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
371 feature_map, false);
372 dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
373 feature_map, false);
374 int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
375 denominator += GetCanonicalFeatures(font_id2, class_id2).size();
376 return static_cast<float>(dist) / denominator;
377}
378
379// Helper to add a feature and its near neighbors to the good_features.
380// levels indicates how many times to compute the offset features of what is
381// already there. This is done by iteration rather than recursion.
382static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
383 GenericVector<int>* good_features) {
384 int prev_num_features = 0;
385 good_features->push_back(f);
386 int num_features = 1;
387 for (int level = 0; level < levels; ++level) {
388 for (int i = prev_num_features; i < num_features; ++i) {
389 int feature = (*good_features)[i];
390 for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
391 if (dir == 0) continue;
392 int f1 = feature_map.OffsetFeature(feature, dir);
393 if (f1 >= 0) {
394 good_features->push_back(f1);
395 }
396 }
397 }
398 prev_num_features = num_features;
399 num_features = good_features->size();
400 }
401}
402
403// Returns the number of canonical features of font/class 2 for which
404// neither the feature nor any of its near neighbors occurs in the cloud
405// of font/class 1. Each such feature is a reliable separation between
406// the classes, ASSUMING that the canonical sample is sufficiently
407// representative that every sample has a feature near that particular
408// feature. To check that this is so on the fly would be prohibitively
409// expensive, but it might be possible to pre-qualify the canonical features
410// to include only those for which this assumption is true.
411// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
412// first, or the results will be nonsense.
413int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
414 int font_id2, int class_id2,
415 const IntFeatureMap& feature_map,
416 bool thorough) const {
417 int result = 0;
418 const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
419 if (sample2 == nullptr)
420 return 0; // There are no canonical features.
421 const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
422 class_id2);
423 const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
424 if (cloud1.size() == 0)
425 return canonical2.size(); // There are no cloud features.
426
427 // Find a canonical2 feature that is not in cloud1.
428 for (int f = 0; f < canonical2.size(); ++f) {
429 const int feature = canonical2[f];
430 if (cloud1[feature])
431 continue;
432 // Gather the near neighbours of f.
433 GenericVector<int> good_features;
434 AddNearFeatures(feature_map, feature, 1, &good_features);
435 // Check that none of the good_features are in the cloud.
436 int i;
437 for (i = 0; i < good_features.size(); ++i) {
438 int good_f = good_features[i];
439 if (cloud1[good_f]) {
440 break;
441 }
442 }
443 if (i < good_features.size())
444 continue; // Found one in the cloud.
445 ++result;
446 }
447 return result;
448}
449
450// Returns the total index of the requested sample.
451// OrganizeByFontAndClass must have been already called.
452int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
453 int index) const {
454 ASSERT_HOST(font_class_array_ != nullptr);
455 int font_index = font_id_map_.SparseToCompact(font_id);
456 if (font_index < 0) return -1;
457 return (*font_class_array_)(font_index, class_id).samples[index];
458}
459
460// Gets the canonical sample for the given font, class pair.
461// ComputeCanonicalSamples must have been called first.
463 int font_id, int class_id) const {
464 ASSERT_HOST(font_class_array_ != nullptr);
465 int font_index = font_id_map_.SparseToCompact(font_id);
466 if (font_index < 0) return nullptr;
467 const int sample_index = (*font_class_array_)(font_index,
468 class_id).canonical_sample;
469 return sample_index >= 0 ? samples_[sample_index] : nullptr;
470}
471
472// Gets the max distance for the given canonical sample.
473// ComputeCanonicalSamples must have been called first.
474float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
475 ASSERT_HOST(font_class_array_ != nullptr);
476 int font_index = font_id_map_.SparseToCompact(font_id);
477 if (font_index < 0) return 0.0f;
478 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
479 return (*font_class_array_)(font_index, class_id).canonical_dist;
480 else
481 return 0.0f;
482}
483
484// Generates indexed features for all samples with the supplied feature_space.
486 for (int s = 0; s < samples_.size(); ++s)
487 samples_[s]->IndexFeatures(feature_space);
488}
489
490// Marks the given sample index for deletion.
491// Deletion is actually completed by DeleteDeadSamples.
493 sample->set_sample_index(-1);
494}
495
496// Deletes all samples with zero features marked by KillSample.
498 samples_.compact(
500 num_raw_samples_ = samples_.size();
501 // Samples must be re-organized now we have deleted a few.
502}
503
504// Callback function returns true if the given sample is to be deleted, due
505// to having a negative classid.
507 return sample == nullptr || sample->class_id() < 0;
508}
509
510// Construct an array to access the samples by font,class pair.
512 // Font indexes are sparse, so we used a map to compact them, so we can
513 // have an efficient 2-d array of fonts and character classes.
515 int compact_font_size = font_id_map_.CompactSize();
516 // Get a 2-d array of generic vectors.
517 delete font_class_array_;
518 FontClassInfo empty;
519 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
520 compact_font_size, unicharset_size_, empty);
521 for (int s = 0; s < samples_.size(); ++s) {
522 int font_id = samples_[s]->font_id();
523 int class_id = samples_[s]->class_id();
524 if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
525 tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
526 font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
527 s);
528 }
529 ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
530 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
531 int font_index = font_id_map_.SparseToCompact(font_id);
532 (*font_class_array_)(font_index, class_id).samples.push_back(s);
533 }
534 // Set the num_raw_samples member of the FontClassInfo, to set the boundary
535 // between the raw samples and the replicated ones.
536 for (int f = 0; f < compact_font_size; ++f) {
537 for (int c = 0; c < unicharset_size_; ++c)
538 (*font_class_array_)(f, c).num_raw_samples =
539 (*font_class_array_)(f, c).samples.size();
540 }
541 // This is the global number of samples and also marks the boundary between
542 // real and replicated samples.
543 num_raw_samples_ = samples_.size();
544}
545
546// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
547// index for the font_class_array_.
549 // Number of samples for each font_id.
550 GenericVector<int> font_counts;
551 for (int s = 0; s < samples_.size(); ++s) {
552 const int font_id = samples_[s]->font_id();
553 while (font_id >= font_counts.size())
554 font_counts.push_back(0);
555 ++font_counts[font_id];
556 }
557 font_id_map_.Init(font_counts.size(), false);
558 for (int f = 0; f < font_counts.size(); ++f) {
559 font_id_map_.SetMap(f, font_counts[f] > 0);
560 }
561 font_id_map_.Setup();
562}
563
564
565// Finds the sample for each font, class pair that has least maximum
566// distance to all the other samples of the same font, class.
567// OrganizeByFontAndClass must have been already called.
569 bool debug) {
570 ASSERT_HOST(font_class_array_ != nullptr);
571 IntFeatureDist f_table;
572 if (debug) tprintf("feature table size %d\n", map.sparse_size());
573 f_table.Init(&map);
574 int worst_s1 = 0;
575 int worst_s2 = 0;
576 double global_worst_dist = 0.0;
577 // Compute distances independently for each font and char index.
578 int font_size = font_id_map_.CompactSize();
579 for (int font_index = 0; font_index < font_size; ++font_index) {
580 int font_id = font_id_map_.CompactToSparse(font_index);
581 for (int c = 0; c < unicharset_size_; ++c) {
582 int samples_found = 0;
583 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
584 if (fcinfo.samples.size() == 0 ||
585 (kTestChar >= 0 && c != kTestChar)) {
586 fcinfo.canonical_sample = -1;
587 fcinfo.canonical_dist = 0.0f;
588 if (debug) tprintf("Skipping class %d\n", c);
589 continue;
590 }
591 // The canonical sample will be the one with the min_max_dist, which
592 // is the sample with the lowest maximum distance to all other samples.
593 double min_max_dist = 2.0;
594 // We keep track of the farthest apart pair (max_s1, max_s2) which
595 // are max_max_dist apart, so we can see how bad the variability is.
596 double max_max_dist = 0.0;
597 int max_s1 = 0;
598 int max_s2 = 0;
599 fcinfo.canonical_sample = fcinfo.samples[0];
600 fcinfo.canonical_dist = 0.0f;
601 for (int i = 0; i < fcinfo.samples.size(); ++i) {
602 int s1 = fcinfo.samples[i];
603 const GenericVector<int>& features1 = samples_[s1]->indexed_features();
604 f_table.Set(features1, features1.size(), true);
605 double max_dist = 0.0;
606 // Run the full squared-order search for similar samples. It is still
607 // reasonably fast because f_table.FeatureDistance is fast, but we
608 // may have to reconsider if we start playing with too many samples
609 // of a single char/font.
610 for (int j = 0; j < fcinfo.samples.size(); ++j) {
611 int s2 = fcinfo.samples[j];
612 if (samples_[s2]->class_id() != c ||
613 samples_[s2]->font_id() != font_id ||
614 s2 == s1)
615 continue;
616 GenericVector<int> features2 = samples_[s2]->indexed_features();
617 double dist = f_table.FeatureDistance(features2);
618 if (dist > max_dist) {
619 max_dist = dist;
620 if (dist > max_max_dist) {
621 max_max_dist = dist;
622 max_s1 = s1;
623 max_s2 = s2;
624 }
625 }
626 }
627 // Using Set(..., false) is far faster than re initializing, due to
628 // the sparseness of the feature space.
629 f_table.Set(features1, features1.size(), false);
630 samples_[s1]->set_max_dist(max_dist);
631 ++samples_found;
632 if (max_dist < min_max_dist) {
633 fcinfo.canonical_sample = s1;
634 fcinfo.canonical_dist = max_dist;
635 }
636 UpdateRange(max_dist, &min_max_dist, &max_max_dist);
637 }
638 if (max_max_dist > global_worst_dist) {
639 // Keep a record of the worst pair over all characters/fonts too.
640 global_worst_dist = max_max_dist;
641 worst_s1 = max_s1;
642 worst_s2 = max_s2;
643 }
644 if (debug) {
645 tprintf("Found %d samples of class %d=%s, font %d, "
646 "dist range [%g, %g], worst pair= %s, %s\n",
647 samples_found, c, unicharset_.debug_str(c).string(),
648 font_index, min_max_dist, max_max_dist,
649 SampleToString(*samples_[max_s1]).string(),
650 SampleToString(*samples_[max_s2]).string());
651 }
652 }
653 }
654 if (debug) {
655 tprintf("Global worst dist = %g, between sample %d and %d\n",
656 global_worst_dist, worst_s1, worst_s2);
657 }
658}
659
660// Replicates the samples to a minimum frequency defined by
661// 2 * kSampleRandomSize, or for larger counts duplicates all samples.
662// After replication, the replicated samples are perturbed slightly, but
663// in a predictable and repeatable way.
664// Use after OrganizeByFontAndClass().
666 ASSERT_HOST(font_class_array_ != nullptr);
667 int font_size = font_id_map_.CompactSize();
668 for (int font_index = 0; font_index < font_size; ++font_index) {
669 for (int c = 0; c < unicharset_size_; ++c) {
670 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
671 int sample_count = fcinfo.samples.size();
672 int min_samples = 2 * std::max(kSampleRandomSize, sample_count);
673 if (sample_count > 0 && sample_count < min_samples) {
674 int base_count = sample_count;
675 for (int base_index = 0; sample_count < min_samples; ++sample_count) {
676 int src_index = fcinfo.samples[base_index++];
677 if (base_index >= base_count) base_index = 0;
678 TrainingSample* sample = samples_[src_index]->RandomizedCopy(
679 sample_count % kSampleRandomSize);
680 int sample_index = samples_.size();
681 sample->set_sample_index(sample_index);
682 samples_.push_back(sample);
683 fcinfo.samples.push_back(sample_index);
684 }
685 }
686 }
687 }
688}
689
690// Caches the indexed features of the canonical samples.
691// ComputeCanonicalSamples must have been already called.
692// TODO(rays) see note on ReliablySeparable and try restricting the
693// canonical features to those that truly represent all samples.
695 ASSERT_HOST(font_class_array_ != nullptr);
696 const int font_size = font_id_map_.CompactSize();
697 for (int font_index = 0; font_index < font_size; ++font_index) {
698 const int font_id = font_id_map_.CompactToSparse(font_index);
699 for (int c = 0; c < unicharset_size_; ++c) {
700 int num_samples = NumClassSamples(font_id, c, false);
701 if (num_samples == 0)
702 continue;
703 const TrainingSample* sample = GetCanonicalSample(font_id, c);
704 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
705 fcinfo.canonical_features = sample->indexed_features();
706 }
707 }
708}
709
710// Computes the combined set of features used by all the samples of each
711// font/class combination. Use after ReplicateAndRandomizeSamples.
712void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
713 ASSERT_HOST(font_class_array_ != nullptr);
714 int font_size = font_id_map_.CompactSize();
715 for (int font_index = 0; font_index < font_size; ++font_index) {
716 int font_id = font_id_map_.CompactToSparse(font_index);
717 for (int c = 0; c < unicharset_size_; ++c) {
718 int num_samples = NumClassSamples(font_id, c, false);
719 if (num_samples == 0)
720 continue;
721 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
722 fcinfo.cloud_features.Init(feature_space_size);
723 for (int s = 0; s < num_samples; ++s) {
724 const TrainingSample* sample = GetSample(font_id, c, s);
725 const GenericVector<int>& sample_features = sample->indexed_features();
726 for (int i = 0; i < sample_features.size(); ++i)
727 fcinfo.cloud_features.SetBit(sample_features[i]);
728 }
729 }
730 }
731}
732
733// Adds all fonts of the given class to the shape.
734void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
735 for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
736 const int font_id = font_id_map_.CompactToSparse(f);
737 shape->AddToShape(class_id, font_id);
738 }
739}
740
741// Display the samples with the given indexed feature that also match
742// the given shape.
744 const Shape& shape,
745 const IntFeatureSpace& space,
746 ScrollView::Color color,
747 ScrollView* window) const {
748 for (int s = 0; s < num_raw_samples(); ++s) {
749 const TrainingSample* sample = GetSample(s);
750 if (shape.ContainsUnichar(sample->class_id())) {
751 GenericVector<int> indexed_features;
752 space.IndexAndSortFeatures(sample->features(), sample->num_features(),
753 &indexed_features);
754 for (int f = 0; f < indexed_features.size(); ++f) {
755 if (indexed_features[f] == f_index) {
756 sample->DisplayFeatures(color, window);
757 }
758 }
759 }
760 }
761}
762
763
764} // namespace tesseract.
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
Definition: boxread.cpp:242
#define ASSERT_HOST(x)
Definition: errcode.h:88
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:120
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
const int kPrime2
const int kPrime1
const int kSquareLimit
const int kTestChar
int push_back(T object)
int size() const
Definition: genericvector.h:72
T & get(int index) const
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: matrix.h:198
bool SerializeClasses(FILE *fp) const
Definition: matrix.h:185
int size() const
Definition: bitvector.h:53
int CompactSize() const
Definition: indexmapbidi.h:61
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
void Init(int size, bool all_mapped)
void SetMap(int sparse_index, bool mapped)
int SparseSize() const override
Definition: indexmapbidi.h:142
bool Serialize(FILE *fp) const
bool DeSerialize(bool swap, FILE *fp)
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:464
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
void clear()
Definition: unicharset.h:306
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
Definition: cluster.h:32
double FeatureDistance(const GenericVector< int > &features) const
void Init(const IntFeatureMap *feature_map)
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
int OffsetFeature(int index_feature, int dir) const
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, GenericVector< int > *sorted_features) const
GenericVector< int32_t > font_ids
Definition: shapetable.h:174
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
STRING SampleToString(const TrainingSample &sample) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
int NumClassSamples(int font_id, int class_id, bool randomize) const
int AddSample(const char *unichar, TrainingSample *sample)
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
bool Serialize(FILE *fp) const
void AddAllFontsForClass(int class_id, Shape *shape) const
void IndexFeatures(const IntFeatureSpace &feature_space)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
void KillSample(TrainingSample *sample)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
void ComputeCloudFeatures(int feature_space_size)
bool DeSerialize(bool swap, FILE *fp)
void LoadUnicharset(const char *filename)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
int GlobalSampleIndex(int font_id, int class_id, int index) const
bool DeleteableSample(const TrainingSample *sample)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
float GetCanonicalDist(int font_id, int class_id) const
const TrainingSample * GetSample(int index) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
TrainingSample * MutableSample(int font_id, int class_id, int index)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)