tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::MasterTrainer Class Reference

#include <mastertrainer.h>

Public Member Functions

 MasterTrainer (NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
 
 ~MasterTrainer ()
 
bool Serialize (FILE *fp) const
 
void LoadUnicharset (const char *filename)
 
void SetFeatureSpace (const IntFeatureSpace &fs)
 
void ReadTrainingSamples (const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
 
void AddSample (bool verification, const char *unichar_str, TrainingSample *sample)
 
void LoadPageImages (const char *filename)
 
void PostLoadCleanup ()
 
void PreTrainingSetup ()
 
void SetupMasterShapes ()
 
void IncludeJunk ()
 
void ReplicateAndRandomizeSamplesIfRequired ()
 
bool LoadFontInfo (const char *filename)
 
bool LoadXHeights (const char *filename)
 
bool AddSpacingInfo (const char *filename)
 
int GetFontInfoId (const char *font_name)
 
int GetBestMatchingFontInfoId (const char *filename)
 
const STRING & GetTRFileName (int index) const
 
void SetupFlatShapeTable (ShapeTable *shape_table)
 
CLUSTERER * SetupForClustering (const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
 
void WriteInttempAndPFFMTable (const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
 
const UNICHARSET & unicharset () const
 
TrainingSampleSet * GetSamples ()
 
const ShapeTable & master_shapes () const
 
void DebugCanonical (const char *unichar_str1, const char *unichar_str2)
 
void DisplaySamples (const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
 
void TestClassifierVOld (bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
 
void TestClassifierOnSamples (CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
 
double TestClassifier (CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
 
float ShapeDistance (const ShapeTable &shapes, int s1, int s2)
 

Detailed Description

Definition at line 69 of file mastertrainer.h.

Constructor & Destructor Documentation

◆ MasterTrainer()

tesseract::MasterTrainer::MasterTrainer ( NormalizationMode  norm_mode,
bool  shape_analysis,
bool  replicate_samples,
int  debug_level 
)

Definition at line 51 of file mastertrainer.cpp.

55 : norm_mode_(norm_mode), samples_(fontinfo_table_),
56 junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
57 charsetsize_(0),
58 enable_shape_analysis_(shape_analysis),
59 enable_replication_(replicate_samples),
60 fragments_(nullptr), prev_unichar_id_(-1), debug_level_(debug_level) {
61}

◆ ~MasterTrainer()

tesseract::MasterTrainer::~MasterTrainer ( )

Definition at line 63 of file mastertrainer.cpp.

63 {
64 delete [] fragments_;
65 for (int p = 0; p < page_images_.size(); ++p)
66 pixDestroy(&page_images_[p]);
67}
int size() const
Definition: genericvector.h:72

Member Function Documentation

◆ AddSample()

void tesseract::MasterTrainer::AddSample ( bool  verification,
const char *  unichar_str,
TrainingSample * sample 
)

Definition at line 163 of file mastertrainer.cpp.

164 {
165 if (verification) {
166 verify_samples_.AddSample(unichar, sample);
167 prev_unichar_id_ = -1;
168 } else if (unicharset_.contains_unichar(unichar)) {
169 if (prev_unichar_id_ >= 0)
170 fragments_[prev_unichar_id_] = -1;
171 prev_unichar_id_ = samples_.AddSample(unichar, sample);
172 if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
173 flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
174 } else {
175 const int junk_id = junk_samples_.AddSample(unichar, sample);
176 if (prev_unichar_id_ >= 0) {
177 CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar);
178 if (frag != nullptr && frag->is_natural()) {
179 if (fragments_[prev_unichar_id_] == 0)
180 fragments_[prev_unichar_id_] = junk_id;
181 else if (fragments_[prev_unichar_id_] != junk_id)
182 fragments_[prev_unichar_id_] = -1;
183 }
184 delete frag;
185 }
186 prev_unichar_id_ = -1;
187 }
188}
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool is_natural() const
Definition: unicharset.h:113
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
Definition: cluster.h:32
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:386
int AddSample(const char *unichar, TrainingSample *sample)

◆ AddSpacingInfo()

bool tesseract::MasterTrainer::AddSpacingInfo ( const char *  filename)

Definition at line 412 of file mastertrainer.cpp.

412 {
413 FILE* fontinfo_file = fopen(filename, "rb");
414 if (fontinfo_file == nullptr)
415 return true; // We silently ignore missing files!
416 // Find the fontinfo_id.
417 int fontinfo_id = GetBestMatchingFontInfoId(filename);
418 if (fontinfo_id < 0) {
419 tprintf("No font found matching fontinfo filename %s\n", filename);
420 fclose(fontinfo_file);
421 return false;
422 }
423 tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
424 // TODO(rays) scale should probably be a double, but keep as an int for now
425 // to duplicate current behavior.
426 int scale = kBlnXHeight / xheights_[fontinfo_id];
427 int num_unichars;
428 char uch[UNICHAR_LEN];
429 char kerned_uch[UNICHAR_LEN];
430 int x_gap, x_gap_before, x_gap_after, num_kerned;
431 ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
432 FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
433 fi->init_spacing(unicharset_.size());
434 FontSpacingInfo *spacing = nullptr;
435 for (int l = 0; l < num_unichars; ++l) {
436 if (tfscanf(fontinfo_file, "%s %d %d %d",
437 uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
438 tprintf("Bad format of font spacing file %s\n", filename);
439 fclose(fontinfo_file);
440 return false;
441 }
442 bool valid = unicharset_.contains_unichar(uch);
443 if (valid) {
444 spacing = new FontSpacingInfo();
445 spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);
446 spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);
447 }
448 for (int k = 0; k < num_kerned; ++k) {
449 if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
450 tprintf("Bad format of font spacing file %s\n", filename);
451 fclose(fontinfo_file);
452 delete spacing;
453 return false;
454 }
455 if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
456 spacing->kerned_unichar_ids.push_back(
457 unicharset_.unichar_to_id(kerned_uch));
458 spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));
459 }
460 if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
461 }
462 fclose(fontinfo_file);
463 return true;
464}
const int kBlnXHeight
Definition: normalis.h:24
#define ASSERT_HOST(x)
Definition: errcode.h:88
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:181
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define UNICHAR_LEN
Definition: unichar.h:30
T & get(int index) const
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int GetBestMatchingFontInfoId(const char *filename)

◆ DebugCanonical()

void tesseract::MasterTrainer::DebugCanonical ( const char *  unichar_str1,
const char *  unichar_str2 
)

Definition at line 636 of file mastertrainer.cpp.

637 {
638 int class_id1 = unicharset_.unichar_to_id(unichar_str1);
639 int class_id2 = unicharset_.unichar_to_id(unichar_str2);
640 if (class_id2 == INVALID_UNICHAR_ID)
641 class_id2 = class_id1;
642 if (class_id1 == INVALID_UNICHAR_ID) {
643 tprintf("No unicharset entry found for %s\n", unichar_str1);
644 return;
645 } else {
646 tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
647 class_id1, unichar_str1, class_id2, unichar_str2);
648 }
649 int num_fonts = samples_.NumFonts();
650 const IntFeatureMap& feature_map = feature_map_;
651 // Iterate the fonts to get the similarity with other fonts of the same
652 // class.
653 tprintf(" ");
654 for (int f = 0; f < num_fonts; ++f) {
655 if (samples_.NumClassSamples(f, class_id2, false) == 0)
656 continue;
657 tprintf("%6d", f);
658 }
659 tprintf("\n");
660 for (int f1 = 0; f1 < num_fonts; ++f1) {
661 // Map the features of the canonical_sample.
662 if (samples_.NumClassSamples(f1, class_id1, false) == 0)
663 continue;
664 tprintf("%4d ", f1);
665 for (int f2 = 0; f2 < num_fonts; ++f2) {
666 if (samples_.NumClassSamples(f2, class_id2, false) == 0)
667 continue;
668 float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
669 feature_map);
670 tprintf(" %5.3f", dist);
671 }
672 tprintf("\n");
673 }
674 // Build a fake ShapeTable containing all the sample types.
675 ShapeTable shapes(unicharset_);
676 for (int f = 0; f < num_fonts; ++f) {
677 if (samples_.NumClassSamples(f, class_id1, true) > 0)
678 shapes.AddShape(class_id1, f);
679 if (class_id1 != class_id2 &&
680 samples_.NumClassSamples(f, class_id2, true) > 0)
681 shapes.AddShape(class_id2, f);
682 }
683}
int NumClassSamples(int font_id, int class_id, bool randomize) const
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)

◆ DisplaySamples()

void tesseract::MasterTrainer::DisplaySamples ( const char *  unichar_str1,
int  cloud_font,
const char *  unichar_str2,
int  canonical_font 
)

Definition at line 696 of file mastertrainer.cpp.

698 {
699 const IntFeatureMap& feature_map = feature_map_;
700 const IntFeatureSpace& feature_space = feature_map.feature_space();
701 ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
702 ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
703 f_window);
704 int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
705 if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
706 const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
707 class_id2);
708 for (uint32_t f = 0; f < sample->num_features(); ++f) {
709 RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
710 }
711 }
712 int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
713 if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
714 const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
715 for (int f = 0; f < cloud.size(); ++f) {
716 if (cloud[f]) {
717 INT_FEATURE_STRUCT feature =
718 feature_map.InverseIndexFeature(f);
719 RenderIntFeature(f_window, &feature, ScrollView::GREEN);
720 }
721 }
722 }
723 f_window->Update();
724 ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
725 SVEventType ev_type;
726 do {
727 SVEvent* ev;
728 // Wait until a click or popup event.
729 ev = f_window->AwaitEvent(SVET_ANY);
730 ev_type = ev->type;
731 if (ev_type == SVET_CLICK) {
732 int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
733 if (feature_index >= 0) {
734 // Iterate samples and display those with the feature.
735 Shape shape;
736 shape.AddToShape(class_id1, cloud_font);
737 s_window->Clear();
738 samples_.DisplaySamplesWithFeature(feature_index, shape,
739 feature_space, ScrollView::GREEN,
740 s_window);
741 s_window->Update();
742 }
743 }
744 delete ev;
745 } while (ev_type != SVET_DESTROY);
746}
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1763
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1602
@ baseline
Definition: mfoutline.h:63
@ character
Definition: mfoutline.h:63
SVEventType
Definition: scrollview.h:45
@ SVET_CLICK
Definition: scrollview.h:48
@ SVET_ANY
Definition: scrollview.h:56
@ SVET_DESTROY
Definition: scrollview.h:46
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
@ NM_BASELINE
Definition: normalis.h:43
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
const UNICHARSET & unicharset() const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
int x
Definition: scrollview.h:67
SVEventType type
Definition: scrollview.h:64
int y
Definition: scrollview.h:68
static void Update()
Definition: scrollview.cpp:709
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
void Clear()
Definition: scrollview.cpp:589

◆ GetBestMatchingFontInfoId()

int tesseract::MasterTrainer::GetBestMatchingFontInfoId ( const char *  filename)

Definition at line 479 of file mastertrainer.cpp.

479 {
480 int fontinfo_id = -1;
481 int best_len = 0;
482 for (int f = 0; f < fontinfo_table_.size(); ++f) {
483 if (strstr(filename, fontinfo_table_.get(f).name) != nullptr) {
484 int len = strlen(fontinfo_table_.get(f).name);
485 // Use the longest matching length in case a substring of a font matched.
486 if (len > best_len) {
487 best_len = len;
488 fontinfo_id = f;
489 }
490 }
491 }
492 return fontinfo_id;
493}

◆ GetFontInfoId()

int tesseract::MasterTrainer::GetFontInfoId ( const char *  font_name)

Definition at line 468 of file mastertrainer.cpp.

468 {
469 FontInfo fontinfo;
470 // We are only borrowing the string, so it is OK to const cast it.
471 fontinfo.name = const_cast<char*>(font_name);
472 fontinfo.properties = 0; // Not used to lookup in the table
473 fontinfo.universal_id = 0;
474 return fontinfo_table_.get_index(fontinfo);
475}
int get_index(const T &object) const

◆ GetSamples()

TrainingSampleSet * tesseract::MasterTrainer::GetSamples ( )
inline

Definition at line 189 of file mastertrainer.h.

189 {
190 return &samples_;
191 }

◆ GetTRFileName()

const STRING & tesseract::MasterTrainer::GetTRFileName ( int  index) const
inline

Definition at line 162 of file mastertrainer.h.

162 {
163 return tr_filenames_[index];
164 }

◆ IncludeJunk()

void tesseract::MasterTrainer::IncludeJunk ( )

Definition at line 295 of file mastertrainer.cpp.

295 {
296 // Get ids of fragments in junk_samples_ that replace the dead chars.
297 const UNICHARSET& junk_set = junk_samples_.unicharset();
298 const UNICHARSET& sample_set = samples_.unicharset();
299 int num_junks = junk_samples_.num_samples();
300 tprintf("Moving %d junk samples to master sample set.\n", num_junks);
301 for (int s = 0; s < num_junks; ++s) {
302 TrainingSample* sample = junk_samples_.mutable_sample(s);
303 int junk_id = sample->class_id();
304 const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
305 int sample_id = sample_set.unichar_to_id(junk_utf8);
306 if (sample_id == INVALID_UNICHAR_ID)
307 sample_id = 0;
308 sample->set_class_id(sample_id);
309 junk_samples_.extract_sample(s);
310 samples_.AddSample(sample_id, sample);
311 }
312 junk_samples_.DeleteDeadSamples();
313 samples_.OrganizeByFontAndClass();
314}
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
TrainingSample * extract_sample(int index)
TrainingSample * mutable_sample(int index)

◆ LoadFontInfo()

bool tesseract::MasterTrainer::LoadFontInfo ( const char *  filename)

Definition at line 333 of file mastertrainer.cpp.

333 {
334 FILE* fp = fopen(filename, "rb");
335 if (fp == nullptr) {
336 fprintf(stderr, "Failed to load font_properties from %s\n", filename);
337 return false;
338 }
339 int italic, bold, fixed, serif, fraktur;
340 while (!feof(fp)) {
341 FontInfo fontinfo;
342 char* font_name = new char[1024];
343 fontinfo.name = font_name;
344 fontinfo.properties = 0;
345 fontinfo.universal_id = 0;
346 if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
347 &fixed, &serif, &fraktur) != 6) {
348 delete[] font_name;
349 continue;
350 }
351 fontinfo.properties =
352 (italic << 0) +
353 (bold << 1) +
354 (fixed << 2) +
355 (serif << 3) +
356 (fraktur << 4);
357 if (!fontinfo_table_.contains(fontinfo)) {
358 fontinfo_table_.push_back(fontinfo);
359 } else {
360 delete[] font_name;
361 }
362 }
363 fclose(fp);
364 return true;
365}
int push_back(T object)
bool contains(const T &object) const

◆ LoadPageImages()

void tesseract::MasterTrainer::LoadPageImages ( const char *  filename)

Definition at line 193 of file mastertrainer.cpp.

193 {
194 size_t offset = 0;
195 int page;
196 Pix* pix;
197 for (page = 0;; page++) {
198 pix = pixReadFromMultipageTiff(filename, &offset);
199 if (!pix) break;
200 page_images_.push_back(pix);
201 if (!offset) break;
202 }
203 tprintf("Loaded %d page images from %s\n", page, filename);
204}

◆ LoadUnicharset()

void tesseract::MasterTrainer::LoadUnicharset ( const char *  filename)

Definition at line 88 of file mastertrainer.cpp.

88 {
89 if (!unicharset_.load_from_file(filename)) {
90 tprintf("Failed to load unicharset from file %s\n"
91 "Building unicharset for training from scratch...\n",
92 filename);
93 unicharset_.clear();
94 UNICHARSET initialized;
95 // Add special characters, as they were removed by the clear, but the
96 // default constructor puts them in.
97 unicharset_.AppendOtherUnicharset(initialized);
98 }
99 charsetsize_ = unicharset_.size();
100 delete [] fragments_;
101 fragments_ = new int[charsetsize_];
102 memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
103 samples_.LoadUnicharset(filename);
104 junk_samples_.LoadUnicharset(filename);
105 verify_samples_.LoadUnicharset(filename);
106}
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:464
void clear()
Definition: unicharset.h:306
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
void LoadUnicharset(const char *filename)

◆ LoadXHeights()

bool tesseract::MasterTrainer::LoadXHeights ( const char *  filename)

Definition at line 369 of file mastertrainer.cpp.

369 {
370 tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
371 xheights_.init_to_size(fontinfo_table_.size(), -1);
372 if (filename == nullptr) return true;
373 FILE *f = fopen(filename, "rb");
374 if (f == nullptr) {
375 fprintf(stderr, "Failed to load font xheights from %s\n", filename);
376 return false;
377 }
378 tprintf("Reading x-heights from %s ...\n", filename);
379 FontInfo fontinfo;
380 fontinfo.properties = 0; // Not used to lookup in the table.
381 fontinfo.universal_id = 0;
382 char buffer[1024];
383 int xht;
384 int total_xheight = 0;
385 int xheight_count = 0;
386 while (!feof(f)) {
387 if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2)
388 continue;
389 buffer[1023] = '\0';
390 fontinfo.name = buffer;
391 if (!fontinfo_table_.contains(fontinfo)) continue;
392 int fontinfo_id = fontinfo_table_.get_index(fontinfo);
393 xheights_[fontinfo_id] = xht;
394 total_xheight += xht;
395 ++xheight_count;
396 }
397 if (xheight_count == 0) {
398 fprintf(stderr, "No valid xheights in %s!\n", filename);
399 fclose(f);
400 return false;
401 }
402 int mean_xheight = DivRounded(total_xheight, xheight_count);
403 for (int i = 0; i < fontinfo_table_.size(); ++i) {
404 if (xheights_[i] < 0)
405 xheights_[i] = mean_xheight;
406 }
407 fclose(f);
408 return true;
409} // LoadXHeights
int DivRounded(int a, int b)
Definition: helpers.h:167
void init_to_size(int size, const T &t)

◆ master_shapes()

const ShapeTable & tesseract::MasterTrainer::master_shapes ( ) const
inline

Definition at line 192 of file mastertrainer.h.

192 {
193 return master_shapes_;
194 }

◆ PostLoadCleanup()

void tesseract::MasterTrainer::PostLoadCleanup ( )

Definition at line 211 of file mastertrainer.cpp.

211 {
212 if (debug_level_ > 0)
213 tprintf("PostLoadCleanup...\n");
214 if (enable_shape_analysis_)
215 ReplaceFragmentedSamples();
216 SampleIterator sample_it;
217 sample_it.Init(nullptr, nullptr, true, &verify_samples_);
218 sample_it.NormalizeSamples();
219 verify_samples_.OrganizeByFontAndClass();
220
221 samples_.IndexFeatures(feature_space_);
222 // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
223 // against current training.
224 // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
225 samples_.OrganizeByFontAndClass();
226 if (debug_level_ > 0)
227 tprintf("ComputeCanonicalSamples...\n");
228 samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
229}
void IndexFeatures(const IntFeatureSpace &feature_space)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)

◆ PreTrainingSetup()

void tesseract::MasterTrainer::PreTrainingSetup ( )

Definition at line 234 of file mastertrainer.cpp.

234 {
235 if (debug_level_ > 0)
236 tprintf("PreTrainingSetup...\n");
237 samples_.IndexFeatures(feature_space_);
238 samples_.ComputeCanonicalFeatures();
239 if (debug_level_ > 0)
240 tprintf("ComputeCloudFeatures...\n");
241 samples_.ComputeCloudFeatures(feature_space_.Size());
242}
void ComputeCloudFeatures(int feature_space_size)

◆ ReadTrainingSamples()

void tesseract::MasterTrainer::ReadTrainingSamples ( const char *  page_name,
const FEATURE_DEFS_STRUCT & feature_defs,
bool  verification 
)

Definition at line 112 of file mastertrainer.cpp.

114 {
115 char buffer[2048];
116 const int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
117 const int micro_feature_type = ShortNameToFeatureType(feature_defs,
118 kMicroFeatureType);
119 const int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
120 const int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
121
122 FILE* fp = fopen(page_name, "rb");
123 if (fp == nullptr) {
124 tprintf("Failed to open tr file: %s\n", page_name);
125 return;
126 }
127 tr_filenames_.push_back(STRING(page_name));
128 while (fgets(buffer, sizeof(buffer), fp) != nullptr) {
129 if (buffer[0] == '\n')
130 continue;
131
132 char* space = strchr(buffer, ' ');
133 if (space == nullptr) {
134 tprintf("Bad format in tr file, reading fontname, unichar\n");
135 continue;
136 }
137 *space++ = '\0';
138 int font_id = GetFontInfoId(buffer);
139 if (font_id < 0) font_id = 0;
140 int page_number;
141 STRING unichar;
142 TBOX bounding_box;
143 if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
144 tprintf("Bad format in tr file, reading box coords\n");
145 continue;
146 }
147 CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
148 auto* sample = new TrainingSample;
149 sample->set_font_id(font_id);
150 sample->set_page_num(page_number + page_images_.size());
151 sample->set_bounding_box(bounding_box);
152 sample->ExtractCharDesc(int_feature_type, micro_feature_type,
153 cn_feature_type, geo_feature_type, char_desc);
154 AddSample(verification, unichar.string(), sample);
155 FreeCharDescription(char_desc);
156 }
157 charsetsize_ = unicharset_.size();
158 fclose(fp);
159}
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:174
const char *const kIntFeatureType
Definition: featdefs.cpp:34
const char *const kGeoFeatureType
Definition: featdefs.cpp:35
const char *const kCNFeatureType
Definition: featdefs.cpp:33
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:270
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:236
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
const char *const kMicroFeatureType
Definition: featdefs.cpp:32
FEATURE_DEFS_STRUCT feature_defs
Definition: rect.h:34
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
int GetFontInfoId(const char *font_name)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)

◆ ReplicateAndRandomizeSamplesIfRequired()

void tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired ( )

Definition at line 321 of file mastertrainer.cpp.

321 {
322 if (enable_replication_) {
323 if (debug_level_ > 0)
324 tprintf("ReplicateAndRandomize...\n");
325 verify_samples_.ReplicateAndRandomizeSamples();
326 samples_.ReplicateAndRandomizeSamples();
327 samples_.IndexFeatures(feature_space_);
328 }
329}

◆ Serialize()

bool tesseract::MasterTrainer::Serialize ( FILE *  fp) const

Definition at line 72 of file mastertrainer.cpp.

72 {
73 uint32_t value = norm_mode_;
74 if (!tesseract::Serialize(fp, &value)) return false;
75 if (!unicharset_.save_to_file(fp)) return false;
76 if (!feature_space_.Serialize(fp)) return false;
77 if (!samples_.Serialize(fp)) return false;
78 if (!junk_samples_.Serialize(fp)) return false;
79 if (!verify_samples_.Serialize(fp)) return false;
80 if (!master_shapes_.Serialize(fp)) return false;
81 if (!flat_shapes_.Serialize(fp)) return false;
82 if (!fontinfo_table_.Serialize(fp)) return false;
83 if (!xheights_.Serialize(fp)) return false;
84 return true;
85}
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
bool Serialize(FILE *fp) const
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:48
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
bool Serialize(FILE *fp) const
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
bool Serialize(FILE *fp) const

◆ SetFeatureSpace()

void tesseract::MasterTrainer::SetFeatureSpace ( const IntFeatureSpace & fs)
inline

Definition at line 82 of file mastertrainer.h.

82 {
83 feature_space_ = fs;
84 feature_map_.Init(fs);
85 }
void Init(const IntFeatureSpace &feature_space)

◆ SetupFlatShapeTable()

void tesseract::MasterTrainer::SetupFlatShapeTable ( ShapeTable * shape_table)

Definition at line 496 of file mastertrainer.cpp.

496 {
497 // To exactly mimic the results of the previous implementation, the shapes
498 // must be clustered in order the fonts arrived, and reverse order of the
499 // characters within each font.
500 // Get a list of the fonts in the order they appeared.
501 GenericVector<int> active_fonts;
502 int num_shapes = flat_shapes_.NumShapes();
503 for (int s = 0; s < num_shapes; ++s) {
504 int font = flat_shapes_.GetShape(s)[0].font_ids[0];
505 int f = 0;
506 for (f = 0; f < active_fonts.size(); ++f) {
507 if (active_fonts[f] == font)
508 break;
509 }
510 if (f == active_fonts.size())
511 active_fonts.push_back(font);
512 }
513 // For each font in order, add all the shapes with that font in reverse order.
514 int num_fonts = active_fonts.size();
515 for (int f = 0; f < num_fonts; ++f) {
516 for (int s = num_shapes - 1; s >= 0; --s) {
517 int font = flat_shapes_.GetShape(s)[0].font_ids[0];
518 if (font == active_fonts[f]) {
519 shape_table->AddShape(flat_shapes_.GetShape(s));
520 }
521 }
522 }
523}
void AddShape(const Shape &other)
Definition: shapetable.cpp:120
int NumShapes() const
Definition: shapetable.h:274
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319

◆ SetupForClustering()

CLUSTERER * tesseract::MasterTrainer::SetupForClustering ( const ShapeTable & shape_table,
const FEATURE_DEFS_STRUCT & feature_defs,
int  shape_id,
int *  num_samples 
)

Definition at line 527 of file mastertrainer.cpp.

531 {
532
533 int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
534 int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
535 ASSERT_HOST(num_params == MFCount);
536 CLUSTERER* clusterer = MakeClusterer(
537 num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
538
539 // We want to iterate over the samples of just the one shape.
540 IndexMapBiDi shape_map;
541 shape_map.Init(shape_table.NumShapes(), false);
542 shape_map.SetMap(shape_id, true);
543 shape_map.Setup();
544 // Reverse the order of the samples to match the previous behavior.
545 GenericVector<const TrainingSample*> sample_ptrs;
546 SampleIterator it;
547 it.Init(&shape_map, &shape_table, false, &samples_);
548 for (it.Begin(); !it.AtEnd(); it.Next()) {
549 sample_ptrs.push_back(&it.GetSample());
550 }
551 int sample_id = 0;
552 for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
553 const TrainingSample* sample = sample_ptrs[i];
554 uint32_t num_features = sample->num_micro_features();
555 for (uint32_t f = 0; f < num_features; ++f)
556 MakeSample(clusterer, sample->micro_features()[f], sample_id);
557 ++sample_id;
558 }
559 *num_samples = sample_id;
560 return clusterer;
561}
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition: cluster.cpp:429
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:376
@ MFCount
Definition: mf.h:30
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:47
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:55

◆ SetupMasterShapes()

void tesseract::MasterTrainer::SetupMasterShapes ( )

Definition at line 246 of file mastertrainer.cpp.

246 {
247 tprintf("Building master shape table\n");
248 const int num_fonts = samples_.NumFonts();
249
250 ShapeTable char_shapes_begin_fragment(samples_.unicharset());
251 ShapeTable char_shapes_end_fragment(samples_.unicharset());
252 ShapeTable char_shapes(samples_.unicharset());
253 for (int c = 0; c < samples_.charsetsize(); ++c) {
254 ShapeTable shapes(samples_.unicharset());
255 for (int f = 0; f < num_fonts; ++f) {
256 if (samples_.NumClassSamples(f, c, true) > 0)
257 shapes.AddShape(c, f);
258 }
259 ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
260
261 const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
262
263 if (fragment == nullptr)
264 char_shapes.AppendMasterShapes(shapes, nullptr);
265 else if (fragment->is_beginning())
266 char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);
267 else if (fragment->is_ending())
268 char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);
269 else
270 char_shapes.AppendMasterShapes(shapes, nullptr);
271 }
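272 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
273 kFontMergeDistance, &char_shapes_begin_fragment);
274 char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);
275 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
276 kFontMergeDistance, &char_shapes_end_fragment);
277 char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);
278 ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
279 kFontMergeDistance, &char_shapes);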
280 master_shapes_.AppendMasterShapes(char_shapes, nullptr);
281 tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
282}
const float kFontMergeDistance
const int kMinClusteredShapes
const int kMaxUnicharsPerCluster
bool is_beginning() const
Definition: unicharset.h:105
bool is_ending() const
Definition: unicharset.h:108
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
STRING SummaryStr() const
Definition: shapetable.cpp:313
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:656

◆ ShapeDistance()

float tesseract::MasterTrainer::ShapeDistance ( const ShapeTable & shapes,
int  s1,
int  s2 
)

Definition at line 810 of file mastertrainer.cpp.

810 {
811 const IntFeatureMap& feature_map = feature_map_;
812 const Shape& shape1 = shapes.GetShape(s1);
813 const Shape& shape2 = shapes.GetShape(s2);
814 int num_chars1 = shape1.size();
815 int num_chars2 = shape2.size();
816 float dist_sum = 0.0f;
817 int dist_count = 0;
818 if (num_chars1 > 1 || num_chars2 > 1) {
819 // In the multi-char case try to optimize the calculation by computing
820 // distances between characters of matching font where possible.
821 for (int c1 = 0; c1 < num_chars1; ++c1) {
822 for (int c2 = 0; c2 < num_chars2; ++c2) {
823 dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
824 true, feature_map);
825 ++dist_count;
826 }
827 }
828 } else {
829 // In the single unichar case, there is little alternative, but to compute
830 // the squared-order distance between pairs of fonts.
831 dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
832 false, feature_map);
833 ++dist_count;
834 }
835 return dist_sum / dist_count;
836}
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)

◆ TestClassifier()

double tesseract::MasterTrainer::TestClassifier ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
TrainingSampleSet *  samples,
ShapeClassifier *  test_classifier,
STRING *  report_string 
)

Definition at line 783 of file mastertrainer.cpp.

788 {
789 SampleIterator sample_it;
790 sample_it.Init(nullptr, nullptr, replicate_samples, samples);
791 if (report_level > 0) {
792 int num_samples = 0;
793 for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
794 ++num_samples;
795 tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
796 sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
797 test_classifier->GetShapeTable()->NumShapes(), num_samples);
798 tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
799 }
800 double unichar_error = 0.0;
801 ErrorCounter::ComputeErrorRate(test_classifier, report_level,
802 error_mode, fontinfo_table_,
803 page_images_, &sample_it, &unichar_error,
804 nullptr, report_string);
805 return unichar_error;
806}
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)

◆ TestClassifierOnSamples()

void tesseract::MasterTrainer::TestClassifierOnSamples ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
ShapeClassifier *  test_classifier,
STRING *  report_string 
)

Definition at line 761 of file mastertrainer.cpp.

765 {
766 TestClassifier(error_mode, report_level, replicate_samples, &samples_,
767 test_classifier, report_string);
768}
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)

◆ TestClassifierVOld()

void tesseract::MasterTrainer::TestClassifierVOld ( bool  replicate_samples,
ShapeClassifier *  test_classifier,
ShapeClassifier *  old_classifier 
)

Definition at line 749 of file mastertrainer.cpp.

751 {
752 SampleIterator sample_it;
753 sample_it.Init(nullptr, nullptr, replicate_samples, &samples_);
754 ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
755 CT_UNICHAR_TOPN_ERR, fontinfo_table_,
756 page_images_, &sample_it);
757}
@ CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)

◆ unicharset()

const UNICHARSET & tesseract::MasterTrainer::unicharset ( ) const
inline

Definition at line 186 of file mastertrainer.h.

186 {
187 return samples_.unicharset();
188 }

◆ WriteInttempAndPFFMTable()

void tesseract::MasterTrainer::WriteInttempAndPFFMTable ( const UNICHARSET &  unicharset,
const UNICHARSET &  shape_set,
const ShapeTable &  shape_table,
CLASS_STRUCT *  float_classes,
const char *  inttemp_file,
const char *  pffmtable_file 
)

Definition at line 567 of file mastertrainer.cpp.

572 {
573 auto *classify = new tesseract::Classify();
574 // Move the fontinfo table to classify.
575 fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
576 INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
577 shape_set);
578 FILE* fp = fopen(inttemp_file, "wb");
579 if (fp == nullptr) {
580 tprintf("Error, failed to open file \"%s\"\n", inttemp_file);
581 } else {
582 classify->WriteIntTemplates(fp, int_templates, shape_set);
583 fclose(fp);
584 }
585 // Now write pffmtable. This is complicated by the fact that the adaptive
586 // classifier still wants one indexed by unichar-id, but the static
587 // classifier needs one indexed by its shape class id.
588 // We put the shapetable_cutoffs in a GenericVector, and compute the
589 // unicharset cutoffs along the way.
590 GenericVector<uint16_t> shapetable_cutoffs;
591 GenericVector<uint16_t> unichar_cutoffs;
592 for (int c = 0; c < unicharset.size(); ++c)
593 unichar_cutoffs.push_back(0);
594 /* then write out each class */
595 for (int i = 0; i < int_templates->NumClasses; ++i) {
596 INT_CLASS Class = ClassForClassId(int_templates, i);
597 // Todo: Test with min instead of max
598 // int MaxLength = LengthForConfigId(Class, 0);
599 uint16_t max_length = 0;
600 for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
601 // Todo: Test with min instead of max
602 // if (LengthForConfigId (Class, config_id) < MaxLength)
603 uint16_t length = Class->ConfigLengths[config_id];
604 if (length > max_length)
605 max_length = Class->ConfigLengths[config_id];
606 int shape_id = float_classes[i].font_set.get(config_id);
607 const Shape& shape = shape_table.GetShape(shape_id);
608 for (int c = 0; c < shape.size(); ++c) {
609 int unichar_id = shape[c].unichar_id;
610 if (length > unichar_cutoffs[unichar_id])
611 unichar_cutoffs[unichar_id] = length;
612 }
613 }
614 shapetable_cutoffs.push_back(max_length);
615 }
616 fp = fopen(pffmtable_file, "wb");
617 if (fp == nullptr) {
618 tprintf("Error, failed to open file \"%s\"\n", pffmtable_file);
619 } else {
620 shapetable_cutoffs.Serialize(fp);
621 for (int c = 0; c < unicharset.size(); ++c) {
622 const char *unichar = unicharset.id_to_unichar(c);
623 if (strcmp(unichar, " ") == 0) {
624 unichar = "NULL";
625 }
626 fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
627 }
628 fclose(fp);
629 }
630 free_int_templates(int_templates);
631 delete classify;
632}
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:698
#define ClassForClassId(T, c)
Definition: intproto.h:178
const T & get(int id) const
Return the object from an id.
void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:105
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
uint8_t NumConfigs
Definition: intproto.h:108
UnicityTableEqEq< int > font_set
Definition: protos.h:61
const UNICHARSET & unicharset() const

The documentation for this class was generated from the following files: