tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

Public Member Functions

 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Static Public Member Functions

static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 

Public Attributes

bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_ambigs_for_adaption = false
 

Protected Attributes

IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 

Detailed Description

Definition at line 103 of file classify.h.

Constructor & Destructor Documentation

◆ Classify()

tesseract::Classify::Classify ( )

Definition at line 60 of file classify.cpp.

61 : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
62 this->params()),
64 "Prioritize blob division over chopping", this->params()),
65 BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
66 this->params()),
67 INT_MEMBER(classify_debug_level, 0, "Classify debug level",
68 this->params()),
69 INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
70 this->params()),
72 "Character Normalization Range ...", this->params()),
74 "Veto ratio between classifier ratings", this->params()),
76 "Veto difference between classifier certainties",
77 this->params()),
78 BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
79 this->params()),
80 BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
81 this->params()),
83 "Enable adaptive classifier", this->params()),
85 "Use pre-adapted classifier templates", this->params()),
87 "Save adapted templates to a file", this->params()),
88 BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
89 this->params()),
91 "Non-linear stroke-density normalization", this->params()),
92 INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
93 INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
94 INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
95 this->params()),
96 double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
97 this->params()),
98 double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
99 this->params()),
100 double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
101 this->params()),
102 double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
103 this->params()),
104 double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
105 this->params()),
106 double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
107 this->params()),
108 INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
109 this->params()),
111 "Reliable Config Threshold", this->params()),
113 "Enable adaption even if the ambiguities have not been seen",
114 this->params()),
116 "Maximum angle delta for prototype clustering",
117 this->params()),
119 "Penalty to apply when a non-alnum is vertically out of "
120 "its expected textline position",
121 this->params()),
122 double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
123 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
124 this->params()),
126 "Scale factor for features not used", this->params()),
129 "Prune poor adapted results this much worse than best result",
130 this->params()),
132 "Threshold at which classify_adapted_pruning_factor starts",
133 this->params()),
135 "Threshold for good protos during adaptive 0-255",
136 this->params()),
138 "Threshold for good features during adaptive 0-255",
139 this->params()),
141 "Do not include character fragments in the"
142 " results of the classifier",
143 this->params()),
145 -3.0,
146 "Exclude fragments that do not look like whole"
147 " characters from training and adaption",
148 this->params()),
150 "Bring up graphical debugging windows for fragments training",
151 this->params()),
153 "Use two different windows for debugging the matching: "
154 "One for the protos and one for the features.",
155 this->params()),
156 STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
157 this->params()),
159 "Class Pruner Threshold 0-255", this->params()),
161 "Class Pruner Multiplier 0-255: ", this->params()),
163 "Class Pruner CutoffStrength: ", this->params()),
165 "Integer Matcher Multiplier 0-255: ", this->params()),
167 "Assume the input is numbers [0-9].", this->params()),
168 double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
169 this->params()),
171 "Penalty to add to worst rating for noise", this->params()),
173 shape_table_(nullptr),
174 dict_(this),
175 static_classifier_(nullptr) {
176 fontinfo_table_.set_compare_callback(
178 fontinfo_table_.set_clear_callback(
180 fontset_table_.set_compare_callback(
182 fontset_table_.set_clear_callback(
184 AdaptedTemplates = nullptr;
185 BackupAdaptedTemplates = nullptr;
186 PreTrainedTemplates = nullptr;
187 AllProtosOn = nullptr;
188 AllConfigsOn = nullptr;
189 AllConfigsOff = nullptr;
190 TempProtoMask = nullptr;
191 NormProtos = nullptr;
192
193 NumAdaptationsFailed = 0;
194
195 learn_debug_win_ = nullptr;
196 learn_fragmented_word_debug_win_ = nullptr;
197 learn_fragments_debug_win_ = nullptr;
199}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
@ character
Definition: mfoutline.h:63
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:138
bool CompareFontSet(const FontSet &fs1, const FontSet &fs2)
Definition: fontinfo.cpp:127
void FontSetDeleteCallback(FontSet fs)
Definition: fontinfo.cpp:147
ParamsVectors * params()
Definition: ccutil.h:67
double speckle_rating_penalty
Definition: classify.h:511
double classify_adapted_pruning_factor
Definition: classify.h:477
double classify_max_rating_ratio
Definition: classify.h:438
BIT_VECTOR AllProtosOn
Definition: classify.h:522
bool matcher_debug_separate_windows
Definition: classify.h:494
IntegerMatcher im_
Definition: classify.h:540
double tessedit_class_miss_scale
Definition: classify.h:475
bool classify_debug_character_fragments
Definition: classify.h:491
bool allow_blob_division
Definition: classify.h:423
double matcher_bad_match_pad
Definition: classify.h:459
bool prioritize_division
Definition: classify.h:428
bool classify_enable_adaptive_debugger
Definition: classify.h:450
BIT_VECTOR TempProtoMask
Definition: classify.h:525
bool classify_save_adapted_templates
Definition: classify.h:449
double classify_adapted_pruning_threshold
Definition: classify.h:479
int classify_cp_cutoff_strength
Definition: classify.h:503
int matcher_min_examples_for_prototyping
Definition: classify.h:464
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:489
int classify_adapt_proto_threshold
Definition: classify.h:481
double matcher_perfect_threshold
Definition: classify.h:458
bool classify_nonlinear_norm
Definition: classify.h:452
int classify_class_pruner_multiplier
Definition: classify.h:501
ShapeTable * shape_table_
Definition: classify.h:546
double speckle_large_max_size
Definition: classify.h:509
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:515
bool classify_use_pre_adapted_templates
Definition: classify.h:447
bool classify_bln_numeric_mode
Definition: classify.h:508
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:514
double classify_misfit_junk_penalty
Definition: classify.h:471
int classify_class_pruner_threshold
Definition: classify.h:499
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
int matcher_permanent_classes_min
Definition: classify.h:462
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
bool classify_enable_learning
Definition: classify.h:429
double matcher_clustering_max_angle_delta
Definition: classify.h:468
double matcher_rating_margin
Definition: classify.h:460
double classify_char_norm_range
Definition: classify.h:436
double classify_max_certainty_margin
Definition: classify.h:440
double matcher_avg_noise_size
Definition: classify.h:461
BIT_VECTOR AllConfigsOff
Definition: classify.h:524
double certainty_scale
Definition: classify.h:473
double matcher_reliable_adaptive_result
Definition: classify.h:457
bool disable_character_fragments
Definition: classify.h:486
char * classify_learn_debug_str
Definition: classify.h:495
int classify_integer_matcher_multiplier
Definition: classify.h:505
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:466
NORM_PROTOS * NormProtos
Definition: classify.h:527
BIT_VECTOR AllConfigsOn
Definition: classify.h:523
bool classify_enable_adaptive_matcher
Definition: classify.h:445
double matcher_good_threshold
Definition: classify.h:456
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:519
int classify_adapt_feature_threshold
Definition: classify.h:483
int classify_learning_debug_level
Definition: classify.h:455

◆ ~Classify()

tesseract::Classify::~Classify ( )
override

Definition at line 201 of file classify.cpp.

201 {
203 delete learn_debug_win_;
204 delete learn_fragmented_word_debug_win_;
205 delete learn_fragments_debug_win_;
206}
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459

Member Function Documentation

◆ AdaptableWord()

bool tesseract::Classify::AdaptableWord ( WERD_RES * word)

Return true if the specified word is acceptable for adaptation.

Globals: none

Parameters
word: current word
Returns
true or false

Definition at line 821 of file adaptmatch.cpp.

821 {
822 if (word->best_choice == nullptr) return false;
823 int BestChoiceLength = word->best_choice->length();
824 float adaptable_score =
826 return // rules that apply in general - simplest to compute first
827 BestChoiceLength > 0 &&
828 BestChoiceLength == word->rebuild_word->NumBlobs() &&
829 BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
830 // This basically ensures that the word is at least a dictionary match
831 // (freq word, user word, system dawg word, etc).
832 // Since all the other adjustments will make adjust factor higher
833 // than higher than adaptable_score=1.1+0.05=1.15
834 // Since these are other flags that ensure that the word is dict word,
835 // this check could be at times redundant.
836 word->best_choice->adjust_factor() <= adaptable_score &&
837 // Make sure that alternative choices are not dictionary words.
838 word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
839}
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:80
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:82
int NumBlobs() const
Definition: blobs.h:448
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
float adjust_factor() const
Definition: ratngs.h:296
int length() const
Definition: ratngs.h:293
virtual Dict & getDict()
Definition: classify.h:107
double segment_penalty_dict_case_ok
Definition: dict.h:605

◆ AdaptiveClassifier()

void tesseract::Classify::AdaptiveClassifier ( TBLOB * Blob,
BLOB_CHOICE_LIST *  Choices 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Parameters
Blob: blob to be classified
[out] Choices: List of choices found by the adaptive matcher. Filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 191 of file adaptmatch.cpp.

191 {
192 assert(Choices != nullptr);
193 auto *Results = new ADAPT_RESULTS;
194 Results->Initialize();
195
196 ASSERT_HOST(AdaptedTemplates != nullptr);
197
198 DoAdaptiveMatch(Blob, Results);
199
200 RemoveBadMatches(Results);
201 Results->match.sort(&UnicharRating::SortDescendingRating);
202 RemoveExtraPuncs(Results);
203 Results->ComputeBest();
204 ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
205 Choices);
206
207 // TODO(rays) Move to before ConvertMatchesToChoices!
208 if (LargeSpeckle(*Blob) || Choices->length() == 0)
209 AddLargeSpeckleTo(Results->BlobLength, Choices);
210
211 if (matcher_debug_level >= 1) {
212 tprintf("AD Matches = ");
214 }
215
216#ifndef GRAPHICS_DISABLED
218 DebugAdaptiveClassifier(Blob, Results);
219#endif
220
221 delete Results;
222} /* AdaptiveClassifier */
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
TBOX bounding_box() const
Definition: blobs.cpp:468
const DENORM & denorm() const
Definition: blobs.h:363
void Initialize()
Definition: adaptmatch.cpp:102
void RemoveBadMatches(ADAPT_RESULTS *Results)
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:242
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:219
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55

◆ AdaptiveClassifierIsEmpty()

bool tesseract::Classify::AdaptiveClassifierIsEmpty ( ) const
inline

Definition at line 326 of file classify.h.

326 {
327 return AdaptedTemplates->NumPermClasses == 0;
328 }
uint8_t NumPermClasses
Definition: adaptive.h:69

◆ AdaptiveClassifierIsFull()

bool tesseract::Classify::AdaptiveClassifierIsFull ( ) const
inline

Definition at line 325 of file classify.h.

325{ return NumAdaptationsFailed > 0; }

◆ AdaptToChar()

void tesseract::Classify::AdaptToChar ( TBLOB * Blob,
CLASS_ID  ClassId,
int  FontinfoId,
float  Threshold,
ADAPT_TEMPLATES  adaptive_templates 
)
Parameters
Blob: blob to add to templates for ClassId
ClassId: class to add blob to
FontinfoId: font information from pre-trained templates
Threshold: minimum match rating to existing template
adaptive_templates: current set of adapted templates

Globals:

  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs

Definition at line 853 of file adaptmatch.cpp.

855 {
856 int NumFeatures;
857 INT_FEATURE_ARRAY IntFeatures;
858 UnicharRating int_result;
859 INT_CLASS IClass;
860 ADAPT_CLASS Class;
861 TEMP_CONFIG TempConfig;
862 FEATURE_SET FloatFeatures;
863 int NewTempConfigId;
864
865 if (!LegalClassId (ClassId))
866 return;
867
868 int_result.unichar_id = ClassId;
869 Class = adaptive_templates->Class[ClassId];
870 assert(Class != nullptr);
871 if (IsEmptyAdaptedClass(Class)) {
872 InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
873 } else {
874 IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
875
876 NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
877 if (NumFeatures <= 0) {
878 return; // Features already freed by GetAdaptiveFeatures.
879 }
880
881 // Only match configs with the matching font.
882 BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
883 for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
884 if (GetFontinfoId(Class, cfg) == FontinfoId) {
885 SET_BIT(MatchingFontConfigs, cfg);
886 } else {
887 reset_bit(MatchingFontConfigs, cfg);
888 }
889 }
890 im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
891 NumFeatures, IntFeatures,
894 FreeBitVector(MatchingFontConfigs);
895
896 SetAdaptiveThreshold(Threshold);
897
898 if (1.0f - int_result.rating <= Threshold) {
899 if (ConfigIsPermanent(Class, int_result.config)) {
901 tprintf("Found good match to perm config %d = %4.1f%%.\n",
902 int_result.config, int_result.rating * 100.0);
903 FreeFeatureSet(FloatFeatures);
904 return;
905 }
906
907 TempConfig = TempConfigFor(Class, int_result.config);
908 IncreaseConfidence(TempConfig);
909 if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
910 Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
911 }
913 tprintf("Increasing reliability of temp config %d to %d.\n",
914 int_result.config, TempConfig->NumTimesSeen);
915
916 if (TempConfigReliable(ClassId, TempConfig)) {
917 MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
918 UpdateAmbigsGroup(ClassId, Blob);
919 }
920 } else {
922 tprintf("Found poor match to temp config %d = %4.1f%%.\n",
923 int_result.config, int_result.rating * 100.0);
925 DisplayAdaptedChar(Blob, IClass);
926 }
927 NewTempConfigId =
928 MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
929 NumFeatures, IntFeatures, FloatFeatures);
930 if (NewTempConfigId >= 0 &&
931 TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
932 MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
933 UpdateAmbigsGroup(ClassId, Blob);
934 }
935
936#ifndef GRAPHICS_DISABLED
938 DisplayAdaptedChar(Blob, IClass);
939 }
940#endif
941 }
942 FreeFeatureSet(FloatFeatures);
943 }
944} /* AdaptToChar */
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:79
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:82
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:95
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
#define NO_DEBUG
Definition: adaptmatch.cpp:79
#define MAX_NUM_PROTOS
Definition: intproto.h:48
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:152
#define ClassForClassId(T, c)
Definition: intproto.h:178
#define LegalClassId(c)
Definition: intproto.h:176
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:62
#define reset_bit(array, bit)
Definition: bitvec.h:57
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define SET_BIT(array, bit)
Definition: bitvec.h:55
uint8_t NumTimesSeen
Definition: adaptive.h:36
uint8_t MaxNumTimesSeen
Definition: adaptive.h:57
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:70
INT_TEMPLATES Templates
Definition: adaptive.h:67
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:946
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:786
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:173
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:693
void SetAdaptiveThreshold(float Threshold)
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:511
uint8_t NumConfigs
Definition: intproto.h:108

◆ AddLargeSpeckleTo()

void tesseract::Classify::AddLargeSpeckleTo ( int  blob_length,
BLOB_CHOICE_LIST *  choices 
)

Definition at line 219 of file classify.cpp.

219 {
220 BLOB_CHOICE_IT bc_it(choices);
221 // If there is no classifier result, we will use the worst possible certainty
222 // and corresponding rating.
223 float certainty = -getDict().certainty_scale;
224 float rating = rating_scale * blob_length;
225 if (!choices->empty() && blob_length > 0) {
226 bc_it.move_to_last();
227 BLOB_CHOICE* worst_choice = bc_it.data();
228 // Add speckle_rating_penalty to worst rating, matching old value.
229 rating = worst_choice->rating() + speckle_rating_penalty;
230 // Compute the rating to correspond to the certainty. (Used to be kept
231 // the same, but that messes up the language model search.)
232 certainty = -rating * getDict().certainty_scale /
233 (rating_scale * blob_length);
234 }
235 auto* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
236 -1, 0.0f, FLT_MAX, 0,
238 bc_it.add_to_end(blob_choice);
239}
@ BCC_SPECKLE_CLASSIFIER
Definition: ratngs.h:46
@ UNICHAR_SPACE
Definition: unicharset.h:34
float rating() const
Definition: ratngs.h:80
double certainty_scale
Definition: dict.h:627

◆ AddNewResult()

void tesseract::Classify::AddNewResult ( const UnicharRating new_result,
ADAPT_RESULTS results 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

Parameters
new_result: new result to add
[out] results: results to add new result to

Definition at line 994 of file adaptmatch.cpp.

995 {
996 int old_match = FindScoredUnichar(new_result.unichar_id, *results);
997
998 if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
999 (old_match < results->match.size() &&
1000 new_result.rating <= results->match[old_match].rating))
1001 return; // New one not good enough.
1002
1003 if (!unicharset.get_fragment(new_result.unichar_id))
1004 results->HasNonfragment = true;
1005
1006 if (old_match < results->match.size()) {
1007 results->match[old_match].rating = new_result.rating;
1008 } else {
1009 results->match.push_back(new_result);
1010 }
1011
1012 if (new_result.rating > results->best_rating &&
1013 // Ensure that fragments do not affect best rating, class and config.
1014 // This is needed so that at least one non-fragmented character is
1015 // always present in the results.
1016 // TODO(daria): verify that this helps accuracy and does not
1017 // hurt performance.
1018 !unicharset.get_fragment(new_result.unichar_id)) {
1019 results->best_match_index = old_match;
1020 results->best_rating = new_result.rating;
1021 results->best_unichar_id = new_result.unichar_id;
1022 }
1023} /* AddNewResult */
int push_back(T object)
UNICHARSET unicharset
Definition: ccutil.h:73
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:94
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:97
int best_match_index
Definition: adaptmatch.cpp:95
bool HasNonfragment
Definition: adaptmatch.cpp:93
float best_rating
Definition: adaptmatch.cpp:96

◆ AmbigClassifier()

void tesseract::Classify::AmbigClassifier ( const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
const TBLOB blob,
INT_TEMPLATES  templates,
ADAPT_CLASS classes,
UNICHAR_ID ambiguities,
ADAPT_RESULTS results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters
blob: blob to be classified
templates: built-in templates to classify against
classes: adapted class templates
ambiguities: array of unichar id's to match against
[out] results: place to put match results
int_features
fx_info

Definition at line 1045 of file adaptmatch.cpp.

1052 {
1053 if (int_features.empty()) return;
1054 auto* CharNormArray = new uint8_t[unicharset.size()];
1055 UnicharRating int_result;
1056
1057 results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
1058 CharNormArray);
1059 bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1060 if (debug)
1061 tprintf("AM Matches = ");
1062
1063 int top = blob->bounding_box().top();
1064 int bottom = blob->bounding_box().bottom();
1065 while (*ambiguities >= 0) {
1066 CLASS_ID class_id = *ambiguities;
1067
1068 int_result.unichar_id = class_id;
1069 im_.Match(ClassForClassId(templates, class_id),
1071 int_features.size(), &int_features[0],
1072 &int_result,
1075
1076 ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
1077 results->BlobLength,
1079 CharNormArray, &int_result, results);
1080 ambiguities++;
1081 }
1082 delete [] CharNormArray;
1083} /* AmbigClassifier */
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65
int size() const
Definition: unicharset.h:341
int32_t BlobLength
Definition: adaptmatch.cpp:92
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)

◆ BaselineClassifier()

UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB Blob,
const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
ADAPT_TEMPLATES  Templates,
ADAPT_RESULTS Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs: expected number of features for each class
Parameters
Blob: blob to be classified
Templates: current set of adapted templates
Results: place to put match results
int_features
fx_info
Returns
Array of possible ambiguous chars that should be checked.

Definition at line 1265 of file adaptmatch.cpp.

1268 {
1269 if (int_features.empty()) return nullptr;
1270 auto* CharNormArray = new uint8_t[unicharset.size()];
1271 ClearCharNormArray(CharNormArray);
1272
1274 PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1275 CharNormArray, BaselineCutoffs, &Results->CPResults);
1276
1278 tprintf("BL Matches = ");
1279
1280 MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1281 CharNormArray,
1282 Templates->Class, matcher_debug_flags, 0,
1283 Blob->bounding_box(), Results->CPResults, Results);
1284
1285 delete [] CharNormArray;
1286 CLASS_ID ClassId = Results->best_unichar_id;
1287 if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1288 return nullptr;
1289
1290 return Templates->Class[ClassId]->
1291 Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1292} /* BaselineClassifier */
int IntCastRounded(double x)
Definition: helpers.h:175
const double kStandardFeatureLength
Definition: intfx.h:46
CLUSTERCONFIG Config
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:98
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:452
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
int32_t Length
Definition: intfx.h:36

◆ CharNormClassifier()

int tesseract::Classify::CharNormClassifier ( TBLOB blob,
const TrainingSample sample,
ADAPT_RESULTS adapt_results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters
blob: blob to be classified
sample: templates to classify unknown against
adapt_results: place to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs

Definition at line 1311 of file adaptmatch.cpp.

1313 {
1314 // This is the length that is used for scaling ratings vs certainty.
1315 adapt_results->BlobLength =
1316 IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1317 GenericVector<UnicharRating> unichar_results;
1318 static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1319 -1, &unichar_results);
1320 // Convert results to the format used internally by AdaptiveClassifier.
1321 for (int r = 0; r < unichar_results.size(); ++r) {
1322 AddNewResult(unichar_results[r], adapt_results);
1323 }
1324 return sample.num_features();
1325} /* CharNormClassifier */
Pix * pix() const
Definition: normalis.h:246
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:994
Definition: cluster.h:32
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)

◆ CharNormTrainingSample()

int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
int  keep_this,
const TrainingSample sample,
GenericVector< UnicharRating > *  results 
)

Definition at line 1329 of file adaptmatch.cpp.

1332 {
1333 results->clear();
1334 auto* adapt_results = new ADAPT_RESULTS();
1335 adapt_results->Initialize();
1336 // Compute the bounding box of the features.
1337 uint32_t num_features = sample.num_features();
1338 // Only the top and bottom of the blob_box are used by MasterMatcher, so
1339 // fabricate right and left using top and bottom.
1340 TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1341 sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1342 // Compute the char_norm_array from the saved cn_feature.
1343 FEATURE norm_feature = sample.GetCNFeature();
1344 auto* char_norm_array = new uint8_t[unicharset.size()];
1345 int num_pruner_classes = std::max(unicharset.size(),
1347 auto* pruner_norm_array = new uint8_t[num_pruner_classes];
1348 adapt_results->BlobLength =
1349 static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1350 ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1351 pruner_norm_array);
1352
1353 PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1354 pruner_norm_array,
1355 shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1356 &adapt_results->CPResults);
1357 delete [] pruner_norm_array;
1358 if (keep_this >= 0) {
1359 adapt_results->CPResults[0].Class = keep_this;
1360 adapt_results->CPResults.truncate(1);
1361 }
1362 if (pruner_only) {
1363 // Convert pruner results to output format.
1364 for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1365 int class_id = adapt_results->CPResults[i].Class;
1366 results->push_back(
1367 UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1368 }
1369 } else {
1370 MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1371 char_norm_array,
1372 nullptr, matcher_debug_flags,
1374 blob_box, adapt_results->CPResults, adapt_results);
1375 // Convert master matcher results to output format.
1376 for (int i = 0; i < adapt_results->match.size(); i++) {
1377 results->push_back(adapt_results->match[i]);
1378 }
1380 }
1381 delete [] char_norm_array;
1382 delete adapt_results;
1383 return num_features;
1384} /* CharNormTrainingSample */
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
@ GeoBottom
Definition: picofeat.h:37
@ GeoTop
Definition: picofeat.h:38
Definition: rect.h:34
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)

◆ ClassAndConfigIDToFontOrShapeID()

int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const

Definition at line 2207 of file adaptmatch.cpp.

2208 {
2209 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2210 // Older inttemps have no font_ids.
2211 if (font_set_id < 0)
2212 return kBlankFontinfoId;
2213 const FontSet &fs = fontset_table_.get(font_set_id);
2214 ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2215 return fs.configs[int_result_config];
2216}
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121

◆ ClassIDToDebugStr()

STRING tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT templates,
int  class_id,
int  config_id 
) const

Definition at line 2194 of file adaptmatch.cpp.

2195 {
2196 STRING class_string;
2197 if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2198 int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2199 class_string = shape_table_->DebugStr(shape_id);
2200 } else {
2201 class_string = unicharset.debug_str(class_id);
2202 }
2203 return class_string;
2204}
Definition: strngs.h:45
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281

◆ ClassifyAsNoise()

void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS results)

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters
results: results to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob

Definition at line 1399 of file adaptmatch.cpp.

1399 {
1400 float rating = results->BlobLength / matcher_avg_noise_size;
1401 rating *= rating;
1402 rating /= 1.0 + rating;
1403
1404 AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1405} /* ClassifyAsNoise */

◆ ClearCharNormArray()

void tesseract::Classify::ClearCharNormArray ( uint8_t *  char_norm_array)

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters
char_norm_array: array to be cleared

Definition at line 44 of file float2int.cpp.

44 {
45 memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
46} /* ClearCharNormArray */

◆ ComputeCharNormArrays()

void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT norm_feature,
INT_TEMPLATES_STRUCT templates,
uint8_t *  char_norm_array,
uint8_t *  pruner_array 
)

Definition at line 1698 of file adaptmatch.cpp.

1701 {
1702 ComputeIntCharNormArray(*norm_feature, char_norm_array);
1703 if (pruner_array != nullptr) {
1704 if (shape_table_ == nullptr) {
1705 ComputeIntCharNormArray(*norm_feature, pruner_array);
1706 } else {
1707 memset(pruner_array, UINT8_MAX,
1708 templates->NumClasses * sizeof(pruner_array[0]));
1709 // Each entry in the pruner norm array is the MIN of all the entries of
1710 // the corresponding unichars in the CharNormArray.
1711 for (int id = 0; id < templates->NumClasses; ++id) {
1712 int font_set_id = templates->Class[id]->font_set_id;
1713 const FontSet &fs = fontset_table_.get(font_set_id);
1714 for (int config = 0; config < fs.size; ++config) {
1715 const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1716 for (int c = 0; c < shape.size(); ++c) {
1717 if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1718 pruner_array[id] = char_norm_array[shape[c].unichar_id];
1719 }
1720 }
1721 }
1722 }
1723 }
1724 FreeFeature(norm_feature);
1725}
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:54
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319

◆ ComputeCorrectedRating()

double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors 
)

Definition at line 1202 of file adaptmatch.cpp.

1207 {
1208 // Compute class feature corrections.
1209 double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1210 cn_factors[unichar_id],
1211 matcher_multiplier);
1212 double miss_penalty = tessedit_class_miss_scale * feature_misses;
1213 double vertical_penalty = 0.0;
1214 // Penalize non-alnums for being vertical misfits.
1215 if (!unicharset.get_isalpha(unichar_id) &&
1216 !unicharset.get_isdigit(unichar_id) &&
1217 cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1218 int min_bottom, max_bottom, min_top, max_top;
1219 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1220 &min_top, &max_top);
1221 if (debug) {
1222 tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1223 top, min_top, max_top, bottom, min_bottom, max_bottom);
1224 }
1225 if (top < min_top || top > max_top ||
1226 bottom < min_bottom || bottom > max_bottom) {
1227 vertical_penalty = classify_misfit_junk_penalty;
1228 }
1229 }
1230 double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1231 if (result < WORST_POSSIBLE_RATING)
1232 result = WORST_POSSIBLE_RATING;
1233 if (debug) {
1234 tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1235 unicharset.id_to_unichar(unichar_id),
1236 result * 100.0,
1237 cp_rating * 100.0,
1238 (1.0 - im_rating) * 100.0,
1239 (cn_corrected - (1.0 - im_rating)) * 100.0,
1240 cn_factors[unichar_id],
1241 miss_penalty * 100.0,
1242 vertical_penalty * 100.0);
1243 }
1244 return result;
1245}
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:86
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)

◆ ComputeIntCharNormArray()

void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT norm_feature,
uint8_t *  char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

  • PreTrainedTemplates current set of built-in templates
Parameters
norm_feature: character normalization feature
[out] char_norm_array: place to put results of size unicharset.size()

Definition at line 62 of file float2int.cpp.

63 {
64 for (int i = 0; i < unicharset.size(); i++) {
65 if (i < PreTrainedTemplates->NumClasses) {
66 int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
67 ComputeNormMatch(i, norm_feature, false));
68 char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
69 } else {
70 // Classes with no templates (eg. ambigs & ligatures) default
71 // to worst match.
72 char_norm_array[i] = MAX_INT_CHAR_NORM;
73 }
74 }
75} /* ComputeIntCharNormArray */
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:108
#define MAX_INT_CHAR_NORM
Definition: float2int.cpp:27
#define INT_CHAR_NORM_RANGE
Definition: intproto.h:130
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94

◆ ComputeIntFeatures()

void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters
Features: floating point pico-features to be converted
[out] IntFeatures: array to put converted features into

Definition at line 90 of file float2int.cpp.

91 {
92 float YShift;
93
95 YShift = BASELINE_Y_SHIFT;
96 else
97 YShift = Y_SHIFT;
98
99 for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {
100 FEATURE Feature = Features->Features[Fid];
101
102 IntFeatures[Fid].X =
104 IntFeatures[Fid].Y =
105 Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
106 IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
108 IntFeatures[Fid].CP_misses = 0;
109 }
110} /* ComputeIntFeatures */
#define BASELINE_Y_SHIFT
Definition: float2int.h:28
#define INT_FEAT_RANGE
Definition: float2int.h:27
uint8_t Bucket8For(float param, float offset, int num_buckets)
Definition: intproto.cpp:418
uint8_t CircBucketFor(float param, float offset, int num_buckets)
Definition: intproto.cpp:432
#define ANGLE_SHIFT
Definition: intproto.h:40
#define X_SHIFT
Definition: intproto.h:41
#define Y_SHIFT
Definition: intproto.h:42
@ baseline
Definition: mfoutline.h:63
@ PicoFeatY
Definition: picofeat.h:44
@ PicoFeatDir
Definition: picofeat.h:44
@ PicoFeatX
Definition: picofeat.h:44
float Params[1]
Definition: ocrfeatures.h:61
FEATURE Features[1]
Definition: ocrfeatures.h:68
uint16_t NumFeatures
Definition: ocrfeatures.h:66

◆ ComputeNormMatch()

float tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT feature,
bool  DebugMatch 
)

This routine compares the given character normalization feature against each character normalization proto for ClassId and returns the match rating of the best match.

Parameters
ClassId: id of class to match against
feature: character normalization feature
DebugMatch: controls dump of debug info

Globals: NormProtos (character normalization prototypes)

Returns
Best match rating for Feature against protos of ClassId.

Definition at line 94 of file normmatch.cpp.

96 {
97 LIST Protos;
98 float BestMatch;
99 float Match;
100 float Delta;
101 PROTOTYPE *Proto;
102 int ProtoId;
103
104 if (ClassId >= NormProtos->NumProtos) {
105 ClassId = NO_CLASS;
106 }
107
108 /* handle requests for classification as noise */
109 if (ClassId == NO_CLASS) {
110 /* kludge - clean up constants and make into control knobs later */
111 Match = (feature.Params[CharNormLength] *
112 feature.Params[CharNormLength] * 500.0 +
113 feature.Params[CharNormRx] *
114 feature.Params[CharNormRx] * 8000.0 +
115 feature.Params[CharNormRy] *
116 feature.Params[CharNormRy] * 8000.0);
117 return (1.0 - NormEvidenceOf(Match));
118 }
119
120 BestMatch = FLT_MAX;
121 Protos = NormProtos->Protos[ClassId];
122
123 if (DebugMatch) {
124 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
125 }
126
127 ProtoId = 0;
128 iterate(Protos) {
129 Proto = reinterpret_cast<PROTOTYPE *>first_node (Protos);
130 Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
131 Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
132 if (DebugMatch) {
133 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
134 Proto->Mean[CharNormY], Delta,
135 Proto->Weight.Elliptical[CharNormY], Match);
136 }
137 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
138 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
139 if (DebugMatch) {
140 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
141 Proto->Mean[CharNormRx], Delta,
142 Proto->Weight.Elliptical[CharNormRx], Match);
143 }
144 // Ry is width! See intfx.cpp.
145 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
146 if (DebugMatch) {
147 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
148 Proto->Mean[CharNormRy], Delta,
150 }
151 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
152 Delta *= kWidthErrorWeighting;
153 Match += Delta;
154 if (DebugMatch) {
155 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
156 Match, Match / classify_norm_adj_midpoint,
157 NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
158 }
159
160 if (Match < BestMatch)
161 BestMatch = Match;
162
163 ProtoId++;
164 }
165 return 1.0 - NormEvidenceOf(BestMatch);
166} /* ComputeNormMatch */
@ CharNormRx
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormLength
Definition: normfeat.h:30
const double kWidthErrorWeighting
Definition: normmatch.cpp:74
double classify_norm_adj_midpoint
Definition: normmatch.cpp:71
#define iterate(l)
Definition: oldlist.h:101
#define first_node(l)
Definition: oldlist.h:92
#define NO_CLASS
Definition: matchdefs.h:35
float * Elliptical
Definition: cluster.h:60
float * Mean
Definition: cluster.h:74
FLOATUNION Weight
Definition: cluster.h:79
LIST * Protos
Definition: normmatch.cpp:38

◆ ConvertMatchesToChoices()

void tesseract::Classify::ConvertMatchesToChoices ( const DENORM denorm,
const TBOX box,
ADAPT_RESULTS Results,
BLOB_CHOICE_LIST *  Choices 
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1413 of file adaptmatch.cpp.

1415 {
1416 assert(Choices != nullptr);
1417 float Rating;
1418 float Certainty;
1419 BLOB_CHOICE_IT temp_it;
1420 bool contains_nonfrag = false;
1421 temp_it.set_to_list(Choices);
1422 int choices_length = 0;
1423 // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1424 // number of returned results, but with a shape_table_ we want to have room
1425 // for at least the biggest shape (which might contain hundreds of Indic
1426 // grapheme fragments) and more, so use double the size of the biggest shape
1427 // if that is more than the default.
1428 int max_matches = MAX_MATCHES;
1429 if (shape_table_ != nullptr) {
1430 max_matches = shape_table_->MaxNumUnichars() * 2;
1431 if (max_matches < MAX_MATCHES)
1432 max_matches = MAX_MATCHES;
1433 }
1434
1435 float best_certainty = -FLT_MAX;
1436 for (int i = 0; i < Results->match.size(); i++) {
1437 const UnicharRating& result = Results->match[i];
1438 bool adapted = result.adapted;
1439 bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1440 if (temp_it.length()+1 == max_matches &&
1441 !contains_nonfrag && current_is_frag) {
1442 continue; // look for a non-fragmented character to fill the
1443 // last spot in Choices if only fragments are present
1444 }
1445 // BlobLength can never be legally 0, this means recognition failed.
1446 // But we must return a classification result because some invoking
1447 // functions (chopper/permuter) do not anticipate a null blob choice.
1448 // So we need to assign a poor, but not infinitely bad score.
1449 if (Results->BlobLength == 0) {
1450 Certainty = -20;
1451 Rating = 100; // should be -certainty * real_blob_length
1452 } else {
1453 Rating = Certainty = (1.0f - result.rating);
1454 Rating *= rating_scale * Results->BlobLength;
1455 Certainty *= -(getDict().certainty_scale);
1456 }
1457 // Adapted results, by their very nature, should have good certainty.
1458 // Those that don't are at best misleading, and often lead to errors,
1459 // so don't accept adapted results that are too far behind the best result,
1460 // whether adapted or static.
1461 // TODO(rays) find some way of automatically tuning these constants.
1462 if (Certainty > best_certainty) {
1463 best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1464 } else if (adapted &&
1465 Certainty / classify_adapted_pruning_factor < best_certainty) {
1466 continue; // Don't accept bad adapted results.
1467 }
1468
1469 float min_xheight, max_xheight, yshift;
1470 denorm.XHeightRange(result.unichar_id, unicharset, box,
1471 &min_xheight, &max_xheight, &yshift);
1472 auto* choice =
1473 new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1475 min_xheight, max_xheight, yshift,
1476 adapted ? BCC_ADAPTED_CLASSIFIER
1478 choice->set_fonts(result.fonts);
1479 temp_it.add_to_end(choice);
1480 contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1481 choices_length++;
1482 if (choices_length >= max_matches) break;
1483 }
1484 Results->match.truncate(choices_length);
1485} // ConvertMatchesToChoices
@ BCC_ADAPTED_CLASSIFIER
Definition: ratngs.h:45
@ BCC_STATIC_CLASSIFIER
Definition: ratngs.h:44
#define MAX_MATCHES
Definition: adaptmatch.cpp:77
void truncate(int size)
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:428
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
GenericVector< ScoredFont > fonts
Definition: shapetable.h:87
int MaxNumUnichars() const
Definition: shapetable.cpp:455

◆ ConvertProto()

void tesseract::Classify::ConvertProto ( PROTO  Proto,
int  ProtoId,
INT_CLASS  Class 
)

This routine converts Proto to integer format and installs it as ProtoId in Class.

Parameters
Proto: floating-pt proto to be converted to integer format
ProtoId: id of proto
Class: integer class to add converted proto to

Definition at line 488 of file intproto.cpp.

488 {
489 INT_PROTO P;
490 float Param;
491
492 assert(ProtoId < Class->NumProtos);
493
494 P = ProtoForProtoId(Class, ProtoId);
495
496 Param = Proto->A * 128;
497 P->A = TruncateParam(Param, -128, 127, nullptr);
498
499 Param = -Proto->B * 256;
500 P->B = TruncateParam(Param, 0, 255, nullptr);
501
502 Param = Proto->C * 128;
503 P->C = TruncateParam(Param, -128, 127, nullptr);
504
505 Param = Proto->Angle * 256;
506 if (Param < 0 || Param >= 256)
507 P->Angle = 0;
508 else
509 P->Angle = static_cast<uint8_t>(Param);
510
511 /* round proto length to nearest integer number of pico-features */
512 Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
513 Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255, nullptr);
515 cprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
516 P->A, P->B, P->C, Class->ProtoLengths[ProtoId]);
517} /* ConvertProto */
int TruncateParam(float Param, int Min, int Max, char *Id)
Definition: intproto.cpp:1701
#define ProtoForProtoId(C, P)
Definition: intproto.h:168
#define GetPicoFeatureLength()
Definition: picofeat.h:57
void cprintf(const char *format,...)
Definition: callcpp.cpp:32
uint8_t Angle
Definition: intproto.h:85
uint8_t * ProtoLengths
Definition: intproto.h:110
float Angle
Definition: protos.h:42
float Length
Definition: protos.h:43
float B
Definition: protos.h:38
float A
Definition: protos.h:37
float C
Definition: protos.h:39

◆ CreateIntTemplates()

INT_TEMPLATES tesseract::Classify::CreateIntTemplates ( CLASSES  FloatProtos,
const UNICHARSET target_unicharset 
)

This routine converts from the old floating point format to the new integer format.

Parameters
FloatProtos: prototypes in old floating pt format
target_unicharset: the UNICHARSET to use
Returns
New set of training templates in integer format.
Note
Globals: none

Definition at line 527 of file intproto.cpp.

529 {
530 INT_TEMPLATES IntTemplates;
531 CLASS_TYPE FClass;
532 INT_CLASS IClass;
533 int ClassId;
534 int ProtoId;
535 int ConfigId;
536
537 IntTemplates = NewIntTemplates();
538
539 for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
540 FClass = &(FloatProtos[ClassId]);
541 if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
542 strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
543 cprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
544 target_unicharset.id_to_unichar(ClassId));
545 }
546 assert(UnusedClassIdIn(IntTemplates, ClassId));
547 IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
548 FontSet fs;
549 fs.size = FClass->font_set.size();
550 fs.configs = new int[fs.size];
551 for (int i = 0; i < fs.size; ++i) {
552 fs.configs[i] = FClass->font_set.get(i);
553 }
554 if (this->fontset_table_.contains(fs)) {
555 IClass->font_set_id = this->fontset_table_.get_id(fs);
556 delete[] fs.configs;
557 } else {
558 IClass->font_set_id = this->fontset_table_.push_back(fs);
559 }
560 AddIntClass(IntTemplates, ClassId, IClass);
561
562 for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
563 AddIntProto(IClass);
564 ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
565 AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
567 AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
568 }
569
570 for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
571 AddIntConfig(IClass);
572 ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
573 }
574 }
575 return (IntTemplates);
576} /* CreateIntTemplates */
INT_TEMPLATES NewIntTemplates()
Definition: intproto.cpp:682
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:328
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:282
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:261
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:463
void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class)
Definition: intproto.cpp:231
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:367
INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs)
Definition: intproto.cpp:626
#define UnusedClassIdIn(T, c)
Definition: intproto.h:177
#define ProtoIn(Class, Pid)
Definition: protos.h:84
int size() const
Return the size used.
const T & get(int id) const
Return the object from an id.
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:488
int16_t NumConfigs
Definition: protos.h:58
int16_t NumProtos
Definition: protos.h:55
UnicityTableEqEq< int > font_set
Definition: protos.h:61
CONFIGS Configurations
Definition: protos.h:60

◆ DebugAdaptiveClassifier()

void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB blob,
ADAPT_RESULTS Results 
)
Parameters
blob: blob whose classification is being debugged
Results: results of match being debugged

Globals: none

Definition at line 1497 of file adaptmatch.cpp.

1498 {
1499 if (static_classifier_ == nullptr) return;
1500 INT_FX_RESULT_STRUCT fx_info;
1502 TrainingSample* sample =
1503 BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1504 if (sample == nullptr) return;
1505 static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1506 Results->best_unichar_id);
1507} /* DebugAdaptiveClassifier */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:77
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)

◆ DisplayAdaptedChar()

void tesseract::Classify::DisplayAdaptedChar ( TBLOB blob,
INT_CLASS_STRUCT int_class 
)

Definition at line 946 of file adaptmatch.cpp.

946 {
947#ifndef GRAPHICS_DISABLED
948 INT_FX_RESULT_STRUCT fx_info;
950 TrainingSample* sample =
952 &bl_features);
953 if (sample == nullptr) return;
954
955 UnicharRating int_result;
956 im_.Match(int_class, AllProtosOn, AllConfigsOn,
957 bl_features.size(), &bl_features[0],
960 tprintf("Best match to temp config %d = %4.1f%%.\n",
961 int_result.config, int_result.rating * 100.0);
963 uint32_t ConfigMask;
964 ConfigMask = 1 << int_result.config;
966 im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask),
967 bl_features.size(), &bl_features[0],
971 }
972
973 delete sample;
974#endif
975}
void UpdateMatchDisplay()
Definition: intproto.cpp:447

◆ DoAdaptiveMatch()

void tesseract::Classify::DoAdaptiveMatch ( TBLOB Blob,
ADAPT_RESULTS Results 
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters
Blob: blob to be classified
Results: place to put match results

Globals:

  • PreTrainedTemplates built-in training templates
  • AdaptedTemplates templates adapted for this page
  • matcher_reliable_adaptive_result rating limit for a great match

Definition at line 1530 of file adaptmatch.cpp.

1530 {
1531 UNICHAR_ID *Ambiguities;
1532
1533 INT_FX_RESULT_STRUCT fx_info;
1535 TrainingSample* sample =
1537 &bl_features);
1538 if (sample == nullptr) return;
1539
1540 // TODO: With LSTM, static_classifier_ is nullptr.
1541 // Return to avoid crash in CharNormClassifier.
1542 if (static_classifier_ == nullptr) {
1543 delete sample;
1544 return;
1545 }
1546
1549 CharNormClassifier(Blob, *sample, Results);
1550 } else {
1551 Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1552 AdaptedTemplates, Results);
1553 if ((!Results->match.empty() &&
1554 MarginalMatch(Results->best_rating,
1556 !tess_bn_matching) ||
1557 Results->match.empty()) {
1558 CharNormClassifier(Blob, *sample, Results);
1559 } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1560 AmbigClassifier(bl_features, fx_info, Blob,
1563 Ambiguities,
1564 Results);
1565 }
1566 }
1567
1568 // Force the blob to be classified as noise
1569 // if the results contain only fragments.
1570 // TODO(daria): verify that this is better than
1571 // just adding a nullptr classification.
1572 if (!Results->HasNonfragment || Results->match.empty())
1573 ClassifyAsNoise(Results);
1574 delete sample;
1575} /* DoAdaptiveMatch */
int UNICHAR_ID
Definition: unichar.h:34
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:131
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
void ClassifyAsNoise(ADAPT_RESULTS *Results)
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)

◆ EndAdaptiveClassifier()

void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

Definition at line 459 of file adaptmatch.cpp.

459 {
460 STRING Filename;
461 FILE *File;
462
463 if (AdaptedTemplates != nullptr &&
465 Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
466 File = fopen (Filename.string(), "wb");
467 if (File == nullptr)
468 cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
469 else {
470 cprintf ("\nSaving adapted templates to %s ...", Filename.string());
471 fflush(stdout);
473 cprintf ("\n");
474 fclose(File);
475 }
476 }
477
478 if (AdaptedTemplates != nullptr) {
480 AdaptedTemplates = nullptr;
481 }
482 if (BackupAdaptedTemplates != nullptr) {
484 BackupAdaptedTemplates = nullptr;
485 }
486
487 if (PreTrainedTemplates != nullptr) {
489 PreTrainedTemplates = nullptr;
490 }
493 if (AllProtosOn != nullptr) {
494 FreeBitVector(AllProtosOn);
495 FreeBitVector(AllConfigsOn);
496 FreeBitVector(AllConfigsOff);
497 FreeBitVector(TempProtoMask);
498 AllProtosOn = nullptr;
499 AllConfigsOn = nullptr;
500 AllConfigsOff = nullptr;
501 TempProtoMask = nullptr;
502 }
503 delete shape_table_;
504 shape_table_ = nullptr;
505 delete static_classifier_;
506 static_classifier_ = nullptr;
507} /* EndAdaptiveClassifier */
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:182
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:75
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:698
STRING imagefile
Definition: ccutil.h:77
const char * string() const
Definition: strngs.cpp:194
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:453
void EndDangerousAmbigs()
Definition: stopper.cpp:360

◆ ExpandShapesAndApplyCorrections()

void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS classes,
bool  debug,
int  class_id,
int  bottom,
int  top,
float  cp_rating,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors,
UnicharRating int_result,
ADAPT_RESULTS final_results 
)

Definition at line 1128 of file adaptmatch.cpp.

1132 {
1133 if (classes != nullptr) {
1134 // Adapted result. Convert configs to fontinfo_ids.
1135 int_result->adapted = true;
1136 for (int f = 0; f < int_result->fonts.size(); ++f) {
1137 int_result->fonts[f].fontinfo_id =
1138 GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1139 }
1140 } else {
1141 // Pre-trained result. Map fonts using font_sets_.
1142 int_result->adapted = false;
1143 for (int f = 0; f < int_result->fonts.size(); ++f) {
1144 int_result->fonts[f].fontinfo_id =
1146 int_result->fonts[f].fontinfo_id);
1147 }
1148 if (shape_table_ != nullptr) {
1149 // Two possible cases:
1150 // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1151 // int_result->fonts are the same. In this case build a new vector of
1152 // mapped fonts and replace the fonts in int_result.
1153 // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1154 // by int_result. In this case, build a vector of UnicharRating to
1155 // gather together different font-ids for each unichar. Also covers case1.
1156 GenericVector<UnicharRating> mapped_results;
1157 for (int f = 0; f < int_result->fonts.size(); ++f) {
1158 int shape_id = int_result->fonts[f].fontinfo_id;
1159 const Shape& shape = shape_table_->GetShape(shape_id);
1160 for (int c = 0; c < shape.size(); ++c) {
1161 int unichar_id = shape[c].unichar_id;
1162 if (!unicharset.get_enabled(unichar_id)) continue;
1163 // Find the mapped_result for unichar_id.
1164 int r = 0;
1165 for (r = 0; r < mapped_results.size() &&
1166 mapped_results[r].unichar_id != unichar_id; ++r) {}
1167 if (r == mapped_results.size()) {
1168 mapped_results.push_back(*int_result);
1169 mapped_results[r].unichar_id = unichar_id;
1170 mapped_results[r].fonts.truncate(0);
1171 }
1172 for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1173 mapped_results[r].fonts.push_back(
1174 ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1175 }
1176 }
1177 }
1178 for (int m = 0; m < mapped_results.size(); ++m) {
1179 mapped_results[m].rating =
1180 ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1181 cp_rating, int_result->rating,
1182 int_result->feature_misses, bottom, top,
1183 blob_length, matcher_multiplier, cn_factors);
1184 AddNewResult(mapped_results[m], final_results);
1185 }
1186 return;
1187 }
1188 }
1189 if (unicharset.get_enabled(class_id)) {
1190 int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1191 int_result->rating,
1192 int_result->feature_misses,
1193 bottom, top, blob_length,
1194 matcher_multiplier, cn_factors);
1195 AddNewResult(*int_result, final_results);
1196 }
1197}
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:878
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)

◆ ExtractFeatures()

void tesseract::Classify::ExtractFeatures ( const TBLOB blob,
bool  nonlinear_norm,
GenericVector< INT_FEATURE_STRUCT > *  bl_features,
GenericVector< INT_FEATURE_STRUCT > *  cn_features,
INT_FX_RESULT_STRUCT results,
GenericVector< int > *  outline_cn_counts 
)
static

Definition at line 442 of file intfx.cpp.

447 {
448 DENORM bl_denorm, cn_denorm;
449 tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
450 &bl_denorm, &cn_denorm, results);
451 if (outline_cn_counts != nullptr)
452 outline_cn_counts->truncate(0);
453 // Iterate the outlines.
454 for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) {
455 // Iterate the polygon.
456 EDGEPT* loop_pt = ol->FindBestStartPt();
457 EDGEPT* pt = loop_pt;
458 if (pt == nullptr) continue;
459 do {
460 if (pt->IsHidden()) continue;
461 // Find a run of equal src_outline.
462 EDGEPT* last_pt = pt;
463 do {
464 last_pt = last_pt->next;
465 } while (last_pt != loop_pt && !last_pt->IsHidden() &&
466 last_pt->src_outline == pt->src_outline);
467 last_pt = last_pt->prev;
468 // Until the adaptive classifier can be weaned off polygon segments,
469 // we have to force extraction from the polygon for the bl_features.
470 ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
471 true, bl_features);
472 ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
473 false, cn_features);
474 pt = last_pt;
475 } while ((pt = pt->next) != loop_pt);
476 if (outline_cn_counts != nullptr)
477 outline_cn_counts->push_back(cn_features->size());
478 }
479 results->NumBL = bl_features->size();
480 results->NumCN = cn_features->size();
481 results->YBottom = blob.bounding_box().bottom();
482 results->YTop = blob.bounding_box().top();
483 results->Width = blob.bounding_box().width();
484}
Definition: blobs.h:99
EDGEPT * next
Definition: blobs.h:192
C_OUTLINE * src_outline
Definition: blobs.h:194
bool IsHidden() const
Definition: blobs.h:176
EDGEPT * prev
Definition: blobs.h:193
TESSLINE * next
Definition: blobs.h:281
TESSLINE * outlines
Definition: blobs.h:400
int16_t width() const
Definition: rect.h:115
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129
int16_t Width
Definition: intfx.h:40
uint8_t YBottom
Definition: intfx.h:41
uint8_t YTop
Definition: intfx.h:42
int16_t NumBL
Definition: intfx.h:39
int16_t NumCN
Definition: intfx.h:39

◆ ExtractIntCNFeatures()

FEATURE_SET tesseract::Classify::ExtractIntCNFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters
blob: blob to extract features from
fx_info
Returns
Integer character-normalized features for blob.

Definition at line 217 of file picofeat.cpp.

218 {
219 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
222 blob, false, &local_fx_info, &bl_features);
223 if (sample == nullptr) return nullptr;
224
225 uint32_t num_features = sample->num_features();
226 const INT_FEATURE_STRUCT* features = sample->features();
227 FEATURE_SET feature_set = NewFeatureSet(num_features);
228 for (uint32_t f = 0; f < num_features; ++f) {
229 FEATURE feature = NewFeature(&IntFeatDesc);
230
231 feature->Params[IntX] = features[f].X;
232 feature->Params[IntY] = features[f].Y;
233 feature->Params[IntDir] = features[f].Theta;
234 AddFeature(feature_set, feature);
235 }
236 delete sample;
237
238 return feature_set;
239} /* ExtractIntCNFeatures */
const FEATURE_DESC_STRUCT IntFeatDesc
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:78
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:40
FEATURE_SET NewFeatureSet(int NumFeatures)
Definition: ocrfeatures.cpp:94
@ IntDir
Definition: picofeat.h:32
@ IntX
Definition: picofeat.h:30
@ IntY
Definition: picofeat.h:31

◆ ExtractIntGeoFeatures()

FEATURE_SET tesseract::Classify::ExtractIntGeoFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters
blob: blob to extract features from
fx_info
Returns
Geometric (top/bottom/width) features for blob.

Definition at line 247 of file picofeat.cpp.

248 {
249 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
252 blob, false, &local_fx_info, &bl_features);
253 if (sample == nullptr) return nullptr;
254
255 FEATURE_SET feature_set = NewFeatureSet(1);
256 FEATURE feature = NewFeature(&IntFeatDesc);
257
258 feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
259 feature->Params[GeoTop] = sample->geo_feature(GeoTop);
260 feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
261 AddFeature(feature_set, feature);
262 delete sample;
263
264 return feature_set;
265} /* ExtractIntGeoFeatures */
@ GeoWidth
Definition: picofeat.h:39

◆ ExtractOutlineFeatures()

FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB Blob)

Convert each segment in the outline to a feature and return the features.

Parameters
Blob: blob to extract outline-features from
Returns
Outline-features for Blob.
Note
Globals: none

Definition at line 41 of file outfeat.cpp.

41 {
42 LIST Outlines;
43 LIST RemainingOutlines;
44 MFOUTLINE Outline;
45 FEATURE_SET FeatureSet;
46 float XScale, YScale;
47
49 if (Blob == nullptr)
50 return (FeatureSet);
51
52 Outlines = ConvertBlob (Blob);
53
54 NormalizeOutlines(Outlines, &XScale, &YScale);
55 RemainingOutlines = Outlines;
56 iterate(RemainingOutlines) {
57 Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
58 ConvertToOutlineFeatures(Outline, FeatureSet);
59 }
61 NormalizeOutlineX(FeatureSet);
62 FreeOutlines(Outlines);
63 return (FeatureSet);
64} /* ExtractOutlineFeatures */
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:37
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:167
void NormalizeOutlineX(FEATURE_SET FeatureSet)
Definition: outfeat.cpp:145
void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: outfeat.cpp:107
#define MAX_OUTLINE_FEATURES
Definition: outfeat.h:35
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:276

◆ ExtractPicoFeatures()

FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB Blob)

Operation: Dummy for now.

Globals:

  • classify_norm_method normalization method currently specified

Parameters
Blob: blob to extract pico-features from
Returns
Pico-features for Blob.

Definition at line 63 of file picofeat.cpp.

63 {
64 LIST Outlines;
65 LIST RemainingOutlines;
66 MFOUTLINE Outline;
67 FEATURE_SET FeatureSet;
68 float XScale, YScale;
69
70 FeatureSet = NewFeatureSet(MAX_PICO_FEATURES);
71 Outlines = ConvertBlob(Blob);
72 NormalizeOutlines(Outlines, &XScale, &YScale);
73 RemainingOutlines = Outlines;
74 iterate(RemainingOutlines) {
75 Outline = static_cast<MFOUTLINE>first_node (RemainingOutlines);
76 ConvertToPicoFeatures2(Outline, FeatureSet);
77 }
79 NormalizePicoX(FeatureSet);
80 FreeOutlines(Outlines);
81 return (FeatureSet);
82
83} /* ExtractPicoFeatures */
void NormalizePicoX(FEATURE_SET FeatureSet)
Definition: picofeat.cpp:193
void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: picofeat.cpp:155
#define MAX_PICO_FEATURES
Definition: picofeat.h:46

◆ FreeNormProtos()

void tesseract::Classify::FreeNormProtos ( )

Definition at line 168 of file normmatch.cpp.

168 {
169 if (NormProtos != nullptr) {
170 for (int i = 0; i < NormProtos->NumProtos; i++)
175 NormProtos = nullptr;
176 }
177}
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
void Efree(void *ptr)
Definition: emalloc.cpp:45
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:37

◆ get_fontinfo_table() [1/2]

UnicityTable< FontInfo > & tesseract::Classify::get_fontinfo_table ( )
inline

Definition at line 386 of file classify.h.

386 {
387 return fontinfo_table_;
388 }

◆ get_fontinfo_table() [2/2]

const UnicityTable< FontInfo > & tesseract::Classify::get_fontinfo_table ( ) const
inline

Definition at line 389 of file classify.h.

389 {
390 return fontinfo_table_;
391 }

◆ get_fontset_table()

UnicityTable< FontSet > & tesseract::Classify::get_fontset_table ( )
inline

Definition at line 392 of file classify.h.

392 {
393 return fontset_table_;
394 }

◆ GetAdaptiveFeatures()

int tesseract::Classify::GetAdaptiveFeatures ( TBLOB Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET FloatFeatures 
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters
Blob: blob to extract features from
[out] IntFeatures: array to fill with integer features
[out] FloatFeatures: place to return actual floating-pt features
Returns
Number of pico-features returned (0 if an error occurred)

Definition at line 786 of file adaptmatch.cpp.

788 {
789 FEATURE_SET Features;
790 int NumFeatures;
791
793 Features = ExtractPicoFeatures(Blob);
794
795 NumFeatures = Features->NumFeatures;
796 if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
797 FreeFeatureSet(Features);
798 return 0;
799 }
800
801 ComputeIntFeatures(Features, IntFeatures);
802 *FloatFeatures = Features;
803
804 return NumFeatures;
805} /* GetAdaptiveFeatures */
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:78
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:63
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90

◆ GetAmbiguities()

UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB Blob,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters
Blob: blob to get classification ambiguities for
CorrectClass: correct class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns
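Newly allocated array of UNICHAR_IDs, terminated by -1, containing all possible ambiguous classes (only the -1 terminator if the correct class is the sole match or nothing matched), or nullptr if no training sample could be extracted from Blob.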

Definition at line 1592 of file adaptmatch.cpp.

1593 {
1594 auto *Results = new ADAPT_RESULTS();
1595 UNICHAR_ID *Ambiguities;
1596 int i;
1597
1598 Results->Initialize();
1599 INT_FX_RESULT_STRUCT fx_info;
1601 TrainingSample* sample =
1603 &bl_features);
1604 if (sample == nullptr) {
1605 delete Results;
1606 return nullptr;
1607 }
1608
1609 CharNormClassifier(Blob, *sample, Results);
1610 delete sample;
1611 RemoveBadMatches(Results);
1612 Results->match.sort(&UnicharRating::SortDescendingRating);
1613
1614 /* copy the class id's into an string of ambiguities - don't copy if
1615 the correct class is the only class id matched */
1616 Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1617 if (Results->match.size() > 1 ||
1618 (Results->match.size() == 1 &&
1619 Results->match[0].unichar_id != CorrectClass)) {
1620 for (i = 0; i < Results->match.size(); i++)
1621 Ambiguities[i] = Results->match[i].unichar_id;
1622 Ambiguities[i] = -1;
1623 } else {
1624 Ambiguities[0] = -1;
1625 }
1626
1627 delete Results;
1628 return Ambiguities;
1629} /* GetAmbiguities */

◆ GetCharNormFeature()

int tesseract::Classify::GetCharNormFeature ( const INT_FX_RESULT_STRUCT fx_info,
INT_TEMPLATES  templates,
uint8_t *  pruner_norm_array,
uint8_t *  char_norm_array 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters
templates: used to compute char norm adjustments
pruner_norm_array: array of factors from blob normalization process
char_norm_array: array to fill with dummy char norm adjustments
fx_info

Globals:

Returns
Number of features extracted or 0 if an error occurred.

Definition at line 1678 of file adaptmatch.cpp.

1681 {
1682 FEATURE norm_feature = NewFeature(&CharNormDesc);
1684 float scale = MF_SCALE_FACTOR;
1685 norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1686 norm_feature->Params[CharNormLength] =
1687 fx_info.Length * scale / LENGTH_COMPRESSION;
1688 norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1689 norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1690 // Deletes norm_feature.
1691 ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1692 pruner_norm_array);
1694} /* GetCharNormFeature */
const int kBlnBaselineOffset
Definition: normalis.h:25
const FEATURE_DESC_STRUCT CharNormDesc
const float MF_SCALE_FACTOR
Definition: mfoutline.h:71
#define LENGTH_COMPRESSION
Definition: normfeat.h:27
int16_t Ymean
Definition: intfx.h:37

◆ GetClassToDebug()

CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)

This routine prompts the user with Prompt and waits for the user to enter something in the debug window.

Parameters
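Prompt: prompt to print while waiting for input from window
adaptive_on: [out] set to whether the adaptive classifier was selected by the popup command
pretrained_on: [out] set to whether the pre-trained (static) classifier was selected by the popup command
shape_id: [out] shape index entered via the shape-index popup; set to -1 for adaptive-only debugging or when no shape table is loaded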
Returns
Character entered in the debug window.
Note
Globals: none

Definition at line 1255 of file intproto.cpp.

1256 {
1257 tprintf("%s\n", Prompt);
1258 SVEvent* ev;
1259 SVEventType ev_type;
1260 int unichar_id = INVALID_UNICHAR_ID;
1261 // Wait until a click or popup event.
1262 do {
1263 ev = IntMatchWindow->AwaitEvent(SVET_ANY);
1264 ev_type = ev->type;
1265 if (ev_type == SVET_POPUP) {
1266 if (ev->command_id == IDA_SHAPE_INDEX) {
1267 if (shape_table_ != nullptr) {
1268 *shape_id = atoi(ev->parameter);
1269 *adaptive_on = false;
1270 *pretrained_on = true;
1271 if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
1272 int font_id;
1273 shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
1274 &font_id);
1275 tprintf("Shape %d, first unichar=%d, font=%d\n",
1276 *shape_id, unichar_id, font_id);
1277 return unichar_id;
1278 }
1279 tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
1280 } else {
1281 tprintf("No shape table loaded!\n");
1282 }
1283 } else {
1285 unichar_id = unicharset.unichar_to_id(ev->parameter);
1286 if (ev->command_id == IDA_ADAPTIVE) {
1287 *adaptive_on = true;
1288 *pretrained_on = false;
1289 *shape_id = -1;
1290 } else if (ev->command_id == IDA_STATIC) {
1291 *adaptive_on = false;
1292 *pretrained_on = true;
1293 } else {
1294 *adaptive_on = true;
1295 *pretrained_on = true;
1296 }
1297 if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {
1298 *shape_id = -1;
1299 return unichar_id;
1300 }
1301 for (int s = 0; s < shape_table_->NumShapes(); ++s) {
1302 if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
1303 tprintf("%s\n", shape_table_->DebugStr(s).string());
1304 }
1305 }
1306 } else {
1307 tprintf("Char class '%s' not found in unicharset",
1308 ev->parameter);
1309 }
1310 }
1311 }
1312 delete ev;
1313 } while (ev_type != SVET_CLICK);
1314 return 0;
1315} /* GetClassToDebug */
@ IDA_STATIC
Definition: intproto.h:156
@ IDA_SHAPE_INDEX
Definition: intproto.h:157
@ IDA_ADAPTIVE
Definition: intproto.h:155
SVEventType
Definition: scrollview.h:45
@ SVET_CLICK
Definition: scrollview.h:48
@ SVET_POPUP
Definition: scrollview.h:54
@ SVET_ANY
Definition: scrollview.h:56
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
int NumShapes() const
Definition: shapetable.h:274
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
SVEventType type
Definition: scrollview.h:64
char * parameter
Definition: scrollview.h:66
int command_id
Definition: scrollview.h:71
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443

◆ getDict()

virtual Dict & tesseract::Classify::getDict ( )
inline virtual

Reimplemented in tesseract::Tesseract.

Definition at line 107 of file classify.h.

107 {
108 return dict_;
109 }

◆ GetFontinfoId()

int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS  Class,
uint8_t  ConfigId 
)

Definition at line 173 of file adaptive.cpp.

173 {
174 return (ConfigIsPermanent(Class, ConfigId) ?
175 PermConfigFor(Class, ConfigId)->FontinfoId :
176 TempConfigFor(Class, ConfigId)->FontinfoId);
177}
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93

◆ InitAdaptedClass()

void tesseract::Classify::InitAdaptedClass ( TBLOB Blob,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS  Class,
ADAPT_TEMPLATES  Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters
Blob: blob to model new class after
ClassId: id of the class to be initialized
FontinfoId: font information inferred from pre-trained templates
Class: adapted class to be initialized
Templates: adapted templates to add new class to

Globals:

Definition at line 693 of file adaptmatch.cpp.

697 {
698 FEATURE_SET Features;
699 int Fid, Pid;
700 FEATURE Feature;
701 int NumFeatures;
702 TEMP_PROTO TempProto;
703 PROTO Proto;
704 INT_CLASS IClass;
706
708 Features = ExtractOutlineFeatures(Blob);
709 NumFeatures = Features->NumFeatures;
710 if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
711 FreeFeatureSet(Features);
712 return;
713 }
714
715 Config = NewTempConfig(NumFeatures - 1, FontinfoId);
716 TempConfigFor(Class, 0) = Config;
717
718 /* this is a kludge to construct cutoffs for adapted templates */
719 if (Templates == AdaptedTemplates)
720 BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
721
722 IClass = ClassForClassId (Templates->Templates, ClassId);
723
724 for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
725 Pid = AddIntProto (IClass);
726 assert (Pid != NO_PROTO);
727
728 Feature = Features->Features[Fid];
729 TempProto = NewTempProto ();
730 Proto = &(TempProto->Proto);
731
732 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
733 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
734 instead of the -0.25 to 0.75 used in baseline normalization */
735 Proto->Angle = Feature->Params[OutlineFeatDir];
736 Proto->X = Feature->Params[OutlineFeatX];
737 Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
738 Proto->Length = Feature->Params[OutlineFeatLength];
739 FillABC(Proto);
740
741 TempProto->ProtoId = Pid;
742 SET_BIT (Config->Protos, Pid);
743
744 ConvertProto(Proto, Pid, IClass);
745 AddProtoToProtoPruner(Proto, Pid, IClass,
747
748 Class->TempProtos = push (Class->TempProtos, TempProto);
749 }
750 FreeFeatureSet(Features);
751
752 AddIntConfig(IClass);
753 ConvertConfig (AllProtosOn, 0, IClass);
754
756 tprintf("Added new class '%s' with class id %d and %d protos.\n",
757 unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
759 DisplayAdaptedChar(Blob, IClass);
760 }
761
762 if (IsEmptyAdaptedClass(Class))
763 (Templates->NumNonEmptyClasses)++;
764} /* InitAdaptedClass */
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:228
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:203
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:84
@ OutlineFeatDir
Definition: outfeat.h:32
@ OutlineFeatX
Definition: outfeat.h:29
@ OutlineFeatY
Definition: outfeat.h:30
@ OutlineFeatLength
Definition: outfeat.h:31
void FillABC(PROTO Proto)
Definition: protos.cpp:108
LIST push(LIST list, void *element)
Definition: oldlist.cpp:213
#define NO_PROTO
Definition: matchdefs.h:41
uint16_t ProtoId
Definition: adaptive.h:28
PROTO_STRUCT Proto
Definition: adaptive.h:29
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:41
float Y
Definition: protos.h:41
float X
Definition: protos.h:40

◆ InitAdaptiveClassifier()

void tesseract::Classify::InitAdaptiveClassifier ( TessdataManager mgr)

This routine reads in the training information needed by the adaptive classifier and saves it into global variables. Parameters: load_pre_trained_templates – indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded; should only be set to true if the necessary classifier components are present in the [lang].traineddata file. (Note: the signature shown above takes mgr instead of load_pre_trained_templates; in the listing below the pre-trained components are read from mgr when mgr is non-null and language_data_path_prefix is non-empty.) Globals: BuiltInTemplatesFile – file to get built-in temps from; BuiltInCutoffsFile – file to get avg. feat per class from; classify_use_pre_adapted_templates – enables use of pre-adapted templates.

Definition at line 527 of file adaptmatch.cpp.

527 {
529 return;
530 if (AllProtosOn != nullptr)
531 EndAdaptiveClassifier(); // Don't leak with multiple inits.
532
533 // If there is no language_data_path_prefix, the classifier will be
534 // adaptive only.
535 if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
536 TFile fp;
537 ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
539
540 if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
541 shape_table_ = new ShapeTable(unicharset);
542 if (!shape_table_->DeSerialize(&fp)) {
543 tprintf("Error loading shape table!\n");
544 delete shape_table_;
545 shape_table_ = nullptr;
546 }
547 }
548
549 ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
550 ReadNewCutoffs(&fp, CharNormCutoffs);
551
552 ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
554 static_classifier_ = new TessClassifier(false, this);
555 }
556
558
559 AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
560 AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
561 AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
562 TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
563 set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
564 set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
565 zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
566
567 for (uint16_t& BaselineCutoff : BaselineCutoffs) {
568 BaselineCutoff = 0;
569 }
570
572 TFile fp;
573 STRING Filename;
574
575 Filename = imagefile;
576 Filename += ADAPT_TEMPLATE_SUFFIX;
577 if (!fp.Open(Filename.string(), nullptr)) {
579 } else {
580 cprintf("\nReading pre-adapted templates from %s ...\n",
581 Filename.string());
582 fflush(stdout);
584 cprintf("\n");
586
587 for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
588 BaselineCutoffs[i] = CharNormCutoffs[i];
589 }
590 }
591 } else {
592 if (AdaptedTemplates != nullptr)
595 }
596} /* InitAdaptiveClassifier */
void InitIntegerFX()
Definition: intfx.cpp:49
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
@ TESSDATA_SHAPE_TABLE
STRING language_data_path_prefix
Definition: ccutil.h:72
int32_t length() const
Definition: strngs.cpp:189
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:244
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:151
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:190
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:718
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:332
void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs)
Definition: cutoffs.cpp:41
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246

◆ LargeSpeckle()

bool tesseract::Classify::LargeSpeckle ( const TBLOB blob)

Definition at line 242 of file classify.cpp.

242 {
243 double speckle_size = kBlnXHeight * speckle_large_max_size;
244 TBOX bbox = blob.bounding_box();
245 return bbox.width() < speckle_size && bbox.height() < speckle_size;
246}
const int kBlnXHeight
Definition: normalis.h:24
int16_t height() const
Definition: rect.h:108

◆ LearnBlob()

void tesseract::Classify::LearnBlob ( const STRING fontname,
TBLOB Blob,
const DENORM cn_denorm,
const INT_FX_RESULT_STRUCT fx_info,
const char *  blob_text 
)

Definition at line 70 of file blobclass.cpp.

73 {
75 CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
76 CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
77 CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
78 CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
79
80 if (ValidCharDescription(feature_defs_, CharDesc)) {
81 // Label the features with a class name and font name.
82 tr_file_data_ += "\n";
83 tr_file_data_ += fontname;
84 tr_file_data_ += " ";
85 tr_file_data_ += blob_text;
86 tr_file_data_ += "\n";
87
88 // write micro-features to file and clean up
89 WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
90 } else {
91 tprintf("Blob learned was invalid!\n");
92 }
93 FreeCharDescription(CharDesc);
94} // LearnBlob
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:174
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:148
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:129
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:195
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:43
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:61
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:217
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:247
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:41

◆ LearnPieces()

void tesseract::Classify::LearnPieces ( const char *  fontname,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES word 
)

Definition at line 374 of file adaptmatch.cpp.

376 {
377 // TODO(daria) Remove/modify this if/when we want
378 // to train and/or adapt to n-grams.
379 if (segmentation != CST_WHOLE &&
380 (segmentation != CST_FRAGMENT || disable_character_fragments))
381 return;
382
383 if (length > 1) {
384 SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
385 start + length - 1);
386 }
387 TBLOB* blob = word->chopped_word->blobs[start];
388 // Rotate the blob if needed for classification.
389 TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
390 if (rotated_blob == nullptr)
391 rotated_blob = blob;
392
393 #ifndef GRAPHICS_DISABLED
394 // Draw debug windows showing the blob that is being learned if needed.
395 if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
396 RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
397 word->chopped_word->bounding_box());
398 rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
399 learn_debug_win_->Update();
400 window_wait(learn_debug_win_);
401 }
402 if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
403 ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
404 blob->plot(learn_fragments_debug_win_,
406 learn_fragments_debug_win_->Update();
407 }
408 #endif // GRAPHICS_DISABLED
409
410 if (fontname != nullptr) {
411 classify_norm_method.set_value(character); // force char norm spc 30/11/93
412 tess_bn_matching.set_value(false); // turn it off
413 tess_cn_matching.set_value(false);
414 DENORM bl_denorm, cn_denorm;
415 INT_FX_RESULT_STRUCT fx_info;
417 &bl_denorm, &cn_denorm, &fx_info);
418 LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
419 } else if (unicharset.contains_unichar(correct_text)) {
420 UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
421 int font_id = word->fontinfo != nullptr
422 ? fontinfo_table_.get_id(*word->fontinfo)
423 : 0;
425 tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
426 unicharset.id_to_unichar(class_id), threshold, font_id);
427 // If fontname is nullptr we are doing recognition
428 // (as opposed to training), so we must have already set word fonts.
429 AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
430 if (BackupAdaptedTemplates != nullptr) {
431 // Adapt the backup templates too. They will be used if the primary gets
432 // too full.
433 AdaptToChar(rotated_blob, class_id, font_id, threshold,
435 }
436 } else if (classify_debug_level >= 1) {
437 tprintf("Can't adapt to %s not in unicharset\n", correct_text);
438 }
439 if (rotated_blob != blob) {
440 delete rotated_blob;
441 }
442
443 SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
444 start + length - 1);
445} // LearnPieces.
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
@ CST_WHOLE
Definition: classify.h:98
@ CST_FRAGMENT
Definition: classify.h:97
Definition: blobs.h:284
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:346
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
TBOX bounding_box() const
Definition: blobs.cpp:861
const FontInfo * fontinfo
Definition: pageres.h:309
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
TWERD * chopped_word
Definition: pageres.h:212
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:226
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:853
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:70
static void Update()
Definition: scrollview.cpp:709

◆ LearnWord()

void tesseract::Classify::LearnWord ( const char *  fontname,
WERD_RES word 
)

Definition at line 250 of file adaptmatch.cpp.

250 {
251 int word_len = word->correct_text.size();
252 if (word_len == 0) return;
253
254 float* thresholds = nullptr;
255 if (fontname == nullptr) {
256 // Adaption mode.
257 if (!EnableLearning || word->best_choice == nullptr)
258 return; // Can't or won't adapt.
259
261 tprintf("\n\nAdapting to word = %s\n",
262 word->best_choice->debug_string().string());
263 thresholds = new float[word_len];
267 matcher_rating_margin, thresholds);
268 }
269 int start_blob = 0;
270
271 #ifndef GRAPHICS_DISABLED
273 if (learn_fragmented_word_debug_win_ != nullptr) {
274 window_wait(learn_fragmented_word_debug_win_);
275 }
276 RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
277 word->chopped_word->bounding_box());
278 RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
279 word->chopped_word->bounding_box());
280 word->chopped_word->plot(learn_fragmented_word_debug_win_);
282 }
283 #endif // GRAPHICS_DISABLED
284
285 for (int ch = 0; ch < word_len; ++ch) {
287 tprintf("\nLearning %s\n", word->correct_text[ch].string());
288 }
289 if (word->correct_text[ch].length() > 0) {
290 float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
291
292 LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
293 CST_WHOLE, word->correct_text[ch].string(), word);
294
295 if (word->best_state[ch] > 1 && !disable_character_fragments) {
296 // Check that the character breaks into meaningful fragments
297 // that each match a whole character with at least
298 // classify_character_fragments_garbage_certainty_threshold
299 bool garbage = false;
300 int frag;
301 for (frag = 0; frag < word->best_state[ch]; ++frag) {
302 TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
304 garbage |= LooksLikeGarbage(frag_blob);
305 }
306 }
307 // Learn the fragments.
308 if (!garbage) {
309 bool pieces_all_natural = word->PiecesAllNatural(start_blob,
310 word->best_state[ch]);
311 if (pieces_all_natural || !prioritize_division) {
312 for (frag = 0; frag < word->best_state[ch]; ++frag) {
314 word->correct_text[ch].split(' ', &tokens);
315
316 tokens[0] = CHAR_FRAGMENT::to_string(
317 tokens[0].string(), frag, word->best_state[ch],
318 pieces_all_natural);
319
320 STRING full_string;
321 for (int i = 0; i < tokens.size(); i++) {
322 full_string += tokens[i];
323 if (i != tokens.size() - 1)
324 full_string += ' ';
325 }
326 LearnPieces(fontname, start_blob + frag, 1, threshold,
327 CST_FRAGMENT, full_string.string(), word);
328 }
329 }
330 }
331 }
332
333 // TODO(rays): re-enable this part of the code when we switch to the
334 // new classifier that needs to see examples of garbage.
335 /*
336 if (word->best_state[ch] > 1) {
337 // If the next blob is good, make junk with the rightmost fragment.
338 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
339 LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
340 word->best_state[ch + 1] + 1,
341 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
342 }
343 // If the previous blob is good, make junk with the leftmost fragment.
344 if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
345 LearnPieces(fontname, start_blob - word->best_state[ch - 1],
346 word->best_state[ch - 1] + 1,
347 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
348 }
349 }
350 // If the next blob is good, make a join with it.
351 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
352 STRING joined_text = word->correct_text[ch];
353 joined_text += word->correct_text[ch + 1];
354 LearnPieces(fontname, start_blob,
355 word->best_state[ch] + word->best_state[ch + 1],
356 threshold, CST_NGRAM, joined_text.string(), word);
357 }
358 */
359 }
360 start_blob += word->best_state[ch];
361 }
362 delete [] thresholds;
363} // LearnWord.
int length() const
Definition: genericvector.h:86
void plot(ScrollView *window)
Definition: blobs.cpp:897
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
GenericVector< int > best_state
Definition: pageres.h:285
GenericVector< STRING > correct_text
Definition: pageres.h:289
const STRING debug_string() const
Definition: ratngs.h:495
STRING to_string() const
Definition: unicharset.h:79
bool LooksLikeGarbage(TBLOB *blob)
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:374

◆ LooksLikeGarbage()

bool tesseract::Classify::LooksLikeGarbage ( TBLOB blob)

Definition at line 1633 of file adaptmatch.cpp.

1633 {
1634 auto *ratings = new BLOB_CHOICE_LIST();
1635 AdaptiveClassifier(blob, ratings);
1636 BLOB_CHOICE_IT ratings_it(ratings);
1639 print_ratings_list("======================\nLooksLikeGarbage() got ",
1640 ratings, unicharset);
1641 }
1642 for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1643 ratings_it.forward()) {
1644 if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1645 continue;
1646 }
1647 float certainty = ratings_it.data()->certainty();
1648 delete ratings;
1649 return certainty <
1651 }
1652 delete ratings;
1653 return true; // no whole characters in ratings
1654}
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191
const UNICHARSET & getUnicharset() const
Definition: dict.h:101

◆ MakeNewTemporaryConfig()

int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters
Templates: adapted templates to add new config to
ClassId: class id to associate with new config
FontinfoId: font information inferred from pre-trained templates
NumFeatures: number of features in Features
Features: features describing model for new config
FloatFeatures: floating-pt representation of features
Returns
The id of the new config created, a negative integer in case of error.

Definition at line 1740 of file adaptmatch.cpp.

1745 {
1746 INT_CLASS IClass;
1747 ADAPT_CLASS Class;
1748 PROTO_ID OldProtos[MAX_NUM_PROTOS];
1749 FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1750 int NumOldProtos;
1751 int NumBadFeatures;
1752 int MaxProtoId, OldMaxProtoId;
1753 int MaskSize;
1754 int ConfigId;
1756 int i;
1757 int debug_level = NO_DEBUG;
1758
1760 debug_level =
1762
1763 IClass = ClassForClassId(Templates->Templates, ClassId);
1764 Class = Templates->Class[ClassId];
1765
1766 if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1767 ++NumAdaptationsFailed;
1769 cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1770 return -1;
1771 }
1772
1773 OldMaxProtoId = IClass->NumProtos - 1;
1774
1775 NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1776 NumFeatures, Features,
1778 debug_level);
1779
1780 MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1781 zero_all_bits(TempProtoMask, MaskSize);
1782 for (i = 0; i < NumOldProtos; i++)
1783 SET_BIT(TempProtoMask, OldProtos[i]);
1784
1785 NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1786 NumFeatures, Features,
1787 BadFeatures,
1789 debug_level);
1790
1791 MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1792 IClass, Class, TempProtoMask);
1793 if (MaxProtoId == NO_PROTO) {
1794 ++NumAdaptationsFailed;
1796 cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1797 return -1;
1798 }
1799
1800 ConfigId = AddIntConfig(IClass);
1801 ConvertConfig(TempProtoMask, ConfigId, IClass);
1802 Config = NewTempConfig(MaxProtoId, FontinfoId);
1803 TempConfigFor(Class, ConfigId) = Config;
1804 copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1805
1807 cprintf("Making new temp config %d fontinfo id %d"
1808 " using %d old and %d new protos.\n",
1809 ConfigId, Config->FontinfoId,
1810 NumOldProtos, MaxProtoId - OldMaxProtoId);
1811
1812 return ConfigId;
1813} /* MakeNewTemporaryConfig */
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:188
#define PRINT_PROTO_MATCHES
Definition: intproto.h:192
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:191
int16_t PROTO_ID
Definition: matchdefs.h:40
uint8_t FEATURE_ID
Definition: matchdefs.h:46
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:657
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:589
uint16_t NumProtos
Definition: intproto.h:106

◆ MakeNewTempProtos()

PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS  IClass,
ADAPT_CLASS  Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters
Features: floating-pt features describing new character
NumBadFeat: number of bad features to turn into protos
BadFeat: feature id's of bad features
IClass: integer class templates to add new protos to
Class: adapted class templates to add new protos to
TempProtoMask: proto mask to add new protos to

Globals: none

Returns
Max proto id in class after all protos have been added.

Definition at line 1834 of file adaptmatch.cpp.

1839 {
1840 FEATURE_ID *ProtoStart;
1841 FEATURE_ID *ProtoEnd;
1842 FEATURE_ID *LastBad;
1843 TEMP_PROTO TempProto;
1844 PROTO Proto;
1845 FEATURE F1, F2;
1846 float X1, X2, Y1, Y2;
1847 float A1, A2, AngleDelta;
1848 float SegmentLength;
1849 PROTO_ID Pid;
1850
1851 for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1852 ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1853 F1 = Features->Features[*ProtoStart];
1854 X1 = F1->Params[PicoFeatX];
1855 Y1 = F1->Params[PicoFeatY];
1856 A1 = F1->Params[PicoFeatDir];
1857
1858 for (ProtoEnd = ProtoStart + 1,
1859 SegmentLength = GetPicoFeatureLength();
1860 ProtoEnd < LastBad;
1861 ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1862 F2 = Features->Features[*ProtoEnd];
1863 X2 = F2->Params[PicoFeatX];
1864 Y2 = F2->Params[PicoFeatY];
1865 A2 = F2->Params[PicoFeatDir];
1866
1867 AngleDelta = fabs(A1 - A2);
1868 if (AngleDelta > 0.5)
1869 AngleDelta = 1.0 - AngleDelta;
1870
1871 if (AngleDelta > matcher_clustering_max_angle_delta ||
1872 fabs(X1 - X2) > SegmentLength ||
1873 fabs(Y1 - Y2) > SegmentLength)
1874 break;
1875 }
1876
1877 F2 = Features->Features[*(ProtoEnd - 1)];
1878 X2 = F2->Params[PicoFeatX];
1879 Y2 = F2->Params[PicoFeatY];
1880 A2 = F2->Params[PicoFeatDir];
1881
1882 Pid = AddIntProto(IClass);
1883 if (Pid == NO_PROTO)
1884 return (NO_PROTO);
1885
1886 TempProto = NewTempProto();
1887 Proto = &(TempProto->Proto);
1888
1889 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1890 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1891 instead of the -0.25 to 0.75 used in baseline normalization */
1892 Proto->Length = SegmentLength;
1893 Proto->Angle = A1;
1894 Proto->X = (X1 + X2) / 2.0;
1895 Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1896 FillABC(Proto);
1897
1898 TempProto->ProtoId = Pid;
1899 SET_BIT(TempProtoMask, Pid);
1900
1901 ConvertProto(Proto, Pid, IClass);
1902 AddProtoToProtoPruner(Proto, Pid, IClass,
1904
1905 Class->TempProtos = push(Class->TempProtos, TempProto);
1906 }
1907 return IClass->NumProtos - 1;
1908} /* MakeNewTempProtos */

◆ MakePermanent()

void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  ConfigId,
TBLOB Blob 
)
Parameters
Templates: current set of adaptive templates
ClassId: class containing config to be made permanent
ConfigId: config to be made permanent
Blob: current blob being adapted to

Globals: none

Definition at line 1920 of file adaptmatch.cpp.

1923 {
1924 UNICHAR_ID *Ambigs;
1926 ADAPT_CLASS Class;
1927 PROTO_KEY ProtoKey;
1928
1929 Class = Templates->Class[ClassId];
1930 Config = TempConfigFor(Class, ConfigId);
1931
1932 MakeConfigPermanent(Class, ConfigId);
1933 if (Class->NumPermConfigs == 0)
1934 Templates->NumPermClasses++;
1935 Class->NumPermConfigs++;
1936
1937 // Initialize permanent config.
1938 Ambigs = GetAmbiguities(Blob, ClassId);
1939 auto Perm = static_cast<PERM_CONFIG>(malloc(sizeof(PERM_CONFIG_STRUCT)));
1940 Perm->Ambigs = Ambigs;
1941 Perm->FontinfoId = Config->FontinfoId;
1942
1943 // Free memory associated with temporary config (since ADAPTED_CONFIG
1944 // is a union we need to clean up before we record permanent config).
1945 ProtoKey.Templates = Templates;
1946 ProtoKey.ClassId = ClassId;
1947 ProtoKey.ConfigId = ConfigId;
1948 Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1950
1951 // Record permanent config.
1952 PermConfigFor(Class, ConfigId) = Perm;
1953
1955 tprintf("Making config %d for %s (ClassId %d) permanent:"
1956 " fontinfo id %d, ambiguities '",
1957 ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
1958 ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
1959 for (UNICHAR_ID *AmbigsPointer = Ambigs;
1960 *AmbigsPointer >= 0; ++AmbigsPointer)
1961 tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1962 tprintf("'.\n");
1963 }
1964} /* MakePermanent */
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:74
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:85
int MakeTempProtoPerm(void *item1, void *item2)
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:110
UNICHAR_ID * Ambigs
Definition: adaptive.h:45
uint8_t NumPermConfigs
Definition: adaptive.h:56
CLASS_ID ClassId
Definition: adaptmatch.cpp:124
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:123
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)

◆ MasterMatcher()

void tesseract::Classify::MasterMatcher ( INT_TEMPLATES  templates,
int16_t  num_features,
const INT_FEATURE_STRUCT features,
const uint8_t *  norm_factors,
ADAPT_CLASS classes,
int  debug,
int  matcher_multiplier,
const TBOX blob_box,
const GenericVector< CP_RESULT_STRUCT > &  results,
ADAPT_RESULTS final_results 
)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

Definition at line 1088 of file adaptmatch.cpp.

1097 {
1098 int top = blob_box.top();
1099 int bottom = blob_box.bottom();
1100 UnicharRating int_result;
1101 for (int c = 0; c < results.size(); c++) {
1102 CLASS_ID class_id = results[c].Class;
1103 BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
1104 : AllProtosOn;
1105 BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
1106 : AllConfigsOn;
1107
1108 int_result.unichar_id = class_id;
1109 im_.Match(ClassForClassId(templates, class_id),
1110 protos, configs,
1111 num_features, features,
1112 &int_result, classify_adapt_feature_threshold, debug,
1114 bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1115 ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
1116 results[c].Rating,
1117 final_results->BlobLength,
1118 matcher_multiplier, norm_factors,
1119 &int_result, final_results);
1120 }
1121}
BIT_VECTOR PermProtos
Definition: adaptive.h:59
BIT_VECTOR PermConfigs
Definition: adaptive.h:60

◆ NewAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool  InitFromUnicharset)

Allocates memory for adapted templates.

Parameters
InitFromUnicharset: if true, add an empty class for each char in unicharset to the newly created templates
Returns
Ptr to new adapted templates.
Note
Globals: none

Definition at line 151 of file adaptive.cpp.

151 {
152 ADAPT_TEMPLATES Templates;
153
154 Templates = static_cast<ADAPT_TEMPLATES>(Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT)));
155
156 Templates->Templates = NewIntTemplates ();
157 Templates->NumPermClasses = 0;
158 Templates->NumNonEmptyClasses = 0;
159
160 /* Insert an empty class for each unichar id in unicharset */
161 for (int i = 0; i < MAX_NUM_CLASSES; i++) {
162 Templates->Class[i] = nullptr;
163 if (InitFromUnicharset && i < unicharset.size()) {
164 AddAdaptedClass(Templates, NewAdaptedClass(), i);
165 }
166 }
167
168 return (Templates);
169
170} /* NewAdaptedTemplates */
void AddAdaptedClass(ADAPT_TEMPLATES Templates, ADAPT_CLASS Class, CLASS_ID ClassId)
Definition: adaptive.cpp:45
ADAPT_CLASS NewAdaptedClass()
Definition: adaptive.cpp:102
void * Emalloc(int Size)
Definition: emalloc.cpp:31
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30

◆ NormalizeOutlines()

void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
float *  XScale,
float *  YScale 
)

This routine normalizes every outline in Outlines according to the currently selected normalization method. It also returns the scale factors that it used to do this scaling. The scale factors returned represent the x and y sizes in the normalized coordinate system that correspond to 1 pixel in the original coordinate system. Outlines are changed and XScale and YScale are updated.

Globals:

  • classify_norm_method method being used for normalization
  • classify_char_norm_range map radius of gyration to this value
    Parameters
    Outlines: list of outlines to be normalized
    XScale: x-direction scale factor used by routine
    YScale: y-direction scale factor used by routine

Definition at line 276 of file mfoutline.cpp.

278 {
279 MFOUTLINE Outline;
280
281 switch (classify_norm_method) {
282 case character:
283 ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
284 break;
285
286 case baseline:
287 iterate(Outlines) {
288 Outline = static_cast<MFOUTLINE>first_node(Outlines);
289 NormalizeOutline(Outline, 0.0);
290 }
291 *XScale = *YScale = MF_SCALE_FACTOR;
292 break;
293 }
294} /* NormalizeOutlines */
void NormalizeOutline(MFOUTLINE Outline, float XOrigin)
Definition: mfoutline.cpp:242

◆ PrintAdaptedTemplates()

void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters
File: open text file to print Templates to
Templates: adapted templates to print to File
Note
Globals: none

Definition at line 244 of file adaptive.cpp.

244 {
245 INT_CLASS IClass;
246 ADAPT_CLASS AClass;
247
248 fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
249 fprintf (File, "Num classes = %d; Num permanent classes = %d\n\n",
250 Templates->NumNonEmptyClasses, Templates->NumPermClasses);
251 fprintf (File, " Id NC NPC NP NPP\n");
252 fprintf (File, "------------------------\n");
253
254 for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
255 IClass = Templates->Templates->Class[i];
256 AClass = Templates->Class[i];
257 if (!IsEmptyAdaptedClass (AClass)) {
258 fprintf (File, "%5d %s %3d %3d %3d %3d\n",
260 IClass->NumConfigs, AClass->NumPermConfigs,
261 IClass->NumProtos,
262 IClass->NumProtos - count (AClass->TempProtos));
263 }
264 }
265 fprintf (File, "\n");
266
267} /* PrintAdaptedTemplates */
int count(LIST var_list)
Definition: oldlist.cpp:95

◆ PrintAdaptiveMatchResults()

void tesseract::Classify::PrintAdaptiveMatchResults ( const ADAPT_RESULTS results)

This routine prints the matches in results via tprintf.

Parameters
results: match results to print

Globals: none

Definition at line 2013 of file adaptmatch.cpp.

2013 {
2014 for (int i = 0; i < results.match.size(); ++i) {
2015 tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2016 results.match[i].Print();
2017 }
2018} /* PrintAdaptiveMatchResults */

◆ PruneClasses()

int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT int_templates,
int  num_features,
int  keep_this,
const INT_FEATURE_STRUCT features,
const uint8_t *  normalization_factors,
const uint16_t *  expected_num_features,
GenericVector< CP_RESULT_STRUCT > *  results 
)

Runs the class pruner from int_templates on the given features, returning the number of classes output in results.

Parameters
int_templates: Class pruner tables
num_features: Number of features in blob
features: Array of features
normalization_factors: Array of fudge factors from blob normalization process (by CLASS_INDEX)
expected_num_features: Array of expected number of features for each class (by CLASS_INDEX)
results: Sorted Array of pruned classes. Must be an array of size at least int_templates->NumClasses.
keep_this: (not described in the original documentation; the value is passed through to the pruner's PruneAndSort step)

Definition at line 452 of file intmatcher.cpp.

457 {
458 ClassPruner pruner(int_templates->NumClasses);
459 // Compute initial match scores for all classes.
460 pruner.ComputeScores(int_templates, num_features, features);
461 // Adjust match scores for number of expected features.
462 pruner.AdjustForExpectedNumFeatures(expected_num_features,
464 // Apply disabled classes in unicharset - only works without a shape_table.
465 if (shape_table_ == nullptr)
466 pruner.DisableDisabledClasses(unicharset);
467 // If fragments are disabled, remove them, also only without a shape table.
469 pruner.DisableFragments(unicharset);
470
471 // If we have good x-heights, apply the given normalization factors.
472 if (normalization_factors != nullptr) {
473 pruner.NormalizeForXheight(classify_class_pruner_multiplier,
474 normalization_factors);
475 } else {
476 pruner.NoNormalization();
477 }
478 // Do the actual pruning and sort the short-list.
479 pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
480 shape_table_ == nullptr, unicharset);
481
482 if (classify_debug_level > 2) {
483 pruner.DebugMatch(*this, int_templates, features);
484 }
485 if (classify_debug_level > 1) {
486 pruner.SummarizeResult(*this, int_templates, expected_num_features,
488 normalization_factors);
489 }
490 // Convert to the expected output format.
491 return pruner.SetupResults(results);
492}

◆ ReadAdaptedTemplates()

ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( TFile * fp)

Read a set of adapted templates from file and return a ptr to the templates.

Parameters
fp: open file to read adapted templates from (the contents are read as binary data via FRead)
Returns
Ptr to adapted templates read from file.
Note
Globals: none

Definition at line 332 of file adaptive.cpp.

332 {
333 ADAPT_TEMPLATES Templates;
334
335 /* first read the high level adaptive template struct */
336 Templates = static_cast<ADAPT_TEMPLATES>(Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT)));
337 fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
338
339 /* then read in the basic integer templates */
340 Templates->Templates = ReadIntTemplates(fp);
341
342 /* then read in the adaptive info for each class */
343 for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
344 Templates->Class[i] = ReadAdaptedClass(fp);
345 }
346 return (Templates);
347
348} /* ReadAdaptedTemplates */
ADAPT_CLASS ReadAdaptedClass(TFile *fp)
Definition: adaptive.cpp:281
int FRead(void *buffer, size_t size, int count)
Definition: serialis.cpp:271

◆ ReadIntTemplates()

INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( TFile * fp)

This routine reads a set of integer templates from fp. The file must already be open and must be in the correct binary format.

Parameters
fp: open file to read templates from
Returns
Pointer to integer templates read from fp.
Note
Globals: none

Definition at line 718 of file intproto.cpp.

718 {
719 int i, j, w, x, y, z;
720 int unicharset_size;
721 int version_id = 0;
722 INT_TEMPLATES Templates;
723 CLASS_PRUNER_STRUCT* Pruner;
724 INT_CLASS Class;
725 uint8_t *Lengths;
726 PROTO_SET ProtoSet;
727
728 /* variables for conversion from older inttemp formats */
729 int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
730 CLASS_ID class_id, max_class_id;
731 auto *IndexFor = new int16_t[MAX_NUM_CLASSES];
732 auto *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
733 auto **TempClassPruner =
735 uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
736 (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
737 uint32_t Mask, NewMask, ClassBits;
738 int MaxNumConfigs = MAX_NUM_CONFIGS;
739 int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
740
741 /* first read the high level template struct */
742 Templates = NewIntTemplates();
743 // Read Templates in parts for 64 bit compatibility.
744 if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1)
745 tprintf("Bad read of inttemp!\n");
746 if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
747 1) != 1 ||
748 fp->FReadEndian(&Templates->NumClassPruners,
749 sizeof(Templates->NumClassPruners), 1) != 1)
750 tprintf("Bad read of inttemp!\n");
751 if (Templates->NumClasses < 0) {
752 // This file has a version id!
753 version_id = -Templates->NumClasses;
754 if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
755 1) != 1)
756 tprintf("Bad read of inttemp!\n");
757 }
758
759 if (version_id < 3) {
760 MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
761 WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
762 }
763
764 if (version_id < 2) {
765 if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size) !=
766 unicharset_size) {
767 tprintf("Bad read of inttemp!\n");
768 }
769 if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
770 Templates->NumClasses) != Templates->NumClasses) {
771 tprintf("Bad read of inttemp!\n");
772 }
773 }
774
775 /* then read in the class pruners */
776 const int kNumBuckets =
778 for (i = 0; i < Templates->NumClassPruners; i++) {
779 Pruner = new CLASS_PRUNER_STRUCT;
780 if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) !=
781 kNumBuckets) {
782 tprintf("Bad read of inttemp!\n");
783 }
784 if (version_id < 2) {
785 TempClassPruner[i] = Pruner;
786 } else {
787 Templates->ClassPruners[i] = Pruner;
788 }
789 }
790
791 /* fix class pruners if they came from an old version of inttemp */
792 if (version_id < 2) {
793 // Allocate enough class pruners to cover all the class ids.
794 max_class_id = 0;
795 for (i = 0; i < Templates->NumClasses; i++)
796 if (ClassIdFor[i] > max_class_id)
797 max_class_id = ClassIdFor[i];
798 for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
799 Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
800 memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
801 }
802 // Convert class pruners from the old format (indexed by class index)
803 // to the new format (indexed by class id).
804 last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
805 for (i = 0; i < Templates->NumClassPruners; i++) {
806 for (x = 0; x < NUM_CP_BUCKETS; x++)
807 for (y = 0; y < NUM_CP_BUCKETS; y++)
808 for (z = 0; z < NUM_CP_BUCKETS; z++)
809 for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
810 if (TempClassPruner[i]->p[x][y][z][w] == 0)
811 continue;
812 for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
813 bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
814 if (bit_number > last_cp_bit_number)
815 break; // the rest of the bits in this word are not used
816 class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
817 // Single out NUM_BITS_PER_CLASS bits relating to class_id.
818 Mask = SetBitsForMask << b;
819 ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
820 // Move these bits to the new position in which they should
821 // appear (indexed corresponding to the class_id).
822 new_i = CPrunerIdFor(class_id);
823 new_w = CPrunerWordIndexFor(class_id);
824 new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
825 if (new_b > b) {
826 ClassBits <<= (new_b - b);
827 } else {
828 ClassBits >>= (b - new_b);
829 }
830 // Copy bits relating to class_id to the correct position
831 // in Templates->ClassPruner.
832 NewMask = SetBitsForMask << new_b;
833 Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
834 Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
835 }
836 }
837 }
838 for (i = 0; i < Templates->NumClassPruners; i++) {
839 delete TempClassPruner[i];
840 }
841 }
842
843 /* then read in each class */
844 for (i = 0; i < Templates->NumClasses; i++) {
845 /* first read in the high level struct for the class */
846 Class = static_cast<INT_CLASS>(Emalloc (sizeof (INT_CLASS_STRUCT)));
847 if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
848 fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
849 fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
850 tprintf("Bad read of inttemp!\n");
851 if (version_id == 0) {
852 // Only version 0 writes 5 pointless pointers to the file.
853 for (j = 0; j < 5; ++j) {
854 int32_t junk;
855 if (fp->FRead(&junk, sizeof(junk), 1) != 1)
856 tprintf("Bad read of inttemp!\n");
857 }
858 }
859 int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
860 ASSERT_HOST(num_configs <= MaxNumConfigs);
861 if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) !=
862 num_configs) {
863 tprintf("Bad read of inttemp!\n");
864 }
865 if (version_id < 2) {
866 ClassForClassId (Templates, ClassIdFor[i]) = Class;
867 } else {
868 ClassForClassId (Templates, i) = Class;
869 }
870
871 /* then read in the proto lengths */
872 Lengths = nullptr;
873 if (MaxNumIntProtosIn (Class) > 0) {
874 Lengths = static_cast<uint8_t *>(Emalloc(sizeof(uint8_t) * MaxNumIntProtosIn(Class)));
875 if (fp->FRead(Lengths, sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=
876 MaxNumIntProtosIn(Class))
877 tprintf("Bad read of inttemp!\n");
878 }
879 Class->ProtoLengths = Lengths;
880
881 /* then read in the proto sets */
882 for (j = 0; j < Class->NumProtoSets; j++) {
883 ProtoSet = static_cast<PROTO_SET>(Emalloc(sizeof(PROTO_SET_STRUCT)));
885 if (fp->FReadEndian(&ProtoSet->ProtoPruner,
886 sizeof(ProtoSet->ProtoPruner[0][0][0]),
887 num_buckets) != num_buckets)
888 tprintf("Bad read of inttemp!\n");
889 for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
890 if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
891 1) != 1 ||
892 fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
893 1) != 1 ||
894 fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
895 1) != 1 ||
896 fp->FRead(&ProtoSet->Protos[x].Angle,
897 sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
898 tprintf("Bad read of inttemp!\n");
899 if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
900 sizeof(ProtoSet->Protos[x].Configs[0]),
901 WerdsPerConfigVec) != WerdsPerConfigVec)
902 cprintf("Bad read of inttemp!\n");
903 }
904 Class->ProtoSets[j] = ProtoSet;
905 }
906 if (version_id < 4) {
907 Class->font_set_id = -1;
908 } else {
909 fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);
910 }
911 }
912
913 if (version_id < 2) {
914 /* add an empty nullptr class with class id 0 */
915 assert(UnusedClassIdIn (Templates, 0));
916 ClassForClassId (Templates, 0) = NewIntClass (1, 1);
917 ClassForClassId (Templates, 0)->font_set_id = -1;
918 Templates->NumClasses++;
919 /* make sure the classes are contiguous */
920 for (i = 0; i < MAX_NUM_CLASSES; i++) {
921 if (i < Templates->NumClasses) {
922 if (ClassForClassId (Templates, i) == nullptr) {
923 fprintf(stderr, "Non-contiguous class ids in inttemp\n");
924 exit(1);
925 }
926 } else {
927 if (ClassForClassId (Templates, i) != nullptr) {
928 fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
929 i, Templates->NumClasses);
930 exit(1);
931 }
932 }
933 }
934 }
935 if (version_id >= 4) {
937 if (version_id >= 5) {
938 this->fontinfo_table_.read(fp,
940 }
942 }
943
944 // Clean up.
945 delete[] IndexFor;
946 delete[] ClassIdFor;
947 delete[] TempClassPruner;
948
949 return (Templates);
950} /* ReadIntTemplates */
#define OLD_MAX_NUM_CONFIGS
Definition: intproto.cpp:108
#define OLD_WERDS_PER_CONFIG_VEC
Definition: intproto.cpp:109
#define BITS_PER_CP_VECTOR
Definition: intproto.h:59
#define MaxNumIntProtosIn(C)
Definition: intproto.h:165
#define NUM_PP_PARAMS
Definition: intproto.h:51
#define WERDS_PER_PP_VECTOR
Definition: intproto.h:63
#define BITS_PER_WERD
Definition: intproto.h:45
#define WERDS_PER_CONFIG_VEC
Definition: intproto.h:68
#define CPrunerWordIndexFor(c)
Definition: intproto.h:182
#define CPrunerIdFor(c)
Definition: intproto.h:180
#define CPrunerBitIndexFor(c)
Definition: intproto.h:183
#define NUM_CP_BUCKETS
Definition: intproto.h:53
#define MAX_NUM_CLASS_PRUNERS
Definition: intproto.h:60
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62
#define PROTOS_PER_PROTO_SET
Definition: intproto.h:49
#define NUM_PP_BUCKETS
Definition: intproto.h:52
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
bool read_set(TFile *f, FontSet *fs)
Definition: fontinfo.cpp:226
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:153
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:170
uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:78
uint32_t Configs[WERDS_PER_CONFIG_VEC]
Definition: intproto.h:86
PROTO_PRUNER ProtoPruner
Definition: intproto.h:96
INT_PROTO_STRUCT Protos[PROTOS_PER_PROTO_SET]
Definition: intproto.h:97
uint8_t NumProtoSets
Definition: intproto.h:107
PROTO_SET ProtoSets[MAX_NUM_PROTO_SETS]
Definition: intproto.h:109
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:111
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122

◆ ReadNewCutoffs()

void tesseract::Classify::ReadNewCutoffs ( TFile * fp,
uint16_t *  Cutoffs 
)

Reads all of the class-id/cutoff pairs from fp and inserts them into the Cutoffs array. Cutoffs are indexed in the array by class id. Unused entries in the array are set to an arbitrarily high cutoff value (MAX_CUTOFF). If a shape table is loaded, the shape-table cutoffs are deserialized from fp first.

Parameters
fp: file containing cutoff definitions
Cutoffs: array to put cutoffs into

Definition at line 41 of file cutoffs.cpp.

41 {
42 int Cutoff;
43
44 if (shape_table_ != nullptr) {
45 if (!shapetable_cutoffs_.DeSerialize(fp)) {
46 tprintf("Error during read of shapetable pffmtable!\n");
47 }
48 }
49 for (int i = 0; i < MAX_NUM_CLASSES; i++)
50 Cutoffs[i] = MAX_CUTOFF;
51
52 const int kMaxLineSize = 100;
53 char line[kMaxLineSize];
54 while (fp->FGets(line, kMaxLineSize) != nullptr) {
55 std::string Class;
56 CLASS_ID ClassId;
57 std::istringstream stream(line);
58 stream >> Class >> Cutoff;
59 if (stream.fail()) {
60 break;
61 }
62 if (Class.compare("NULL") == 0) {
63 ClassId = unicharset.unichar_to_id(" ");
64 } else {
65 ClassId = unicharset.unichar_to_id(Class.c_str());
66 }
67 ASSERT_HOST(ClassId >= 0 && ClassId < MAX_NUM_CLASSES);
68 Cutoffs[ClassId] = Cutoff;
69 }
70}
#define MAX_CUTOFF
Definition: cutoffs.cpp:30
bool DeSerialize(bool swap, FILE *fp)

◆ ReadNormProtos()

NORM_PROTOS * tesseract::Classify::ReadNormProtos ( TFile * fp)

This routine allocates a new data structure to hold a set of character normalization protos. It then fills in the data structure by reading from the specified file fp.

Parameters
fp: open text file to read normalization protos from
Globals: none
Returns
Character normalization protos.

Definition at line 190 of file normmatch.cpp.

190 {
192 int i;
193 char unichar[2 * UNICHAR_LEN + 1];
194 UNICHAR_ID unichar_id;
195 LIST Protos;
196 int NumProtos;
197
198 /* allocate and initialization data structure */
199 NormProtos = static_cast<NORM_PROTOS *>(Emalloc (sizeof (NORM_PROTOS)));
201 NormProtos->Protos = static_cast<LIST *>(Emalloc (NormProtos->NumProtos * sizeof(LIST)));
202 for (i = 0; i < NormProtos->NumProtos; i++)
204
205 /* read file header and save in data structure */
208
209 /* read protos for each class into a separate list */
210 const int kMaxLineSize = 100;
211 char line[kMaxLineSize];
212 while (fp->FGets(line, kMaxLineSize) != nullptr) {
213 std::istringstream stream(line);
214 stream >> unichar >> NumProtos;
215 if (stream.fail()) {
216 continue;
217 }
218 if (unicharset.contains_unichar(unichar)) {
219 unichar_id = unicharset.unichar_to_id(unichar);
220 Protos = NormProtos->Protos[unichar_id];
221 for (i = 0; i < NumProtos; i++)
222 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
223 NormProtos->Protos[unichar_id] = Protos;
224 } else {
225 tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
226 unichar);
227 for (i = 0; i < NumProtos; i++)
229 }
230 }
231 return (NormProtos);
232} /* ReadNormProtos */
#define UNICHAR_LEN
Definition: unichar.h:30
void FreePrototype(void *arg)
Definition: cluster.cpp:549
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:120
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:140
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:176
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:227
#define NIL_LIST
Definition: oldlist.h:76

◆ RefreshDebugWindow()

void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX wbox 
)

Definition at line 226 of file adaptmatch.cpp.

227 {
228 #ifndef GRAPHICS_DISABLED
229 const int kSampleSpaceWidth = 500;
230 if (*win == nullptr) {
231 *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
232 kSampleSpaceWidth * 2, 200, true);
233 }
234 (*win)->Clear();
235 (*win)->Pen(64, 64, 64);
236 (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
237 kSampleSpaceWidth, kBlnBaselineOffset);
238 (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
239 kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
240 (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
241 wbox.right(), wbox.bottom());
242 #endif // GRAPHICS_DISABLED
243}
int16_t left() const
Definition: rect.h:72
int16_t right() const
Definition: rect.h:79

◆ RemoveBadMatches()

void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS * Results)

This routine steps through each matching class in Results and removes it from the match list if its rating is below the best rating minus a pad (matcher_bad_match_pad). In other words, all good matches get moved to the front of the match array, which is then truncated to the good matches.

Parameters
Results: contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"

Definition at line 2033 of file adaptmatch.cpp.

2033 {
2034 int Next, NextGood;
2035 float BadMatchThreshold;
2036 static const char* romans = "i v x I V X";
2037 BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2038
2040 UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2041 unicharset.unichar_to_id("1") : -1;
2042 UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2043 unicharset.unichar_to_id("0") : -1;
2044 float scored_one = ScoredUnichar(unichar_id_one, *Results);
2045 float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2046
2047 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2048 const UnicharRating& match = Results->match[Next];
2049 if (match.rating >= BadMatchThreshold) {
2050 if (!unicharset.get_isalpha(match.unichar_id) ||
2051 strstr(romans,
2052 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2053 } else if (unicharset.eq(match.unichar_id, "l") &&
2054 scored_one < BadMatchThreshold) {
2055 Results->match[Next].unichar_id = unichar_id_one;
2056 } else if (unicharset.eq(match.unichar_id, "O") &&
2057 scored_zero < BadMatchThreshold) {
2058 Results->match[Next].unichar_id = unichar_id_zero;
2059 } else {
2060 Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2061 }
2062 if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2063 if (NextGood == Next) {
2064 ++NextGood;
2065 } else {
2066 Results->match[NextGood++] = Results->match[Next];
2067 }
2068 }
2069 }
2070 }
2071 } else {
2072 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2073 if (Results->match[Next].rating >= BadMatchThreshold) {
2074 if (NextGood == Next) {
2075 ++NextGood;
2076 } else {
2077 Results->match[NextGood++] = Results->match[Next];
2078 }
2079 }
2080 }
2081 }
2082 Results->match.truncate(NextGood);
2083} /* RemoveBadMatches */
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687

◆ RemoveExtraPuncs()

void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS * Results)

This routine discards extra digits or punctuation from the results. We keep only the top 2 punctuation answers and the top 1 digit answer if present.

Parameters
Results: contains matches to be filtered

Definition at line 2093 of file adaptmatch.cpp.

2093 {
2094 int Next, NextGood;
2095 int punc_count; /*no of garbage characters */
2096 int digit_count;
2097 /*garbage characters */
2098 static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2099 static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2100
2101 punc_count = 0;
2102 digit_count = 0;
2103 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2104 const UnicharRating& match = Results->match[Next];
2105 bool keep = true;
2106 if (strstr(punc_chars,
2107 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2108 if (punc_count >= 2)
2109 keep = false;
2110 punc_count++;
2111 } else {
2112 if (strstr(digit_chars,
2113 unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2114 if (digit_count >= 1)
2115 keep = false;
2116 digit_count++;
2117 }
2118 }
2119 if (keep) {
2120 if (NextGood == Next) {
2121 ++NextGood;
2122 } else {
2123 Results->match[NextGood++] = match;
2124 }
2125 }
2126 }
2127 Results->match.truncate(NextGood);
2128} /* RemoveExtraPuncs */

◆ ResetAdaptiveClassifierInternal()

void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 598 of file adaptmatch.cpp.

598 {
600 tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
601 NumAdaptationsFailed);
602 }
605 if (BackupAdaptedTemplates != nullptr)
607 BackupAdaptedTemplates = nullptr;
608 NumAdaptationsFailed = 0;
609}

◆ SetAdaptiveThreshold()

void tesseract::Classify::SetAdaptiveThreshold ( float  Threshold)

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters
Threshold: threshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating

Definition at line 2141 of file adaptmatch.cpp.

2141 {
2142 Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2144 ClipToRange<int>(255 * Threshold, 0, 255));
2146 ClipToRange<int>(255 * Threshold, 0, 255));
2147} /* SetAdaptiveThreshold */

◆ SetStaticClassifier()

void tesseract::Classify::SetStaticClassifier ( ShapeClassifier static_classifier)

Definition at line 211 of file classify.cpp.

211 {
212 delete static_classifier_;
213 static_classifier_ = static_classifier;
214}

◆ SettupPass1()

void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note
this is somewhat redundant, it simply says that if learning is enabled then it will remain enabled on the first pass. If it is disabled, then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Definition at line 652 of file adaptmatch.cpp.

652 {
654
656
657} /* SettupPass1 */
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:364

◆ SettupPass2()

void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Definition at line 669 of file adaptmatch.cpp.

669 {
670 EnableLearning = false;
672
673} /* SettupPass2 */
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:368

◆ SetupBLCNDenorms()

void tesseract::Classify::SetupBLCNDenorms ( const TBLOB blob,
bool  nonlinear_norm,
DENORM bl_denorm,
DENORM cn_denorm,
INT_FX_RESULT_STRUCT fx_info 
)
static

Definition at line 129 of file intfx.cpp.

131 {
132 // Compute 1st and 2nd moments of the original outline.
133 FCOORD center, second_moments;
134 int length = blob.ComputeMoments(&center, &second_moments);
135 if (fx_info != nullptr) {
136 fx_info->Length = length;
137 fx_info->Rx = IntCastRounded(second_moments.y());
138 fx_info->Ry = IntCastRounded(second_moments.x());
139
140 fx_info->Xmean = IntCastRounded(center.x());
141 fx_info->Ymean = IntCastRounded(center.y());
142 }
143 // Setup the denorm for Baseline normalization.
144 bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f,
145 1.0f, 1.0f, 128.0f, 128.0f);
146 // Setup the denorm for character normalization.
147 if (nonlinear_norm) {
150 TBOX box;
151 blob.GetPreciseBoundingBox(&box);
152 box.pad(1, 1);
153 blob.GetEdgeCoords(box, &x_coords, &y_coords);
154 cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX,
155 0.0f, 0.0f, x_coords, y_coords);
156 } else {
157 cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(),
158 center.x(), center.y(),
159 51.2f / second_moments.x(),
160 51.2f / second_moments.y(),
161 128.0f, 128.0f);
162 }
163}
void GetEdgeCoords(const TBOX &box, GenericVector< GenericVector< int > > *x_coords, GenericVector< GenericVector< int > > *y_coords) const
Definition: blobs.cpp:557
int ComputeMoments(FCOORD *center, FCOORD *second_moments) const
Definition: blobs.cpp:522
void GetPreciseBoundingBox(TBOX *precise_box) const
Definition: blobs.cpp:541
void SetupNonLinear(const DENORM *predecessor, const TBOX &box, float target_width, float target_height, float final_xshift, float final_yshift, const GenericVector< GenericVector< int > > &x_coords, const GenericVector< GenericVector< int > > &y_coords)
Definition: normalis.cpp:268
void SetupNormalization(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift)
Definition: normalis.cpp:96
Definition: points.h:189
float y() const
Definition: points.h:210
float x() const
Definition: points.h:207
void pad(int xpad, int ypad)
Definition: rect.h:131
int16_t Xmean
Definition: intfx.h:37

◆ shape_table()

const ShapeTable * tesseract::Classify::shape_table ( ) const
inline

Definition at line 111 of file classify.h.

111 {
112 return shape_table_;
113 }

◆ ShapeIDToClassID()

int tesseract::Classify::ShapeIDToClassID ( int  shape_id) const

Definition at line 2220 of file adaptmatch.cpp.

2220 {
2221 for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2222 int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2223 ASSERT_HOST(font_set_id >= 0);
2224 const FontSet &fs = fontset_table_.get(font_set_id);
2225 for (int config = 0; config < fs.size; ++config) {
2226 if (fs.configs[config] == shape_id)
2227 return id;
2228 }
2229 }
2230 tprintf("Shape %d not found\n", shape_id);
2231 return -1;
2232}

◆ ShowBestMatchFor()

void tesseract::Classify::ShowBestMatchFor ( int  shape_id,
const INT_FEATURE_STRUCT features,
int  num_features 
)

This routine displays debug information for the best config of the given shape_id for the given set of features.

Parameters
shape_id: classifier id to work with
features: features of the unknown character
num_features: Number of features in the features array.

Definition at line 2159 of file adaptmatch.cpp.

2161 {
2162#ifndef GRAPHICS_DISABLED
2163 uint32_t config_mask;
2164 if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2165 tprintf("No built-in templates for class/shape %d\n", shape_id);
2166 return;
2167 }
2168 if (num_features <= 0) {
2169 tprintf("Illegal blob (char norm features)!\n");
2170 return;
2171 }
2172 UnicharRating cn_result;
2176 num_features, features, &cn_result,
2179 tprintf("\n");
2180 config_mask = 1 << cn_result.config;
2181
2182 tprintf("Static Shape ID: %d\n", shape_id);
2185 &config_mask, num_features, features, &cn_result,
2189#endif // GRAPHICS_DISABLED
2190} /* ShowBestMatchFor */

◆ ShowMatchDisplay()

void tesseract::Classify::ShowMatchDisplay ( )

This routine sends the shapes in the global display lists to the match debugger window.

Globals:

  • FeatureShapes display list containing feature matches
  • ProtoShapes display list containing proto matches

Definition at line 962 of file intproto.cpp.

962 {
964 if (ProtoDisplayWindow) {
965 ProtoDisplayWindow->Clear();
966 }
967 if (FeatureDisplayWindow) {
968 FeatureDisplayWindow->Clear();
969 }
971 static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
972 IntMatchWindow);
973 IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
975 if (ProtoDisplayWindow) {
976 ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
978 }
979 if (FeatureDisplayWindow) {
980 FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
982 }
983} /* ShowMatchDisplay */
void InitIntMatchWindowIfReqd()
Definition: intproto.cpp:1722
#define INT_MAX_Y
Definition: intproto.cpp:62
#define INT_MIN_Y
Definition: intproto.cpp:60
#define INT_MIN_X
Definition: intproto.cpp:59
#define INT_MAX_X
Definition: intproto.cpp:61
NORM_METHOD
Definition: mfoutline.h:63
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
void Clear()
Definition: scrollview.cpp:589
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757

◆ StartBackupAdaptiveClassifier()

void tesseract::Classify::StartBackupAdaptiveClassifier ( )

Definition at line 629 of file adaptmatch.cpp.

◆ SwitchAdaptiveClassifier()

void tesseract::Classify::SwitchAdaptiveClassifier ( )

Definition at line 613 of file adaptmatch.cpp.

613 {
614 if (BackupAdaptedTemplates == nullptr) {
616 return;
617 }
619 tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
620 NumAdaptationsFailed);
621 }
624 BackupAdaptedTemplates = nullptr;
625 NumAdaptationsFailed = 0;
626}
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598

◆ TempConfigReliable()

bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG config 
)

Definition at line 2236 of file adaptmatch.cpp.

2237 {
2239 tprintf("NumTimesSeen for config of %s is %d\n",
2240 getDict().getUnicharset().debug_str(class_id).string(),
2241 config->NumTimesSeen);
2242 }
2244 return true;
2246 return false;
2247 } else if (use_ambigs_for_adaption) {
2248 // Go through the ambigs vector and see whether we have already seen
2249 // enough times all the characters represented by the ambigs vector.
2250 const UnicharIdVector *ambigs =
2252 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2253 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2254 ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2255 assert(ambig_class != nullptr);
2256 if (ambig_class->NumPermConfigs == 0 &&
2257 ambig_class->MaxNumTimesSeen <
2260 tprintf("Ambig %s has not been seen enough times,"
2261 " not making config for %s permanent\n",
2262 getDict().getUnicharset().debug_str(
2263 (*ambigs)[ambig]).string(),
2264 getDict().getUnicharset().debug_str(class_id).string());
2265 }
2266 return false;
2267 }
2268 }
2269 }
2270 return true;
2271}
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:35
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:183
bool use_ambigs_for_adaption
Definition: ccutil.h:89
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108

◆ UpdateAmbigsGroup()

void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
TBLOB Blob 
)

Definition at line 2273 of file adaptmatch.cpp.

2273 {
2274 const UnicharIdVector *ambigs =
2276 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2278 tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2279 getDict().getUnicharset().debug_str(class_id).string(), class_id);
2280 }
2281 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2282 CLASS_ID ambig_class_id = (*ambigs)[ambig];
2283 const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2284 for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2285 if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2286 const TEMP_CONFIG config =
2287 TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2288 if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2290 tprintf("Making config %d of %s permanent\n", cfg,
2291 getDict().getUnicharset().debug_str(
2292 ambig_class_id).string());
2293 }
2294 MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2295 }
2296 }
2297 }
2298}
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:192

◆ WriteAdaptedTemplates()

void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine saves Templates to File in a binary format.

Parameters
File: open file to write Templates to (the contents are written in binary form)
Templates: set of adapted templates to write to File
Note
Globals: none

Definition at line 453 of file adaptive.cpp.

453 {
454 int i;
455
456 /* first write the high level adaptive template struct */
457 fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
458
459 /* then write out the basic integer templates */
460 WriteIntTemplates (File, Templates->Templates, unicharset);
461
462 /* then write out the adaptive info for each class */
463 for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
464 WriteAdaptedClass (File, Templates->Class[i],
465 Templates->Templates->Class[i]->NumConfigs);
466 }
467} /* WriteAdaptedTemplates */
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs)
Definition: adaptive.cpp:409
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1017

◆ WriteIntTemplates()

void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES  Templates,
const UNICHARSET & target_unicharset 
)

This routine writes Templates to File. The format is an efficient binary format. File must already be open for writing.

Parameters
File: open file to write templates to
Templates: templates to save into File
target_unicharset: the UNICHARSET to use

Definition at line 1017 of file intproto.cpp.

1018 {
1019 int i, j;
1020 INT_CLASS Class;
1021 int unicharset_size = target_unicharset.size();
1022 int version_id = -5; // When negated by the reader -1 becomes +1 etc.
1023
1024 if (Templates->NumClasses != unicharset_size) {
1025 cprintf("Warning: executing WriteIntTemplates() with %d classes in"
1026 " Templates, while target_unicharset size is %d\n",
1027 Templates->NumClasses, unicharset_size);
1028 }
1029
1030 /* first write the high level template struct */
1031 fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
1032 fwrite(&version_id, sizeof(version_id), 1, File);
1033 fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
1034 1, File);
1035 fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
1036
1037 /* then write out the class pruners */
1038 for (i = 0; i < Templates->NumClassPruners; i++)
1039 fwrite(Templates->ClassPruners[i],
1040 sizeof(CLASS_PRUNER_STRUCT), 1, File);
1041
1042 /* then write out each class */
1043 for (i = 0; i < Templates->NumClasses; i++) {
1044 Class = Templates->Class[i];
1045
1046 /* first write out the high level struct for the class */
1047 fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
1048 fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
1049 ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
1050 fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
1051 for (j = 0; j < Class->NumConfigs; ++j) {
1052 fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
1053 }
1054
1055 /* then write out the proto lengths */
1056 if (MaxNumIntProtosIn (Class) > 0) {
1057 fwrite(Class->ProtoLengths, sizeof(uint8_t),
1058 MaxNumIntProtosIn(Class), File);
1059 }
1060
1061 /* then write out the proto sets */
1062 for (j = 0; j < Class->NumProtoSets; j++)
1063 fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
1064
1065 /* then write the fonts info */
1066 fwrite(&Class->font_set_id, sizeof(int), 1, File);
1067 }
1068
1069 /* Write the fonts info tables */
1071 this->fontinfo_table_.write(File,
1074} /* WriteIntTemplates */
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:232
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:163
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:198

◆ WriteTRFile()

bool tesseract::Classify::WriteTRFile ( const STRING filename)

Definition at line 98 of file blobclass.cpp.

98 {
99 bool result = false;
100 STRING tr_filename = filename + ".tr";
101 FILE* fp = fopen(tr_filename.string(), "wb");
102 if (fp) {
103 result =
104 tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
105 fclose(fp);
106 }
107 tr_file_data_.truncate_at(0);
108 return result;
109}
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
void truncate_at(int32_t index)
Definition: strngs.cpp:265

Member Data Documentation

◆ AdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::AdaptedTemplates

Definition at line 515 of file classify.h.

◆ AllConfigsOff

BIT_VECTOR tesseract::Classify::AllConfigsOff

Definition at line 524 of file classify.h.

◆ AllConfigsOn

BIT_VECTOR tesseract::Classify::AllConfigsOn

Definition at line 523 of file classify.h.

◆ allow_blob_division

bool tesseract::Classify::allow_blob_division = true

"Use divisible blobs chopping"

Definition at line 423 of file classify.h.

◆ AllProtosOn

BIT_VECTOR tesseract::Classify::AllProtosOn

Definition at line 522 of file classify.h.

◆ BackupAdaptedTemplates

ADAPT_TEMPLATES tesseract::Classify::BackupAdaptedTemplates

Definition at line 519 of file classify.h.

◆ certainty_scale

double tesseract::Classify::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 473 of file classify.h.

◆ classify_adapt_feature_threshold

int tesseract::Classify::classify_adapt_feature_threshold = 230

"Threshold for good features during adaptive 0-255"

Definition at line 483 of file classify.h.

◆ classify_adapt_proto_threshold

int tesseract::Classify::classify_adapt_proto_threshold = 230

"Threshold for good protos during adaptive 0-255"

Definition at line 481 of file classify.h.

◆ classify_adapted_pruning_factor

double tesseract::Classify::classify_adapted_pruning_factor = 2.5

"Prune poor adapted results this much worse than best result"

Definition at line 477 of file classify.h.

◆ classify_adapted_pruning_threshold

double tesseract::Classify::classify_adapted_pruning_threshold = -1.0

"Threshold at which classify_adapted_pruning_factor starts"

Definition at line 479 of file classify.h.

◆ classify_bln_numeric_mode

bool tesseract::Classify::classify_bln_numeric_mode = 0

"Assume the input is numbers [0-9]."

Definition at line 508 of file classify.h.

◆ classify_char_norm_range

double tesseract::Classify::classify_char_norm_range = 0.2

"Character Normalization Range ..."

Definition at line 436 of file classify.h.

◆ classify_character_fragments_garbage_certainty_threshold

double tesseract::Classify::classify_character_fragments_garbage_certainty_threshold = -3.0

"Exclude fragments that do not match any whole character with at least this certainty"

Definition at line 489 of file classify.h.

◆ classify_class_pruner_multiplier

int tesseract::Classify::classify_class_pruner_multiplier = 15

"Class Pruner Multiplier 0-255: "

Definition at line 501 of file classify.h.

◆ classify_class_pruner_threshold

int tesseract::Classify::classify_class_pruner_threshold = 229

"Class Pruner Threshold 0-255"

Definition at line 499 of file classify.h.

◆ classify_cp_cutoff_strength

int tesseract::Classify::classify_cp_cutoff_strength = 7

"Class Pruner CutoffStrength: "

Definition at line 503 of file classify.h.

◆ classify_debug_character_fragments

bool tesseract::Classify::classify_debug_character_fragments = false

"Bring up graphical debugging windows for fragments training"

Definition at line 491 of file classify.h.

◆ classify_debug_level

int tesseract::Classify::classify_debug_level = 0

"Classify debug level"

Definition at line 430 of file classify.h.

◆ classify_enable_adaptive_debugger

bool tesseract::Classify::classify_enable_adaptive_debugger = 0

"Enable match debugger"

Definition at line 450 of file classify.h.

◆ classify_enable_adaptive_matcher

bool tesseract::Classify::classify_enable_adaptive_matcher = 1

"Enable adaptive classifier"

Definition at line 445 of file classify.h.

◆ classify_enable_learning

bool tesseract::Classify::classify_enable_learning = true

"Enable adaptive classifier"

Definition at line 429 of file classify.h.

◆ classify_integer_matcher_multiplier

int tesseract::Classify::classify_integer_matcher_multiplier = 10

"Integer Matcher Multiplier 0-255: "

Definition at line 505 of file classify.h.

◆ classify_learn_debug_str

char* tesseract::Classify::classify_learn_debug_str = ""

"Class str to debug learning"

Definition at line 495 of file classify.h.

◆ classify_learning_debug_level

int tesseract::Classify::classify_learning_debug_level = 0

"Learning Debug Level: "

Definition at line 455 of file classify.h.

◆ classify_max_certainty_margin

double tesseract::Classify::classify_max_certainty_margin = 5.5

"Veto difference between classifier certainties"

Definition at line 440 of file classify.h.

◆ classify_max_rating_ratio

double tesseract::Classify::classify_max_rating_ratio = 1.5

"Veto ratio between classifier ratings"

Definition at line 438 of file classify.h.

◆ classify_misfit_junk_penalty

double tesseract::Classify::classify_misfit_junk_penalty = 0.0

"Penalty to apply when a non-alnum is vertically out of its expected textline position"

Definition at line 471 of file classify.h.

◆ classify_nonlinear_norm

bool tesseract::Classify::classify_nonlinear_norm = 0

"Non-linear stroke-density normalization"

Definition at line 452 of file classify.h.

◆ classify_norm_method

int tesseract::Classify::classify_norm_method = character

"Normalization Method ..."

Definition at line 434 of file classify.h.

◆ classify_save_adapted_templates

bool tesseract::Classify::classify_save_adapted_templates = 0

"Save adapted templates to a file"

Definition at line 449 of file classify.h.

◆ classify_use_pre_adapted_templates

bool tesseract::Classify::classify_use_pre_adapted_templates = 0

"Use pre-adapted classifier templates"

Definition at line 447 of file classify.h.

◆ disable_character_fragments

bool tesseract::Classify::disable_character_fragments = true

"Do not include character fragments in the results of the classifier"

Definition at line 486 of file classify.h.

◆ EnableLearning

bool tesseract::Classify::EnableLearning

Definition at line 577 of file classify.h.

◆ feature_defs_

FEATURE_DEFS_STRUCT tesseract::Classify::feature_defs_
protected

Definition at line 541 of file classify.h.

◆ fontinfo_table_

UnicityTable<FontInfo> tesseract::Classify::fontinfo_table_

Definition at line 529 of file classify.h.

◆ fontset_table_

UnicityTable<FontSet> tesseract::Classify::fontset_table_

Definition at line 537 of file classify.h.

◆ im_

IntegerMatcher tesseract::Classify::im_
protected

Definition at line 540 of file classify.h.

◆ matcher_avg_noise_size

double tesseract::Classify::matcher_avg_noise_size = 12.0

"Avg. noise blob length: "

Definition at line 461 of file classify.h.

◆ matcher_bad_match_pad

double tesseract::Classify::matcher_bad_match_pad = 0.15

"Bad Match Pad (0-1)"

Definition at line 459 of file classify.h.

◆ matcher_clustering_max_angle_delta

double tesseract::Classify::matcher_clustering_max_angle_delta = 0.015

"Maximum angle delta for prototype clustering"

Definition at line 468 of file classify.h.

◆ matcher_debug_flags

int tesseract::Classify::matcher_debug_flags = 0

"Matcher Debug Flags"

Definition at line 454 of file classify.h.

◆ matcher_debug_level

int tesseract::Classify::matcher_debug_level = 0

"Matcher Debug Level"

Definition at line 453 of file classify.h.

◆ matcher_debug_separate_windows

bool tesseract::Classify::matcher_debug_separate_windows = false

"Use two different windows for debugging the matching: One for the protos and one for the features."

Definition at line 494 of file classify.h.

◆ matcher_good_threshold

double tesseract::Classify::matcher_good_threshold = 0.125

"Good Match (0-1)"

Definition at line 456 of file classify.h.

◆ matcher_min_examples_for_prototyping

int tesseract::Classify::matcher_min_examples_for_prototyping = 3

"Reliable Config Threshold"

Definition at line 464 of file classify.h.

◆ matcher_perfect_threshold

double tesseract::Classify::matcher_perfect_threshold = 0.02

"Perfect Match (0-1)"

Definition at line 458 of file classify.h.

◆ matcher_permanent_classes_min

int tesseract::Classify::matcher_permanent_classes_min = 1

"Min # of permanent classes"

Definition at line 462 of file classify.h.

◆ matcher_rating_margin

double tesseract::Classify::matcher_rating_margin = 0.1

"New template margin (0-1)"

Definition at line 460 of file classify.h.

◆ matcher_reliable_adaptive_result

double tesseract::Classify::matcher_reliable_adaptive_result = 0.0

"Great Match (0-1)"

Definition at line 457 of file classify.h.

◆ matcher_sufficient_examples_for_prototyping

int tesseract::Classify::matcher_sufficient_examples_for_prototyping = 5

"Enable adaption even if the ambiguities have not been seen"

Definition at line 466 of file classify.h.

◆ NormProtos

NORM_PROTOS* tesseract::Classify::NormProtos

Definition at line 527 of file classify.h.

◆ PreTrainedTemplates

INT_TEMPLATES tesseract::Classify::PreTrainedTemplates

Definition at line 514 of file classify.h.

◆ prioritize_division

bool tesseract::Classify::prioritize_division = false

"Prioritize blob division over chopping"

Definition at line 428 of file classify.h.

◆ rating_scale

double tesseract::Classify::rating_scale = 1.5

"Rating scaling factor"

Definition at line 472 of file classify.h.

◆ shape_table_

ShapeTable* tesseract::Classify::shape_table_
protected

Definition at line 546 of file classify.h.

◆ speckle_large_max_size

double tesseract::Classify::speckle_large_max_size = 0.30

"Max large speckle size"

Definition at line 509 of file classify.h.

◆ speckle_rating_penalty

double tesseract::Classify::speckle_rating_penalty = 10.0

"Penalty to add to worst rating for noise"

Definition at line 511 of file classify.h.

◆ TempProtoMask

BIT_VECTOR tesseract::Classify::TempProtoMask

Definition at line 525 of file classify.h.

◆ tess_bn_matching

bool tesseract::Classify::tess_bn_matching = 0

"Baseline Normalized Matching"

Definition at line 444 of file classify.h.

◆ tess_cn_matching

bool tesseract::Classify::tess_cn_matching = 0

"Character Normalized Matching"

Definition at line 443 of file classify.h.

◆ tessedit_class_miss_scale

double tesseract::Classify::tessedit_class_miss_scale = 0.00390625

"Scale factor for features not used"

Definition at line 475 of file classify.h.


The documentation for this class was generated from the following files: