tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
cc_recog

Recognize a word.

void cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post processing for the Wise Owl program.

void program_editdown (int32_t elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, PointHeap *points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to nullptr.

void new_min_point (EDGEPT *local_min, PointHeap *points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to nullptr.

void new_max_point (EDGEPT *local_max, PointHeap *points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
attempt_blob_chop

Try to split this blob after this one. Check to make sure that it was successful.

SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
improve_one_blob

Finds the best place to chop, based on the worst blob, fixpt, or next to a fragment, according to the input. Returns the SEAM corresponding to the chop point, if any is found, and the index in the ratings_matrix of the chopped blob. Note that blob_choices is just a copy of the pointers in the leading diagonal of the ratings MATRIX. Although the blob is chopped, the returned SEAM is yet to be inserted into word->seam_array and the resulting blobs are unclassified, so this function can be used by ApplyBox as well as during recognition.

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. The results are returned in the WERD_RES.

void chop_word_main (WERD_RES *word)
 
improve_by_chopping

Repeatedly chops the worst blob, classifying the new blobs, fixing up all the data, and incrementally runs the segmentation search until a good word is found, or no more chops can be found.

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = true
 
bool wordrec_enable_assoc = true
 
bool force_word_assoc = false
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
bool assume_fixed_pitch_char_segment = false
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_ambigs_for_adaption = false
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 192 of file wordrec.h.

Constructor & Destructor Documentation

◆ Wordrec()

tesseract::Wordrec::Wordrec ( )

Definition at line 47 of file wordrec.cpp.

47 :
48 // control parameters
50 "Merge the fragments in the ratings matrix and delete them"
51 " after merging", params()),
52 BOOL_MEMBER(wordrec_enable_assoc, true, "Associator Enable",
53 params()),
55 "force associator to run regardless of what enable_assoc is."
56 " This is used for CJK where component grouping is necessary.",
58 INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
59 params()),
60 double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
61 params()),
62 INT_MEMBER(chop_debug, 0, "Chop debug",
63 params()),
64 BOOL_MEMBER(chop_enable, 1, "Chop enable",
65 params()),
66 BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
67 params()),
68 INT_MEMBER(chop_split_length, 10000, "Split Length",
69 params()),
70 INT_MEMBER(chop_same_distance, 2, "Same distance",
71 params()),
72 INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
73 params()),
74 INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
75 params()),
76 BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
77 INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
78 params()),
79 INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
80 params()),
81 double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
82 params()),
83 double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
84 params()),
85 double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
86 params()),
87 INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
88 "above which we don't care that a chop is not near the center.",
89 params()),
90 double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
91 params()),
92 double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
93 params()),
94 double_MEMBER(chop_ok_split, 100.0, "OK split limit",
95 params()),
96 double_MEMBER(chop_good_split, 50.0, "Good split limit",
97 params()),
98 INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
99 params()),
101 "include fixed-pitch heuristics in char segmentation",
102 params()),
104 "Debug level for wordrec", params()),
106 "Max number of broken pieces to associate", params()),
108 "Only run OCR for words that had truth recorded in BlamerBundle",
109 params()),
111 "Print blamer debug messages", params()),
113 "Try to set the blame for errors", params()),
115 "SegSearch debug level", params()),
117 "Maximum number of pain points stored in the queue",
118 params()),
120 "Maximum number of pain point classifications per chunk that"
121 " did not result in finding a better word choice.",
122 params()),
124 "Maximum character width-to-height ratio", params()),
126 "Save alternative paths found during chopping"
127 " and segmentation search",
128 params()),
129 pass2_ok_split(0.0f) {
130 prev_word_best_choice_ = nullptr;
131 language_model_.reset(new LanguageModel(&get_fontinfo_table(),
132 &(getDict())));
133 fill_lattice_ = nullptr;
134}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
ParamsVectors * params()
Definition: ccutil.h:67
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
virtual Dict & getDict()
Definition: classify.h:107
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:480
int segsearch_max_pain_points
Definition: wordrec.h:235
bool merge_fragments_in_matrix
Definition: wordrec.h:197
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476
int wordrec_debug_level
Definition: wordrec.h:226
double chop_sharpness_knob
Definition: wordrec.h:219
int segsearch_debug_level
Definition: wordrec.h:233
int repair_unchopped_blobs
Definition: wordrec.h:202
bool wordrec_skip_no_truth_words
Definition: wordrec.h:230
bool force_word_assoc
Definition: wordrec.h:201
int chop_same_distance
Definition: wordrec.h:208
double tessedit_certainty_threshold
Definition: wordrec.h:203
bool wordrec_run_blamer
Definition: wordrec.h:232
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:225
PRIORITY pass2_ok_split
Definition: wordrec.h:472
bool chop_new_seam_pile
Definition: wordrec.h:211
bool wordrec_debug_blamer
Definition: wordrec.h:231
double chop_center_knob
Definition: wordrec.h:216
int segsearch_max_futile_classifications
Definition: wordrec.h:237
int wordrec_max_join_chunks
Definition: wordrec.h:228
bool chop_vertical_creep
Definition: wordrec.h:206
double segsearch_max_char_wh_ratio
Definition: wordrec.h:239
double chop_width_change_knob
Definition: wordrec.h:220
int chop_centered_maxwidth
Definition: wordrec.h:218
int chop_min_outline_area
Definition: wordrec.h:213
bool wordrec_enable_assoc
Definition: wordrec.h:198
double chop_ok_split
Definition: wordrec.h:221
bool save_alt_choices
Definition: wordrec.h:242
double chop_good_split
Definition: wordrec.h:222
double chop_overlap_knob
Definition: wordrec.h:215
int chop_min_outline_points
Definition: wordrec.h:209
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:471
double chop_split_dist_knob
Definition: wordrec.h:214
int chop_seam_pile_size
Definition: wordrec.h:210

◆ ~Wordrec()

tesseract::Wordrec::~Wordrec ( )
override default

Member Function Documentation

◆ add_point_to_list()

void tesseract::Wordrec::add_point_to_list ( PointHeap * point_heap,
EDGEPT * point 
)

Definition at line 76 of file chop.cpp.

76 {
77 if (point_heap->size() < MAX_NUM_POINTS - 2) {
78 PointPair pair(point_priority(point), point);
79 point_heap->Push(&pair);
80 }
81
82#ifndef GRAPHICS_DISABLED
83 if (chop_debug > 2)
84 mark_outline(point);
85#endif
86}
#define MAX_NUM_POINTS
Definition: chop.h:33
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:86
void Push(Pair *entry)
Definition: genericheap.h:95
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:66

◆ add_seam_to_queue()

void tesseract::Wordrec::add_seam_to_queue ( float  new_priority,
SEAM * new_seam,
SeamQueue * seams 
)

Definition at line 66 of file findseam.cpp.

67 {
68 if (new_seam == nullptr) return;
69 if (chop_debug) {
70 tprintf("Pushing new seam with priority %g :", new_priority);
71 new_seam->Print("seam: ");
72 }
73 if (seams->size() >= MAX_NUM_SEAMS) {
74 SeamPair old_pair(0, nullptr);
75 if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {
76 if (chop_debug) {
77 tprintf("Old seam staying with priority %g\n", old_pair.key());
78 }
79 delete new_seam;
80 seams->Push(&old_pair);
81 return;
82 } else if (chop_debug) {
83 tprintf("New seam with priority %g beats old worst seam with %g\n",
84 new_priority, old_pair.key());
85 }
86 }
87 SeamPair new_pair(new_priority, new_seam);
88 seams->Push(&new_pair);
89}
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
#define MAX_NUM_SEAMS
Definition: findseam.cpp:49
void Print(const char *label) const
Definition: seam.cpp:154
bool PopWorst(Pair *entry)
Definition: genericheap.h:140

◆ angle_change()

int tesseract::Wordrec::angle_change ( EDGEPT * point1,
EDGEPT * point2,
EDGEPT * point3 
)

Definition at line 100 of file chop.cpp.

100 {
101 VECTOR vector1;
102 VECTOR vector2;
103
104 int angle;
105
106 /* Compute angle */
107 vector1.x = point2->pos.x - point1->pos.x;
108 vector1.y = point2->pos.y - point1->pos.y;
109 vector2.x = point3->pos.x - point2->pos.x;
110 vector2.y = point3->pos.y - point2->pos.y;
111 /* Use cross product */
112 float length = std::sqrt(static_cast<float>(vector1.length()) * vector2.length());
113 if (static_cast<int>(length) == 0)
114 return (0);
115 angle = static_cast<int>(floor(asin(vector1.cross(vector2) /
116 length) / M_PI * 180.0 + 0.5));
117
118 /* Use dot product */
119 if (vector1.dot(vector2) < 0)
120 angle = 180 - angle;
121 /* Adjust angle */
122 if (angle > 180)
123 angle -= 360;
124 if (angle <= -180)
125 angle += 360;
126 return (angle);
127}
Definition: blobs.h:51
int16_t x
Definition: blobs.h:93
int16_t y
Definition: blobs.h:94
int dot(const TPOINT &other) const
Definition: blobs.h:84
int length() const
Definition: blobs.h:89
int cross(const TPOINT &other) const
Definition: blobs.h:79
TPOINT pos
Definition: blobs.h:186

◆ attempt_blob_chop()

SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD * word,
TBLOB * blob,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 211 of file chopper.cpp.

213 {
215 preserve_outline_tree (blob->outlines);
216 TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
217 // Insert it into the word.
218 word->blobs.insert(other_blob, blob_number + 1);
219
220 SEAM *seam = nullptr;
222 TPOINT location;
223 if (divisible_blob(blob, italic_blob, &location)) {
224 seam = new SEAM(0.0f, location);
225 }
226 }
227 if (seam == nullptr)
228 seam = pick_good_seam(blob);
229 if (chop_debug) {
230 if (seam != nullptr)
231 seam->Print("Good seam picked=");
232 else
233 tprintf("\n** no seam picked *** \n");
234 }
235 if (seam) {
236 seam->ApplySeam(italic_blob, blob, other_blob);
237 }
238
239 seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
240 seams, seam);
241 if (seam == nullptr) {
243 restore_outline_tree(blob->outlines);
245 // If the blob can simply be divided into outlines, then do that.
246 TPOINT location;
247 if (divisible_blob(blob, italic_blob, &location)) {
248 other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
249 word->blobs.insert(other_blob, blob_number + 1);
250 seam = new SEAM(0.0f, location);
251 seam->ApplySeam(italic_blob, blob, other_blob);
252 seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
253 seams, seam);
254 }
255 }
256 }
257 if (seam != nullptr) {
258 // Make sure this seam doesn't get chopped again.
259 seam->Finalize();
260 }
261 return seam;
262}
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:913
void insert(const T &t, int index)
Definition: blobs.h:284
TESSLINE * outlines
Definition: blobs.h:400
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:335
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
Definition: seam.h:38
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:118
void Finalize()
Definition: seam.h:110
bool allow_blob_division
Definition: classify.h:423
bool prioritize_division
Definition: classify.h:428
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:217

◆ call_matcher()

BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( TBLOB blob)

Definition at line 140 of file tface.cpp.

140 {
141 // Rotate the blob for classification if necessary.
142 TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded();
143 if (rotated_blob == nullptr) {
144 rotated_blob = tessblob;
145 }
146 auto *ratings = new BLOB_CHOICE_LIST(); // matcher result
147 AdaptiveClassifier(rotated_blob, ratings);
148 if (rotated_blob != tessblob) {
149 delete rotated_blob;
150 }
151 return ratings;
152}
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:346
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191

◆ CallFillLattice()

void tesseract::Wordrec::CallFillLattice ( const MATRIX ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)
inline

Definition at line 259 of file wordrec.h.

262 {
263 (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
264 }
UNICHARSET unicharset
Definition: ccutil.h:73

◆ cc_recog()

void tesseract::Wordrec::cc_recog ( WERD_RES word)

Definition at line 125 of file tface.cpp.

125 {
127 chop_word_main(word);
128 word->DebugWordChoices(getDict().stopper_debug_level >= 1,
129 getDict().word_to_debug.string());
131}
@ W_EOL
end of line
Definition: werd.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:88
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
bool StatesAllValid()
Definition: pageres.cpp:458
WERD * word
Definition: pageres.h:186
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:391

◆ choose_best_seam()

void tesseract::Wordrec::choose_best_seam ( SeamQueue seam_queue,
const SPLIT split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB blob,
SeamPile seam_pile 
)

Definition at line 105 of file findseam.cpp.

107 {
108 SEAM *seam;
109 char str[80];
110 float my_priority;
111 /* Add seam of split */
112 my_priority = priority;
113 if (split != nullptr) {
114 TPOINT split_point = split->point1->pos;
115 split_point += split->point2->pos;
116 split_point /= 2;
117 seam = new SEAM(my_priority, split_point, *split);
118 if (chop_debug > 1) seam->Print("Partial priority ");
119 add_seam_to_queue(my_priority, seam, seam_queue);
120
121 if (my_priority > chop_good_split)
122 return;
123 }
124
125 TBOX bbox = blob->bounding_box();
126 /* Queue loop */
127 while (!seam_queue->empty()) {
128 SeamPair seam_pair;
129 seam_queue->Pop(&seam_pair);
130 seam = seam_pair.extract_data();
131 /* Set full priority */
132 my_priority = seam->FullPriority(bbox.left(), bbox.right(),
135 if (chop_debug) {
136 sprintf (str, "Full my_priority %0.0f, ", my_priority);
137 seam->Print(str);
138 }
139
140 if ((*seam_result == nullptr || (*seam_result)->priority() > my_priority) &&
141 my_priority < chop_ok_split) {
142 /* No crossing */
143 if (seam->IsHealthy(*blob, chop_min_outline_points,
145 delete *seam_result;
146 *seam_result = new SEAM(*seam);
147 (*seam_result)->set_priority(my_priority);
148 } else {
149 delete seam;
150 seam = nullptr;
151 my_priority = BAD_PRIORITY;
152 }
153 }
154
155 if (my_priority < chop_good_split) {
156 delete seam;
157 return; /* Made good answer */
158 }
159
160 if (seam) {
161 /* Combine with others */
162 if (seam_pile->size() < chop_seam_pile_size) {
163 combine_seam(*seam_pile, seam, seam_queue);
164 SeamDecPair pair(seam_pair.key(), seam);
165 seam_pile->Push(&pair);
166 } else if (chop_new_seam_pile &&
167 seam_pile->size() == chop_seam_pile_size &&
168 seam_pile->PeekTop().key() > seam_pair.key()) {
169 combine_seam(*seam_pile, seam, seam_queue);
170 SeamDecPair pair;
171 seam_pile->Pop(&pair); // pop the worst.
172 // Replace the seam in pair (deleting the old one) with
173 // the new seam and score, then push back into the heap.
174 pair.set_key(seam_pair.key());
175 pair.set_data(seam);
176 seam_pile->Push(&pair);
177 } else {
178 delete seam;
179 }
180 }
181
182 my_priority = seam_queue->empty() ? NO_FULL_PRIORITY
183 : seam_queue->PeekTop().key();
184 if ((my_priority > chop_ok_split) ||
185 (my_priority > chop_good_split && split))
186 return;
187 }
188}
#define NO_FULL_PRIORITY
Definition: findseam.cpp:51
#define BAD_PRIORITY
Definition: findseam.cpp:53
TBOX bounding_box() const
Definition: blobs.cpp:468
Definition: rect.h:34
int16_t left() const
Definition: rect.h:72
int16_t right() const
Definition: rect.h:79
float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth, double center_knob, double width_change_knob) const
Definition: seam.cpp:239
bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const
Definition: seam.cpp:66
EDGEPT * point1
Definition: split.h:103
EDGEPT * point2
Definition: split.h:104
bool empty() const
Definition: genericheap.h:68
const Pair & PeekTop() const
Definition: genericheap.h:108
bool Pop(Pair *entry)
Definition: genericheap.h:118
void set_data(Data *new_data)
Definition: kdpair.h:126
const Key & key() const
Definition: kdpair.h:116
void set_key(const Key &new_key)
Definition: kdpair.h:119
Data * extract_data()
Definition: kdpair.h:131
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:66
void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
Definition: findseam.cpp:198

◆ chop_numbered_blob()

SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD word,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 265 of file chopper.cpp.

267 {
268 return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
269 italic_blob, seams);
270}
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:211

◆ chop_one_blob()

SEAM * tesseract::Wordrec::chop_one_blob ( const GenericVector< TBOX > &  boxes,
const GenericVector< BLOB_CHOICE * > &  blob_choices,
WERD_RES word_res,
int *  blob_number 
)

Definition at line 371 of file chopper.cpp.

374 {
376 return chop_overlapping_blob(boxes, true, word_res, blob_number);
377 } else {
378 return improve_one_blob(blob_choices, nullptr, false, true, word_res,
379 blob_number);
380 }
381}
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:273
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:327

◆ chop_overlapping_blob()

SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
bool  italic_blob,
WERD_RES word_res,
int *  blob_number 
)

Definition at line 273 of file chopper.cpp.

275 {
276 TWERD *word = word_res->chopped_word;
277 for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
278 TBLOB *blob = word->blobs[*blob_number];
279 TPOINT topleft, botright;
280 topleft.x = blob->bounding_box().left();
281 topleft.y = blob->bounding_box().top();
282 botright.x = blob->bounding_box().right();
283 botright.y = blob->bounding_box().bottom();
284
285 TPOINT original_topleft, original_botright;
286 word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
287 word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
288
289 TBOX original_box = TBOX(original_topleft.x, original_botright.y,
290 original_botright.x, original_topleft.y);
291
292 bool almost_equal_box = false;
293 int num_overlap = 0;
294 for (int i = 0; i < boxes.size(); i++) {
295 if (original_box.overlap_fraction(boxes[i]) > 0.125)
296 num_overlap++;
297 if (original_box.almost_equal(boxes[i], 3))
298 almost_equal_box = true;
299 }
300
301 TPOINT location;
302 if (divisible_blob(blob, italic_blob, &location) ||
303 (!almost_equal_box && num_overlap > 1)) {
304 SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
305 italic_blob, word_res->seam_array);
306 if (seam != nullptr)
307 return seam;
308 }
309 }
310
311 *blob_number = -1;
312 return nullptr;
313}
int size() const
Definition: genericvector.h:72
Definition: blobs.h:418
int NumBlobs() const
Definition: blobs.h:448
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:390
DENORM denorm
Definition: pageres.h:201
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
TWERD * chopped_word
Definition: pageres.h:212
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
int16_t top() const
Definition: rect.h:58
int16_t bottom() const
Definition: rect.h:65

◆ chop_word_main()

void tesseract::Wordrec::chop_word_main ( WERD_RES word)

Definition at line 391 of file chopper.cpp.

391 {
392 int num_blobs = word->chopped_word->NumBlobs();
393 if (word->ratings == nullptr) {
394 word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
395 }
396 if (word->ratings->get(0, 0) == nullptr) {
397 // Run initial classification.
398 for (int b = 0; b < num_blobs; ++b) {
399 BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
400 "Initial:", word->chopped_word,
401 word->blamer_bundle);
402 word->ratings->put(b, b, choices);
403 }
404 } else {
405 // Blobs have been pre-classified. Set matrix cell for all blob choices
406 for (int col = 0; col < word->ratings->dimension(); ++col) {
407 for (int row = col; row < word->ratings->dimension() &&
408 row < col + word->ratings->bandwidth(); ++row) {
409 BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
410 if (choices != nullptr) {
411 BLOB_CHOICE_IT bc_it(choices);
412 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
413 bc_it.data()->set_matrix_cell(col, row);
414 }
415 }
416 }
417 }
418 }
419
420 // Run Segmentation Search.
421 BestChoiceBundle best_choice_bundle(word->ratings->dimension());
422 SegSearch(word, &best_choice_bundle, word->blamer_bundle);
423
424 if (word->best_choice == nullptr) {
425 // SegSearch found no valid paths, so just use the leading diagonal.
427 }
428 word->RebuildBestState();
429 // If we finished without a hyphen at the end of the word, let the next word
430 // be found in the dictionary.
431 if (word->word->flag(W_EOL) &&
432 !getDict().has_hyphen_end(*word->best_choice)) {
434 }
435
436 if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
438 *word->uch_set, word->blamer_bundle);
439 }
440 if (wordrec_debug_level > 0) {
441 tprintf("Final Ratings Matrix:\n");
442 word->ratings->print(getDict().getUnicharset());
443 }
444 word->FilterWordChoices(getDict().stopper_debug_level);
445}
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
T get(ICOORD pos) const
Definition: matrix.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
int dimension() const
Definition: matrix.h:536
Definition: matrix.h:578
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
const UNICHARSET * uch_set
Definition: pageres.h:203
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
BlamerBundle * blamer_bundle
Definition: pageres.h:252
WERD_CHOICE * best_choice
Definition: pageres.h:241
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:898
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:513
void RebuildBestState()
Definition: pageres.cpp:808
MATRIX * ratings
Definition: pageres.h:237
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:259
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:42

◆ classify_blob()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB blob,
const char *  string,
C_COL  color,
BlamerBundle blamer_bundle 
)

Definition at line 54 of file wordclass.cpp.

56 {
57#ifndef GRAPHICS_DISABLED
59 display_blob(blob, color);
60#endif
61 // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
62 BLOB_CHOICE_LIST* choices = call_matcher(blob);
63 // If a blob with the same bounding box as one of the truth character
64 // bounding boxes is not classified as the corresponding truth character,
65 // blame the character classifier for the incorrect answer.
66 if (blamer_bundle != nullptr) {
67 blamer_bundle->BlameClassifier(getDict().getUnicharset(),
68 blob->bounding_box(),
69 *choices,
71 }
72 #ifndef GRAPHICS_DISABLED
73 if (classify_debug_level && string)
74 print_ratings_list(string, choices, getDict().getUnicharset());
75
78#endif
79
80 return choices;
81}
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
ScrollView * blob_window
Definition: render.cpp:33
bool wordrec_display_all_blobs
Definition: render.cpp:39
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:52
bool wordrec_blob_pause
Definition: render.cpp:41
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:265
BLOB_CHOICE_LIST * call_matcher(TBLOB *blob)
Definition: tface.cpp:140

◆ classify_piece()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( const GenericVector< SEAM * > &  seams,
int16_t  start,
int16_t  end,
const char *  description,
TWERD word,
BlamerBundle blamer_bundle 
)
virtual

Definition at line 50 of file pieces.cpp.

55 {
56 if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
57 BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
58 White, blamer_bundle);
59 // Set the matrix_cell_ entries in all the BLOB_CHOICES.
60 BLOB_CHOICE_IT bc_it(choices);
61 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
62 bc_it.data()->set_matrix_cell(start, end);
63 }
64
65 if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
66
67 return (choices);
68}
@ White
Definition: callcpp.h:29
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54

◆ combine_seam()

void tesseract::Wordrec::combine_seam ( const SeamPile seam_pile,
const SEAM seam,
SeamQueue seam_queue 
)

Definition at line 198 of file findseam.cpp.

199 {
200 for (int x = 0; x < seam_pile.size(); ++x) {
201 const SEAM *this_one = seam_pile.get(x).data();
202 if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {
203 SEAM *new_one = new SEAM(*seam);
204 new_one->CombineWith(*this_one);
205 if (chop_debug > 1) new_one->Print("Combo priority ");
206 add_seam_to_queue(new_one->priority(), new_one, seam_queue);
207 }
208 }
209}
#define SPLIT_CLOSENESS
Definition: findseam.cpp:47
float priority() const
Definition: seam.h:59
bool CombineableWith(const SEAM &other, int max_x_dist, float max_total_priority) const
Definition: seam.cpp:40
void CombineWith(const SEAM &other)
Definition: seam.cpp:54
const Pair & get(int index) const
Definition: genericheap.h:87

◆ dict_word()

int tesseract::Wordrec::dict_word ( const WERD_CHOICE word)

Definition at line 89 of file tface.cpp.

89 {
90 return getDict().valid_word(word);
91}
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778

◆ DoSegSearch()

void tesseract::Wordrec::DoSegSearch ( WERD_RES word_res)

Definition at line 36 of file segsearch.cpp.

36 {
37 BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
38 // Run Segmentation Search.
39 SegSearch(word_res, &best_choice_bundle, nullptr);
40}

◆ end_recog()

int tesseract::Wordrec::end_recog ( )

Definition at line 62 of file tface.cpp.

62 {
64
65 return (0);
66}
void program_editdown(int32_t elasped_time)
Definition: tface.cpp:75

◆ fill_filtered_fragment_list()

void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 99 of file pieces.cpp.

102 {
103 BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
104 BLOB_CHOICE_IT choices_it(choices);
105
106 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
107 choices_it.forward()) {
108 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
109 const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
110
111 if (frag != nullptr && frag->get_pos() == fragment_pos &&
112 frag->get_total() == num_frag_parts) {
113 // Recover the unichar_id of the unichar that this fragment is
114 // a part of
115 auto *b = new BLOB_CHOICE(*choices_it.data());
116 int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
117 b->set_unichar_id(original_unichar);
118 filtered_choices_it.add_to_end(b);
119 }
120 }
121
122 filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
123}
int UNICHAR_ID
Definition: unichar.h:34
int get_total() const
Definition: unicharset.h:72
const char * get_unichar() const
Definition: unicharset.h:70
int get_pos() const
Definition: unicharset.h:71
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210

◆ FillLattice()

void tesseract::Wordrec::FillLattice ( const MATRIX ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)

◆ get_fragment_lists()

void tesseract::Wordrec::get_fragment_lists ( int16_t  current_frag,
int16_t  current_row,
int16_t  start,
int16_t  num_frag_parts,
int16_t  num_blobs,
MATRIX ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 275 of file pieces.cpp.

278 {
279 if (current_frag == num_frag_parts) {
280 merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
281 choice_lists, ratings);
282 return;
283 }
284
285 for (int16_t x = current_row; x < num_blobs; x++) {
286 BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
287 if (choices == nullptr)
288 continue;
289
290 fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
291 &choice_lists[current_frag]);
292 if (!choice_lists[current_frag].empty()) {
293 get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
294 num_blobs, ratings, choice_lists);
295 choice_lists[current_frag].clear();
296 }
297 }
298}
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:275
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:99
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:132

◆ grade_sharpness()

PRIORITY tesseract::Wordrec::grade_sharpness ( SPLIT split)

Definition at line 69 of file gradechop.cpp.

69 {
70 PRIORITY grade;
71
72 grade = point_priority (split->point1) + point_priority (split->point2);
73
74 if (grade < -360.0)
75 grade = 0;
76 else
77 grade += 360.0;
78
79 grade *= chop_sharpness_knob; /* Values 0 to -360 */
80
81 return (grade);
82}
float PRIORITY
Definition: seam.h:36

◆ grade_split_length()

PRIORITY tesseract::Wordrec::grade_split_length ( SPLIT split)

Definition at line 46 of file gradechop.cpp.

46 {
47 PRIORITY grade;
48 float split_length;
49
50 split_length =
52
53 if (split_length <= 0)
54 grade = 0;
55 else
56 grade = sqrt (split_length) * chop_split_dist_knob;
57
58 return (std::max(0.0f, grade));
59}
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:122

◆ improve_by_chopping()

void tesseract::Wordrec::improve_by_chopping ( float  rating_cert_scale,
WERD_RES word,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle,
LMPainPoints pain_points,
GenericVector< SegSearchPending > *  pending 
)

Definition at line 454 of file chopper.cpp.

459 {
460 int blob_number;
461 do { // improvement loop.
462 // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
463 // one to chop.
464 GenericVector<BLOB_CHOICE*> blob_choices;
465 int num_blobs = word->ratings->dimension();
466 for (int i = 0; i < num_blobs; ++i) {
467 BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
468 if (choices == nullptr || choices->empty()) {
469 blob_choices.push_back(nullptr);
470 } else {
471 BLOB_CHOICE_IT bc_it(choices);
472 blob_choices.push_back(bc_it.data());
473 }
474 }
475 SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
476 false, false, word, &blob_number);
477 if (seam == nullptr) break;
478 // A chop has been made. We have to correct all the data structures to
479 // take into account the extra bottom-level blob.
480 // Put the seam into the seam_array and correct everything else on the
481 // word: ratings matrix (including matrix location in the BLOB_CHOICES),
482 // states in WERD_CHOICEs, and blob widths.
483 word->InsertSeam(blob_number, seam);
484 // Insert a new entry in the beam array.
485 best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
486 // Fixpts are outdated, but will get recalculated.
487 best_choice_bundle->fixpt.clear();
488 // Remap existing pain points.
489 pain_points->RemapForSplit(blob_number);
490 // Insert a new pending at the chop point.
491 pending->insert(SegSearchPending(), blob_number);
492
493 // Classify the two newly created blobs using ProcessSegSearchPainPoint,
494 // as that updates the pending correctly and adds new pain points.
495 MATRIX_COORD pain_point(blob_number, blob_number);
496 ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
497 pain_points, blamer_bundle);
498 pain_point.col = blob_number + 1;
499 pain_point.row = blob_number + 1;
500 ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
501 pain_points, blamer_bundle);
502 if (language_model_->language_model_ngram_on) {
503 // N-gram evaluation depends on the number of blobs in a chunk, so we
504 // have to re-evaluate everything in the word.
505 ResetNGramSearch(word, best_choice_bundle, pending);
506 blob_number = 0;
507 }
508 // Run language model incrementally. (Except with the n-gram model on.)
509 UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
510 word, pain_points, best_choice_bundle, blamer_bundle);
511 } while (!language_model_->AcceptableChoiceFound() &&
512 word->ratings->dimension() < kMaxNumChunks);
513
514 // If after running only the chopper best_choice is incorrect and no blame
515 // has been yet set, blame the classifier if best_choice is classifier's
516 // top choice and is a dictionary word (i.e. language model could not have
517 // helped). Otherwise blame the tradeoff between the classifier and
518 // the old language model (permuters).
519 if (word->blamer_bundle != nullptr &&
522 bool valid_permuter = word->best_choice != nullptr &&
525 getDict().getUnicharset(),
526 valid_permuter,
528 }
529}
@ IRR_CORRECT
Definition: blamer.h:53
int push_back(T object)
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:119
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:120
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:377
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
uint8_t permuter() const
Definition: ratngs.h:336
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:311
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:180
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:248

◆ improve_one_blob()

SEAM * tesseract::Wordrec::improve_one_blob ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
DANGERR fixpt,
bool  split_next_to_fragment,
bool  italic_blob,
WERD_RES word,
int *  blob_number 
)

Definition at line 327 of file chopper.cpp.

332 {
333 float rating_ceiling = FLT_MAX;
334 SEAM *seam = nullptr;
335 do {
336 *blob_number = select_blob_to_split_from_fixpt(fixpt);
337 if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
338 bool split_point_from_dict = (*blob_number != -1);
339 if (split_point_from_dict) {
340 fixpt->clear();
341 } else {
342 *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
343 split_next_to_fragment);
344 }
345 if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
346 if (*blob_number == -1)
347 return nullptr;
348
349 // TODO(rays) it may eventually help to allow italic_blob to be true,
350 seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
351 word->seam_array);
352 if (seam != nullptr)
353 return seam; // Success!
354 if (blob_choices[*blob_number] == nullptr)
355 return nullptr;
356 if (!split_point_from_dict) {
357 // We chopped the worst rated blob, try something else next time.
358 rating_ceiling = blob_choices[*blob_number]->rating();
359 }
360 } while (true);
361 return seam;
362}
int select_blob_to_split(const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:538
SEAM * chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:265
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:626

◆ InitBlamerForSegSearch()

void tesseract::Wordrec::InitBlamerForSegSearch ( WERD_RES word_res,
LMPainPoints pain_points,
BlamerBundle blamer_bundle,
STRING blamer_debug 
)
protected

Definition at line 328 of file segsearch.cpp.

331 {
332 pain_points->Clear(); // Clear pain points heap.
335 static_cast<double>(segsearch_max_char_wh_ratio), word_res);
336 blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
337 getDict().WildcardID(), wordrec_debug_blamer,
338 blamer_debug, pp_cb);
339 delete pp_cb;
340}
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:484
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)

◆ InitialSegSearch()

void tesseract::Wordrec::InitialSegSearch ( WERD_RES word_res,
LMPainPoints pain_points,
GenericVector< SegSearchPending > *  pending,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

Definition at line 136 of file segsearch.cpp.

139 {
140 if (segsearch_debug_level > 0) {
141 tprintf("Starting SegSearch on ratings matrix%s:\n",
142 wordrec_enable_assoc ? " (with assoc)" : "");
143 word_res->ratings->print(getDict().getUnicharset());
144 }
145
146 pain_points->GenerateInitial(word_res);
147
148 // Compute scaling factor that will help us recover blob outline length
149 // from classifier rating and certainty for the blob.
150 float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
151
154 segsearch_max_char_wh_ratio, rating_cert_scale);
155
156 // Initialize blamer-related information: map character boxes recorded in
157 // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
158 // ratings matrix. We expect this step to succeed, since when running the
159 // chopper we checked that the correct chops are present.
160 if (blamer_bundle != nullptr) {
161 blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
163 }
164
165 // pending[col] tells whether there is update work to do to combine
166 // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
167 // As the language model state is updated, pending entries are modified to
168 // minimize duplication of work. It is important that during the update the
169 // children are considered in the non-decreasing order of their column, since
170 // this guarantees that all the parents would be up to date before an update
171 // of a child is done.
172 pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
173
174 // Search the ratings matrix for the initial best path.
175 (*pending)[0].SetColumnClassified();
176 UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
177 pain_points, best_choice_bundle, blamer_bundle);
178}
void init_to_size(int size, const T &t)
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:415
double certainty_scale
Definition: dict.h:627

◆ is_inside_angle()

bool tesseract::Wordrec::is_inside_angle ( EDGEPT pt)

Definition at line 90 of file chop.cpp.

90 {
91 return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;
92}
EDGEPT * next
Definition: blobs.h:192
EDGEPT * prev
Definition: blobs.h:193
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:100

◆ merge_and_put_fragment_lists()

void tesseract::Wordrec::merge_and_put_fragment_lists ( int16_t  row,
int16_t  column,
int16_t  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX *  ratings 
)

Definition at line 132 of file pieces.cpp.

135 {
136 auto *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
137
138 for (int i = 0; i < num_frag_parts; i++) {
139 choice_lists_it[i].set_to_list(&choice_lists[i]);
140 choice_lists_it[i].mark_cycle_pt();
141 }
142
143 BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
144 if (merged_choice == nullptr)
145 merged_choice = new BLOB_CHOICE_LIST;
146
147 bool end_of_list = false;
148 BLOB_CHOICE_IT merged_choice_it(merged_choice);
149 while (!end_of_list) {
150 // Find the maximum unichar_id of the current entry the iterators
151 // are pointing at
152 UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
153 for (int i = 0; i < num_frag_parts; i++) {
154 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
155 if (max_unichar_id < unichar_id) {
156 max_unichar_id = unichar_id;
157 }
158 }
159
160 // Move each iterator forward until it reaches an entry that has a
161 // value greater than or equal to max_unichar_id
162 for (int i = 0; i < num_frag_parts; i++) {
163 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
164 while (!choice_lists_it[i].cycled_list() &&
165 unichar_id < max_unichar_id) {
166 choice_lists_it[i].forward();
167 unichar_id = choice_lists_it[i].data()->unichar_id();
168 }
169 if (choice_lists_it[i].cycled_list()) {
170 end_of_list = true;
171 break;
172 }
173 }
174
175 if (end_of_list)
176 break;
177
178 // Checks if the fragments are parts of the same character
179 UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
180 bool same_unichar = true;
181 for (int i = 1; i < num_frag_parts; i++) {
182 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
183 if (unichar_id != first_unichar_id) {
184 same_unichar = false;
185 break;
186 }
187 }
188
189 if (same_unichar) {
190 // Add the merged character to the result
191 UNICHAR_ID merged_unichar_id = first_unichar_id;
192 GenericVector<ScoredFont> merged_fonts =
193 choice_lists_it[0].data()->fonts();
194 float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
195 float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
196 float positive_yshift = 0, negative_yshift = 0;
197 int merged_script_id = choice_lists_it[0].data()->script_id();
198 BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
199
200 float merged_rating = 0, merged_certainty = 0;
201 for (int i = 0; i < num_frag_parts; i++) {
202 float rating = choice_lists_it[i].data()->rating();
203 float certainty = choice_lists_it[i].data()->certainty();
204
205 if (i == 0 || certainty < merged_certainty)
206 merged_certainty = certainty;
207 merged_rating += rating;
208
209 choice_lists_it[i].forward();
210 if (choice_lists_it[i].cycled_list())
211 end_of_list = true;
212 IntersectRange(choice_lists_it[i].data()->min_xheight(),
213 choice_lists_it[i].data()->max_xheight(),
214 &merged_min_xheight, &merged_max_xheight);
215 float yshift = choice_lists_it[i].data()->yshift();
216 if (yshift > positive_yshift) positive_yshift = yshift;
217 if (yshift < negative_yshift) negative_yshift = yshift;
218 // Use the min font rating over the parts.
219 // TODO(rays) font lists are unsorted. Need to be faster?
220 const GenericVector<ScoredFont>& frag_fonts =
221 choice_lists_it[i].data()->fonts();
222 for (int f = 0; f < frag_fonts.size(); ++f) {
223 int merged_f = 0;
224 for (merged_f = 0; merged_f < merged_fonts.size() &&
225 merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
226 ++merged_f) {}
227 if (merged_f == merged_fonts.size()) {
228 merged_fonts.push_back(frag_fonts[f]);
229 } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
230 merged_fonts[merged_f].score = frag_fonts[f].score;
231 }
232 }
233 }
234
235 float merged_yshift = positive_yshift != 0
236 ? (negative_yshift != 0 ? 0 : positive_yshift)
237 : negative_yshift;
238 auto* choice = new BLOB_CHOICE(merged_unichar_id,
239 merged_rating,
240 merged_certainty,
241 merged_script_id,
242 merged_min_xheight,
243 merged_max_xheight,
244 merged_yshift,
245 classifier);
246 choice->set_fonts(merged_fonts);
247 merged_choice_it.add_to_end(choice);
248 }
249 }
250
252 print_ratings_list("Merged Fragments", merged_choice,
253 unicharset);
254
255 if (merged_choice->empty())
256 delete merged_choice;
257 else
258 ratings->put(row, column, merged_choice);
259
260 delete [] choice_lists_it;
261}
BlobChoiceClassifier
Definition: ratngs.h:43
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:145

◆ merge_fragments()

void tesseract::Wordrec::merge_fragments ( MATRIX *  ratings,
int16_t  num_blobs 
)

Definition at line 307 of file pieces.cpp.

// Runs fragment merging over the ratings matrix and then strips the
// character fragments out of it.
//   ratings   - ratings matrix; read by get_fragment_lists and modified in
//               place when fragment choices are removed.
//   num_blobs - number of blobs; bounds both the start positions tried and
//               the (x, y) cells visited during clean-up.
307 {
// Scratch lists handed to get_fragment_lists, one per possible fragment part.
308 BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
// Try every starting blob with every fragment count from 2 up to kMaxChunks.
309 for (int16_t start = 0; start < num_blobs; start++) {
310 for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
311 frag_parts++) {
312 get_fragment_lists(0, start, start, frag_parts, num_blobs,
313 ratings, choice_lists);
314 }
315 }
316
317 // Delete fragments from the rating matrix
// Only cells with y >= x are visited.
318 for (int16_t x = 0; x < num_blobs; x++) {
319 for (int16_t y = x; y < num_blobs; y++) {
320 BLOB_CHOICE_LIST *choices = ratings->get(x, y);
321 if (choices != nullptr) {
322 BLOB_CHOICE_IT choices_it(choices);
323 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
324 choices_it.forward()) {
325 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
// A non-null CHAR_FRAGMENT means this choice is a character fragment.
326 const CHAR_FRAGMENT *frag =
327 unicharset.get_fragment(choice_unichar_id);
328 if (frag != nullptr)
329 delete choices_it.extract();
330 }
331 }
332 }
333 }
334}
static const int kMaxChunks
Definition: unicharset.h:55

◆ near_point()

bool tesseract::Wordrec::near_point ( EDGEPT *  point,
EDGEPT *  line_pt_0,
EDGEPT *  line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 40 of file outlines.cpp.

// Projects `point` perpendicularly onto the line through line_pt_0 and
// line_pt_1 and reports the nearest usable edge point through *near_pt.
//   Returns true  - the projection lies on the segment and differs from both
//                   end points; *near_pt is a new EDGEPT created with
//                   make_edgept at the projected position.
//   Returns false - otherwise; *near_pt is the result of
//                   closest(point, line_pt_0, line_pt_1).
// *near_pt is written on every path.
42 {
43 TPOINT p;
44
45 float slope;
46 float intercept;
47
48 float x0 = line_pt_0->pos.x;
49 float x1 = line_pt_1->pos.x;
50 float y0 = line_pt_0->pos.y;
51 float y1 = line_pt_1->pos.y;
52
53 if (x0 == x1) {
54 /* Handle vertical line */
// The perpendicular foot keeps the line's x and the point's y.
55 p.x = static_cast<int16_t>(x0);
56 p.y = point->pos.y;
57 }
58 else {
59 /* Slope and intercept */
60 slope = (y0 - y1) / (x0 - x1);
61 intercept = y1 - x1 * slope;
62
63 /* Find perpendicular */
// Foot of the perpendicular on y = slope * x + intercept; both coordinates
// are truncated to int16_t.
64 p.x = static_cast<int16_t>((point->pos.x + (point->pos.y - intercept) * slope) /
65 (slope * slope + 1));
66 p.y = static_cast<int16_t>(slope * p.x + intercept);
67 }
68
69 if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
70 (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
71 /* Intersection on line */
// make_edgept takes (x, y, next, prev): the new point gets line_pt_1 as next
// and line_pt_0 as prev.
72 *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
73 return true;
74 } else { /* Intersection not on line */
75 *near_pt = closest(point, line_pt_0, line_pt_1);
76 return false;
77 }
78}
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:139
#define same_point(p1, p2)
Definition: outlines.h:45
#define is_on_line(p, p0, p1)
Definition: outlines.h:116
#define closest(test_p, p1, p2)
Definition: outlines.h:67

◆ new_max_point()

void tesseract::Wordrec::new_max_point ( EDGEPT *  local_max,
PointHeap *  points 
)

Definition at line 256 of file chop.cpp.

// Adds local_max to the points heap when direction(local_max) is positive,
// or when it is zero and point_priority(local_max) is negative. In every
// other case the heap is left untouched.
256 {
257 int16_t dir;
258
259 dir = direction (local_max);
260
261 if (dir > 0) {
262 add_point_to_list(points, local_max);
263 return;
264 }
265
// Zero direction: accept the point only if its priority is negative.
266 if (dir == 0 && point_priority (local_max) < 0) {
267 add_point_to_list(points, local_max);
268 return;
269 }
270}
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:76

◆ new_min_point()

void tesseract::Wordrec::new_min_point ( EDGEPT *  local_min,
PointHeap *  points 
)

Definition at line 232 of file chop.cpp.

// Adds local_min to the points heap when direction(local_min) is negative,
// or when it is zero and point_priority(local_min) is negative. In every
// other case the heap is left untouched.
232 {
233 int16_t dir;
234
235 dir = direction (local_min);
236
237 if (dir < 0) {
238 add_point_to_list(points, local_min);
239 return;
240 }
241
// Zero direction: accept the point only if its priority is negative.
242 if (dir == 0 && point_priority (local_min) < 0) {
243 add_point_to_list(points, local_min);
244 return;
245 }
246}

◆ pick_close_point()

EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT *  critical_point,
EDGEPT *  vertical_point,
int *  best_dist 
)

Definition at line 135 of file chop.cpp.

137 {
138 EDGEPT *best_point = nullptr;
139 int this_distance;
140 int found_better;
141
142 do {
143 found_better = false;
144
145 this_distance = edgept_dist (critical_point, vertical_point);
146 if (this_distance <= *best_dist) {
147
148 if (!(same_point (critical_point->pos, vertical_point->pos) ||
149 same_point (critical_point->pos, vertical_point->next->pos) ||
150 (best_point && same_point (best_point->pos, vertical_point->pos)) ||
151 is_exterior_point (critical_point, vertical_point))) {
152 *best_dist = this_distance;
153 best_point = vertical_point;
155 found_better = true;
156 }
157 }
158 vertical_point = vertical_point->next;
159 }
160 while (found_better == true);
161
162 return (best_point);
163}
#define edgept_dist(p1, p2)
Definition: outlines.h:83
#define is_exterior_point(edge, point)
Definition: outlines.h:93
Definition: blobs.h:99

◆ pick_good_seam()

SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB *  blob)

Definition at line 217 of file findseam.cpp.

217 {
218 SeamPile seam_pile(chop_seam_pile_size);
219 EDGEPT *points[MAX_NUM_POINTS];
220 EDGEPT_CLIST new_points;
221 SEAM *seam = nullptr;
222 TESSLINE *outline;
223 int16_t num_points = 0;
224
225#ifndef GRAPHICS_DISABLED
226 if (chop_debug > 2)
227 wordrec_display_splits.set_value(true);
228
229 draw_blob_edges(blob);
230#endif
231
232 PointHeap point_heap(MAX_NUM_POINTS);
233 for (outline = blob->outlines; outline; outline = outline->next)
234 prioritize_points(outline, &point_heap);
235
236 while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {
237 points[num_points++] = point_heap.PeekTop().data;
238 point_heap.Pop(nullptr);
239 }
240
241 /* Initialize queue */
242 SeamQueue seam_queue(MAX_NUM_SEAMS);
243
244 try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);
245 try_vertical_splits(points, num_points, &new_points,
246 &seam_queue, &seam_pile, &seam, blob);
247
248 if (seam == nullptr) {
249 choose_best_seam(&seam_queue, nullptr, BAD_PRIORITY, &seam, blob, &seam_pile);
250 } else if (seam->priority() > chop_good_split) {
251 choose_best_seam(&seam_queue, nullptr, seam->priority(), &seam, blob,
252 &seam_pile);
253 }
254
255 EDGEPT_C_IT it(&new_points);
256 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
257 EDGEPT *inserted_point = it.data();
258 if (seam == nullptr || !seam->UsesPoint(inserted_point)) {
259 for (outline = blob->outlines; outline; outline = outline->next) {
260 if (outline->loop == inserted_point) {
261 outline->loop = outline->loop->next;
262 }
263 }
264 remove_edgept(inserted_point);
265 }
266 }
267
268 if (seam) {
269 if (seam->priority() > chop_ok_split) {
270 delete seam;
271 seam = nullptr;
272 }
273#ifndef GRAPHICS_DISABLED
274 else if (wordrec_display_splits) {
275 seam->Mark(edge_window);
276 if (chop_debug > 2) {
279 }
280 }
281#endif
282 }
283
284 if (chop_debug)
285 wordrec_display_splits.set_value(false);
286
287 return (seam);
288}
bool wordrec_display_splits
Definition: split.cpp:41
void remove_edgept(EDGEPT *point)
Definition: split.cpp:200
ScrollView * edge_window
Definition: plotedges.cpp:35
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:69
#define edge_window_wait()
Definition: plotedges.h:56
#define update_edge_window()
Definition: plotedges.h:44
EDGEPT * loop
Definition: blobs.h:280
TESSLINE * next
Definition: blobs.h:281
bool UsesPoint(const EDGEPT *point) const
Definition: seam.h:82
void Mark(ScrollView *window) const
Definition: seam.cpp:180
void prioritize_points(TESSLINE *outline, PointHeap *points)
Definition: chop.cpp:173
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:298
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:336
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:105

◆ point_priority()

PRIORITY tesseract::Wordrec::point_priority ( EDGEPT *  point)

Definition at line 66 of file chop.cpp.

// Returns the priority of an edge point: the value of angle_change for the
// point and its prev/next neighbours, cast to PRIORITY.
66 {
67 return static_cast<PRIORITY>(angle_change(point->prev, point, point->next));
68}

◆ prioritize_points()

void tesseract::Wordrec::prioritize_points ( TESSLINE *  outline,
PointHeap *  points 
)

Definition at line 173 of file chop.cpp.

// Walks once around the outline, starting and ending at outline->loop, and
// offers candidate points to the `points` heap.
// The sign of each point's vec.y drives a small state machine that tracks
// the current candidate local minimum and local maximum. Candidates are
// submitted through new_min_point / new_max_point, which decide whether the
// point is actually added. When no candidate of the relevant kind is
// pending, the current point is added directly if is_inside_angle accepts it.
173 {
174 EDGEPT *this_point;
175 EDGEPT *local_min = nullptr;
176 EDGEPT *local_max = nullptr;
177
// Both candidates start at the loop head; after the first iteration exactly
// one of them is non-null.
178 this_point = outline->loop;
179 local_min = this_point;
180 local_max = this_point;
181 do {
182 if (this_point->vec.y < 0) {
183 /* Look for minima */
184 if (local_max != nullptr)
185 new_max_point(local_max, points);
186 else if (is_inside_angle (this_point))
187 add_point_to_list(points, this_point);
188 local_max = nullptr;
189 local_min = this_point->next;
190 }
191 else if (this_point->vec.y > 0) {
192 /* Look for maxima */
193 if (local_min != nullptr)
194 new_min_point(local_min, points);
195 else if (is_inside_angle (this_point))
196 add_point_to_list(points, this_point);
197 local_min = nullptr;
198 local_max = this_point->next;
199 }
200 else {
201 /* Flat area */
// The pending candidate is submitted only when the step leading into it
// (its prev's vec.y) was not flat; the candidate then advances to the next
// point.
202 if (local_max != nullptr) {
203 if (local_max->prev->vec.y != 0) {
204 new_max_point(local_max, points);
205 }
206 local_max = this_point->next;
207 local_min = nullptr;
208 }
209 else {
// local_max is null here, so local_min is the non-null candidate.
210 if (local_min->prev->vec.y != 0) {
211 new_min_point(local_min, points);
212 }
213 local_min = this_point->next;
214 local_max = nullptr;
215 }
216 }
217
218 /* Next point */
219 this_point = this_point->next;
220 }
221 while (this_point != outline->loop);
222}
VECTOR vec
Definition: blobs.h:187
bool is_inside_angle(EDGEPT *pt)
Definition: chop.cpp:90
void new_min_point(EDGEPT *local_min, PointHeap *points)
Definition: chop.cpp:232
void new_max_point(EDGEPT *local_max, PointHeap *points)
Definition: chop.cpp:256

◆ ProcessSegSearchPainPoint()

void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD pain_point,
const char *  pain_point_type,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 248 of file segsearch.cpp.

// Classifies the blob range named by pain_point (columns col..row of the
// ratings matrix), stores the result in the matrix, and marks the blob as
// classified in pending[pain_point.col].
//   pain_point_priority / pain_point_type - used for the debug message;
//                      pain_point_type is also passed to classify_piece as
//                      the description.
//   pending          - per-column pending work; entry pain_point.col is updated.
//   word_res         - supplies the ratings matrix, seam array and chopped word.
//   pain_points      - must be non-null (asserted); receives new pain points
//                      for the neighbouring cells.
//   blamer_bundle    - forwarded to classify_piece.
252 {
253 if (segsearch_debug_level > 0) {
254 tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
255 pain_point_type, pain_point_priority,
256 pain_point.col, pain_point.row);
257 }
258 ASSERT_HOST(pain_points != nullptr);
259 MATRIX *ratings = word_res->ratings;
260 // Classify blob [pain_point.col pain_point.row]
// Widen the matrix band first if the cell is not valid for it, then insist
// that it is.
261 if (!pain_point.Valid(*ratings)) {
262 ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
263 }
264 ASSERT_HOST(pain_point.Valid(*ratings));
265 BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
266 pain_point.col, pain_point.row,
267 pain_point_type,
268 word_res->chopped_word,
269 blamer_bundle);
270 BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
271 if (lst == nullptr) {
272 ratings->put(pain_point.col, pain_point.row, classified);
273 } else {
274 // We can not delete old BLOB_CHOICEs, since they might contain
275 // ViterbiStateEntries that are parents of other "active" entries.
276 // Thus if the matrix cell already contains classifications we add
277 // the new ones to the beginning of the list.
278 BLOB_CHOICE_IT it(lst);
279 it.add_list_before(classified);
280 delete classified; // safe to delete, since empty after add_list_before()
281 classified = nullptr;
282 }
283
284 if (segsearch_debug_level > 0) {
285 print_ratings_list("Updated ratings matrix with a new entry:",
286 ratings->get(pain_point.col, pain_point.row),
287 getDict().getUnicharset());
288 ratings->print(getDict().getUnicharset());
289 }
290
291 // Insert initial "pain points" to join the newly classified blob
292 // with its left and right neighbors.
// NOTE(review): classified is nullptr whenever the cell already held a list
// (merge branch above), so neighbour pain points are generated only when the
// cell was previously empty - confirm this is intended.
293 if (classified != nullptr && !classified->empty()) {
294 if (pain_point.col > 0) {
295 pain_points->GeneratePainPoint(
296 pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
297 true, segsearch_max_char_wh_ratio, word_res);
298 }
299 if (pain_point.row + 1 < ratings->dimension()) {
300 pain_points->GeneratePainPoint(
301 pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
302 true, segsearch_max_char_wh_ratio, word_res);
303 }
304 }
305 (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
306}
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
bool Valid(const MATRIX &m) const
Definition: matrix.h:618

◆ program_editdown()

void tesseract::Wordrec::program_editdown ( int32_t  elasped_time)

Definition at line 75 of file tface.cpp.

75 {
76#ifndef DISABLED_LEGACY_ENGINE
78#endif // ndef DISABLED_LEGACY_ENGINE
79 getDict().End();
80}
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459
void End()
Definition: dict.cpp:372

◆ program_editup()

void tesseract::Wordrec::program_editup ( const char *  textbase,
TessdataManager *  init_classifier,
TessdataManager *  init_dict 
)

Definition at line 40 of file tface.cpp.

42 {
43 if (textbase != nullptr) imagefile = textbase;
44#ifndef DISABLED_LEGACY_ENGINE
46 InitAdaptiveClassifier(init_classifier);
47 if (init_dict) {
49 getDict().Load(lang, init_dict);
51 }
53#endif // ndef DISABLED_LEGACY_ENGINE
54}
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
STRING imagefile
Definition: ccutil.h:77
STRING lang
Definition: ccutil.h:71
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
bool FinishLoad()
Definition: dict.cpp:351
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210

◆ ResetNGramSearch()

void tesseract::Wordrec::ResetNGramSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
GenericVector< SegSearchPending > *  pending 
)
protected

Definition at line 311 of file segsearch.cpp.

// Resets the search state so the segmentation search can be run again:
// clears every beam column, clears the word choices held by word_res, drops
// the best Viterbi state entry, and resets the pending list so that only
// column 0 is marked as classified.
// Accesses (*pending)[0] unconditionally, so pending is expected to be
// non-empty.
313 {
314 // TODO(rays) More refactoring required here.
315 // Delete existing viterbi states.
316 for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
317 best_choice_bundle->beam[col]->Clear();
318 }
319 // Reset best_choice_bundle.
320 word_res->ClearWordChoices();
321 best_choice_bundle->best_vse = nullptr;
322 // Clear out all existing pendings and add a new one for the first column.
323 (*pending)[0].SetColumnClassified();
324 for (int i = 1; i < pending->size(); ++i)
325 (*pending)[i].Clear();
326}
void ClearWordChoices()
Definition: pageres.cpp:1129

◆ SaveAltChoices()

void tesseract::Wordrec::SaveAltChoices ( const LIST best_choices,
WERD_RES word 
)

◆ SegSearch()

void tesseract::Wordrec::SegSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 42 of file segsearch.cpp.

44 {
45 LMPainPoints pain_points(segsearch_max_pain_points,
49 // Compute scaling factor that will help us recover blob outline length
50 // from classifier rating and certainty for the blob.
51 float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
53 InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
54 blamer_bundle);
55
56 if (!SegSearchDone(0)) { // find a better choice
57 if (chop_enable && word_res->chopped_word != nullptr) {
58 improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
59 blamer_bundle, &pain_points, &pending);
60 }
61 if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
62
63 if (blamer_bundle != nullptr &&
64 !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
65 blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
66 }
67 }
68 // Keep trying to find a better path by fixing the "pain points".
69
70 MATRIX_COORD pain_point;
71 float pain_point_priority;
72 int num_futile_classifications = 0;
73 STRING blamer_debug;
74 while (wordrec_enable_assoc &&
75 (!SegSearchDone(num_futile_classifications) ||
76 (blamer_bundle != nullptr &&
77 blamer_bundle->GuidedSegsearchStillGoing()))) {
78 // Get the next valid "pain point".
79 bool found_nothing = true;
80 LMPainPointsType pp_type;
81 while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
83 if (!pain_point.Valid(*word_res->ratings)) {
84 word_res->ratings->IncreaseBandSize(
85 pain_point.row - pain_point.col + 1);
86 }
87 if (pain_point.Valid(*word_res->ratings) &&
88 !word_res->ratings->Classified(pain_point.col, pain_point.row,
89 getDict().WildcardID())) {
90 found_nothing = false;
91 break;
92 }
93 }
94 if (found_nothing) {
95 if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
96 break;
97 }
98 ProcessSegSearchPainPoint(pain_point_priority, pain_point,
100 &pending, word_res, &pain_points, blamer_bundle);
101
102 UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
103 word_res, &pain_points, best_choice_bundle,
104 blamer_bundle);
105 if (!best_choice_bundle->updated) ++num_futile_classifications;
106
107 if (segsearch_debug_level > 0) {
108 tprintf("num_futile_classifications %d\n", num_futile_classifications);
109 }
110
111 best_choice_bundle->updated = false; // reset updated
112
113 // See if it's time to terminate SegSearch or time for starting a guided
114 // search for the true path to find the blame for the incorrect best_choice.
115 if (SegSearchDone(num_futile_classifications) &&
116 blamer_bundle != nullptr &&
117 blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
118 InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
119 &blamer_debug);
120 }
121 } // end while loop exploring alternative paths
122 if (blamer_bundle != nullptr) {
123 blamer_bundle->FinishSegSearch(word_res->best_choice,
124 wordrec_debug_blamer, &blamer_debug);
125 }
126
127 if (segsearch_debug_level > 0) {
128 tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
129 language_model_->AcceptableChoiceFound());
130 }
131}
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:318
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:514
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:519
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:471
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
static void PrintSeams(const char *label, const GenericVector< SEAM * > &seams)
Definition: seam.cpp:167
Definition: strngs.h:45
static const char * PainPointDescription(LMPainPointsType type)
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:454
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:486
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:328
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:136

◆ SegSearchDone()

bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inlineprotected

Definition at line 486 of file wordrec.h.

486 {
487 return (language_model_->AcceptableChoiceFound() ||
488 num_futile_classifications >=
490 }

◆ select_blob_to_split()

int tesseract::Wordrec::select_blob_to_split ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 538 of file chopper.cpp.

// Chooses the index of the blob that should be split next.
//   blob_choices           - one entry per blob; entries may be nullptr.
//   rating_ceiling         - only blobs rated strictly below this qualify.
//   split_next_to_fragment - when true, prefer a qualifying blob that sits
//                            next to a character fragment.
// Returns:
//   * the index of the first nullptr entry, if one is reached;
//   * otherwise, when split_next_to_fragment is set and some qualifying blob
//     is adjacent to a fragment, the highest-rated such blob;
//   * otherwise the highest-rated qualifying blob;
//   * -1 when no blob qualifies.
// A blob qualifies when its rating is below rating_ceiling and its certainty
// is below tessedit_certainty_threshold.
540 {
541 BLOB_CHOICE *blob_choice;
542 int x;
543 float worst = -FLT_MAX;
544 int worst_index = -1;
545 float worst_near_fragment = -FLT_MAX;
546 int worst_index_near_fragment = -1;
// Allocated only when split_next_to_fragment is set and the vector is
// non-empty; delete[] on nullptr below is a no-op otherwise.
547 const CHAR_FRAGMENT **fragments = nullptr;
548
549 if (chop_debug) {
550 if (rating_ceiling < FLT_MAX)
551 tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
552 else
553 tprintf("rating_ceiling = No Limit\n");
554 }
555
// Seed fragments[0]; each later entry is filled one step ahead inside the
// loop.
556 if (split_next_to_fragment && blob_choices.size() > 0) {
557 fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
558 if (blob_choices[0] != nullptr) {
559 fragments[0] = getDict().getUnicharset().get_fragment(
560 blob_choices[0]->unichar_id());
561 } else {
562 fragments[0] = nullptr;
563 }
564 }
565
566 for (x = 0; x < blob_choices.size(); ++x) {
// A missing choice ends the search immediately with that index.
567 if (blob_choices[x] == nullptr) {
568 delete[] fragments;
569 return x;
570 } else {
571 blob_choice = blob_choices[x];
572 // Populate fragments for the following position.
573 if (split_next_to_fragment && x+1 < blob_choices.size()) {
574 if (blob_choices[x + 1] != nullptr) {
575 fragments[x + 1] = getDict().getUnicharset().get_fragment(
576 blob_choices[x + 1]->unichar_id());
577 } else {
578 fragments[x + 1] = nullptr;
579 }
580 }
581 if (blob_choice->rating() < rating_ceiling &&
582 blob_choice->certainty() < tessedit_certainty_threshold) {
583 // Update worst and worst_index.
584 if (blob_choice->rating() > worst) {
585 worst_index = x;
586 worst = blob_choice->rating();
587 }
588 if (split_next_to_fragment) {
589 // Update worst_near_fragment and worst_index_near_fragment.
// Following neighbour is a fragment that is not a beginning piece, or
// preceding neighbour is a fragment that is not an ending piece.
590 bool expand_following_fragment =
591 (x + 1 < blob_choices.size() &&
592 fragments[x+1] != nullptr && !fragments[x+1]->is_beginning());
593 bool expand_preceding_fragment =
594 (x > 0 && fragments[x-1] != nullptr && !fragments[x-1]->is_ending());
595 if ((expand_following_fragment || expand_preceding_fragment) &&
596 blob_choice->rating() > worst_near_fragment) {
597 worst_index_near_fragment = x;
598 worst_near_fragment = blob_choice->rating();
599 if (chop_debug) {
600 tprintf("worst_index_near_fragment=%d"
601 " expand_following_fragment=%d"
602 " expand_preceding_fragment=%d\n",
603 worst_index_near_fragment,
604 expand_following_fragment,
605 expand_preceding_fragment);
606 }
607 }
608 }
609 }
610 }
611 }
612 delete[] fragments;
613 // TODO(daria): maybe a threshold of badness for
614 // worst_near_fragment would be useful.
615 return worst_index_near_fragment != -1 ?
616 worst_index_near_fragment : worst_index;
617}
int length() const
Definition: genericvector.h:86
float certainty() const
Definition: ratngs.h:83
float rating() const
Definition: ratngs.h:80
bool is_beginning() const
Definition: unicharset.h:105
bool is_ending() const
Definition: unicharset.h:108
const UNICHARSET & getUnicharset() const
Definition: dict.h:101

◆ select_blob_to_split_from_fixpt()

int tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR *  fixpt)

Definition at line 626 of file chopper.cpp.

// Picks a blob to split from the fixpt entries.
// Returns the `begin` index of the first entry that spans exactly one blob
// (begin + 1 == end) and has both `dangerous` and `correct_is_ngram` set.
// Returns -1 when fixpt is null or no entry matches.
626 {
627 if (!fixpt)
628 return -1;
629 for (int i = 0; i < fixpt->size(); i++) {
630 if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
631 (*fixpt)[i].dangerous &&
632 (*fixpt)[i].correct_is_ngram) {
633 return (*fixpt)[i].begin;
634 }
635 }
636 return -1;
637}

◆ set_pass1()

void tesseract::Wordrec::set_pass1 ( )

Definition at line 101 of file tface.cpp.

// Configures the recognizer for pass 1: sets chop_ok_split to 70.0, switches
// the language model's params model to PTRAIN_PASS1, then calls SettupPass1().
101 {
102 chop_ok_split.set_value(70.0);
103 language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS1);
104 SettupPass1();
105}

◆ set_pass2()

void tesseract::Wordrec::set_pass2 ( )

Definition at line 113 of file tface.cpp.

// Configures the recognizer for pass 2: sets chop_ok_split to the
// pass2_ok_split value, switches the language model's params model to
// PTRAIN_PASS2, then calls SettupPass2().
113 {
114 chop_ok_split.set_value(pass2_ok_split);
115 language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS2);
116 SettupPass2();
117}

◆ try_point_pairs()

void tesseract::Wordrec::try_point_pairs ( EDGEPT *  points[MAX_NUM_POINTS],
int16_t  num_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 298 of file findseam.cpp.

303 {
304 int16_t x;
305 int16_t y;
306 PRIORITY priority;
307
308 for (x = 0; x < num_points; x++) {
309 for (y = x + 1; y < num_points; y++) {
310 if (points[y] &&
311 points[x]->WeightedDistance(*points[y], chop_x_y_weight) <
313 points[x] != points[y]->next && points[y] != points[x]->next &&
314 !is_exterior_point(points[x], points[y]) &&
315 !is_exterior_point(points[y], points[x])) {
316 SPLIT split(points[x], points[y]);
317 priority = partial_split_priority(&split);
318
319 choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
320 }
321 }
322 }
323}
#define partial_split_priority(split)
Definition: findseam.cpp:41
Definition: split.h:37

◆ try_vertical_splits()

void tesseract::Wordrec::try_vertical_splits ( EDGEPT *  points[MAX_NUM_POINTS],
int16_t  num_points,
EDGEPT_CLIST *  new_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 336 of file findseam.cpp.

342 {
343 EDGEPT *vertical_point = nullptr;
344 int16_t x;
345 PRIORITY priority;
346 TESSLINE *outline;
347
348 for (x = 0; x < num_points; x++) {
349 vertical_point = nullptr;
350 for (outline = blob->outlines; outline; outline = outline->next) {
351 vertical_projection_point(points[x], outline->loop,
352 &vertical_point, new_points);
353 }
354
355 if (vertical_point && points[x] != vertical_point->next &&
356 vertical_point != points[x]->next &&
357 points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) <
359 SPLIT split(points[x], vertical_point);
360 priority = partial_split_priority(&split);
361 choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
362 }
363 }
364}
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:285

◆ UpdateSegSearchNodes()

void tesseract::Wordrec::UpdateSegSearchNodes ( float  rating_cert_scale,
int  starting_col,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 180 of file segsearch.cpp.

// Propagates language-model state through the ratings matrix, starting at
// starting_col, for every column that has pending work.
//   rating_cert_scale  - forwarded to pain_points->GenerateFromPath.
//   starting_col       - first column examined.
//   pending            - per-column work flags; must have one entry per
//                        matrix column (asserted). All entries are cleared
//                        before returning.
//   word_res           - supplies the ratings matrix.
//   pain_points        - receives pain points generated from the best path
//                        and from ambiguities.
//   best_choice_bundle - holds the beam (one entry per column, asserted),
//                        best_vse and fixpt.
//   blamer_bundle      - forwarded to language_model_->UpdateState.
187 {
188 MATRIX *ratings = word_res->ratings;
189 ASSERT_HOST(ratings->dimension() == pending->size());
190 ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
191 for (int col = starting_col; col < ratings->dimension(); ++col) {
192 if (!(*pending)[col].WorkToDo()) continue;
// Default row range is the part of the band inside the matrix; it shrinks to
// a single row when the pending entry names one (SingleRow() >= 0).
193 int first_row = col;
194 int last_row = std::min(ratings->dimension() - 1,
195 col + ratings->bandwidth() - 1);
196 if ((*pending)[col].SingleRow() >= 0) {
197 first_row = last_row = (*pending)[col].SingleRow();
198 }
199 if (segsearch_debug_level > 0) {
200 tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
201 col, first_row, last_row,
202 (*pending)[col].IsRowJustClassified(INT32_MAX));
203 }
204 // Iterate over the pending list for this column.
205 for (int row = first_row; row <= last_row; ++row) {
206 // Update language model state of this child+parent pair.
207 BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
// Column 0 has no parent state; otherwise the parent is the beam entry of
// the previous column.
208 LanguageModelState *parent_node =
209 col == 0 ? nullptr : best_choice_bundle->beam[col - 1];
// UpdateState is called only for non-null cells; the child column (row + 1)
// is queued only when it reports a change and that column exists.
210 if (current_node != nullptr &&
211 language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
212 col, row, current_node, parent_node,
213 pain_points, word_res,
214 best_choice_bundle, blamer_bundle) &&
215 row + 1 < ratings->dimension()) {
216 // Since the language model state of this entry changed, process all
217 // the child column.
218 (*pending)[row + 1].RevisitWholeColumn();
219 if (segsearch_debug_level > 0) {
220 tprintf("Added child col=%d to pending\n", row + 1);
221 }
222 } // end if UpdateState.
223 } // end for row.
224 } // end for col.
// Generate new pain points only when a best path exists and was updated
// during this call.
225 if (best_choice_bundle->best_vse != nullptr) {
226 ASSERT_HOST(word_res->StatesAllValid());
227 if (best_choice_bundle->best_vse->updated) {
228 pain_points->GenerateFromPath(rating_cert_scale,
229 best_choice_bundle->best_vse, word_res);
230 if (!best_choice_bundle->fixpt.empty()) {
231 pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
232 best_choice_bundle->best_vse, word_res);
233 }
234 }
235 }
236 // The segsearch is completed. Reset all updated flags on all VSEs and reset
237 // all pendings.
238 for (int col = 0; col < pending->size(); ++col) {
239 (*pending)[col].Clear();
240 ViterbiStateEntry_IT
241 vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
242 for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
243 vse_it.data()->updated = false;
244 }
245 }
246}
int bandwidth() const
Definition: matrix.h:538

◆ vertical_projection_point()

void tesseract::Wordrec::vertical_projection_point ( EDGEPT split_point,
EDGEPT target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 285 of file chop.cpp.

/* Body of Wordrec::vertical_projection_point.
 * Follows ->next from target_point until it arrives back at target_point.
 * For every edge (p, p->next) whose x-range contains split_point's x and
 * that passes the exclusions in the condition below, near_point() is asked
 * for a point on that edge. Points for which near_point() returns true are
 * inserted into new_points, and *best_point is replaced whenever
 * pick_close_point() returns a non-null point. */
287 {
288 EDGEPT *p; /* Cursor: first end of the edge (p, p->next) being examined */
289 EDGEPT *this_edgept; /* near_point() output, later the pick_close_point() result */
290 EDGEPT_C_IT new_point_it(new_points);
291 int x = split_point->pos.x; /* X value of vertical */
292 int best_dist = LARGE_DISTANCE;/* edgept_dist() of the best point so far; LARGE_DISTANCE if none */
293
/* A caller-supplied best point sets the initial distance to beat. */
294 if (*best_point != nullptr)
295 best_dist = edgept_dist(split_point, *best_point);
296
297 p = target_point;
298 /* Look at each edge point */
299 do {
/* Keep the edge only if it spans x (inclusive, either direction), neither
 * end coincides with split_point, p is not a chop point, and p does not
 * coincide with the current best point. */
300 if (((p->pos.x <= x && x <= p->next->pos.x) ||
301 (p->next->pos.x <= x && x <= p->pos.x)) &&
302 !same_point(split_point->pos, p->pos) &&
303 !same_point(split_point->pos, p->next->pos) &&
304 !p->IsChopPt() &&
305 (*best_point == nullptr || !same_point((*best_point)->pos, p->pos))) {
306
/* NOTE(review): this_edgept is read below even when near_point() returns
 * false, so this relies on near_point() always writing its near_pt
 * output -- confirm against outlines.cpp. */
307 if (near_point(split_point, p, p->next, &this_edgept)) {
308 new_point_it.add_before_then_move(this_edgept);
309 }
310
/* No best point yet: seed best_dist from this candidate. */
311 if (*best_point == nullptr)
312 best_dist = edgept_dist (split_point, this_edgept);
313
/* best_dist is passed by address; a non-null return becomes the new best
 * point. Presumably null means the candidate was not close enough --
 * TODO confirm in pick_close_point(). */
314 this_edgept =
315 pick_close_point(split_point, this_edgept, &best_dist);
316 if (this_edgept)
317 *best_point = this_edgept;
318 }
319
320 p = p->next;
321 }
322 while (p != target_point);
323}
#define LARGE_DISTANCE
Definition: outlines.h:32
bool IsChopPt() const
Definition: blobs.h:182
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:40
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:135

Member Data Documentation

◆ assume_fixed_pitch_char_segment

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = false

"include fixed-pitch heuristics in char segmentation"

Definition at line 225 of file wordrec.h.

◆ blame_reasons_

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 478 of file wordrec.h.

◆ chop_center_knob

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 216 of file wordrec.h.

◆ chop_centered_maxwidth

int tesseract::Wordrec::chop_centered_maxwidth = 90

"Width of (smaller) chopped blobs above which we don't care that a chop is not near the center."

Definition at line 218 of file wordrec.h.

◆ chop_debug

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 204 of file wordrec.h.

◆ chop_enable

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 205 of file wordrec.h.

◆ chop_good_split

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 222 of file wordrec.h.

◆ chop_inside_angle

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 212 of file wordrec.h.

◆ chop_min_outline_area

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 213 of file wordrec.h.

◆ chop_min_outline_points

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 209 of file wordrec.h.

◆ chop_new_seam_pile

bool tesseract::Wordrec::chop_new_seam_pile = 1

"Use new seam_pile"

Definition at line 211 of file wordrec.h.

◆ chop_ok_split

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 221 of file wordrec.h.

◆ chop_overlap_knob

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 215 of file wordrec.h.

◆ chop_same_distance

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 208 of file wordrec.h.

◆ chop_seam_pile_size

int tesseract::Wordrec::chop_seam_pile_size = 150

"Max number of seams in seam_pile"

Definition at line 210 of file wordrec.h.

◆ chop_sharpness_knob

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 219 of file wordrec.h.

◆ chop_split_dist_knob

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 214 of file wordrec.h.

◆ chop_split_length

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 207 of file wordrec.h.

◆ chop_vertical_creep

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 206 of file wordrec.h.

◆ chop_width_change_knob

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 220 of file wordrec.h.

◆ chop_x_y_weight

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 223 of file wordrec.h.

◆ fill_lattice_

void(Wordrec::* tesseract::Wordrec::fill_lattice_) (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 480 of file wordrec.h.

◆ force_word_assoc

bool tesseract::Wordrec::force_word_assoc = false

"force associator to run regardless of what enable_assoc is. This is used for CJK where component grouping is necessary."

Definition at line 201 of file wordrec.h.

◆ language_model_

std::unique_ptr<LanguageModel> tesseract::Wordrec::language_model_

Definition at line 471 of file wordrec.h.

◆ merge_fragments_in_matrix

bool tesseract::Wordrec::merge_fragments_in_matrix = true

"Merge the fragments in the ratings matrix and delete them after merging"

Definition at line 197 of file wordrec.h.

◆ pass2_ok_split

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 472 of file wordrec.h.

◆ prev_word_best_choice_

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 476 of file wordrec.h.

◆ repair_unchopped_blobs

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 202 of file wordrec.h.

◆ save_alt_choices

bool tesseract::Wordrec::save_alt_choices = true

"Save alternative paths found during chopping and segmentation search"

Definition at line 242 of file wordrec.h.

◆ segsearch_debug_level

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 233 of file wordrec.h.

◆ segsearch_max_char_wh_ratio

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 239 of file wordrec.h.

◆ segsearch_max_futile_classifications

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 237 of file wordrec.h.

◆ segsearch_max_pain_points

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 235 of file wordrec.h.

◆ tessedit_certainty_threshold

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 203 of file wordrec.h.

◆ wordrec_debug_blamer

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 231 of file wordrec.h.

◆ wordrec_debug_level

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 226 of file wordrec.h.

◆ wordrec_enable_assoc

bool tesseract::Wordrec::wordrec_enable_assoc = true

"Associator Enable"

Definition at line 198 of file wordrec.h.

◆ wordrec_max_join_chunks

int tesseract::Wordrec::wordrec_max_join_chunks = 4

"Max number of broken pieces to associate"

Definition at line 228 of file wordrec.h.

◆ wordrec_run_blamer

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 232 of file wordrec.h.

◆ wordrec_skip_no_truth_words

bool tesseract::Wordrec::wordrec_skip_no_truth_words = false

"Only run OCR for words that had truth recorded in BlamerBundle"

Definition at line 230 of file wordrec.h.


The documentation for this class was generated from the following files: