tesseract 4.1.1
Loading...
Searching...
No Matches
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract () override
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * pix_original () const
 
void set_pix_original (Pix *original_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
bool TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, bool ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_train_line_recognizer = false
 
bool tessedit_dump_pageseg_images = false
 
bool tessedit_do_invert = true
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int multilang_debug_level = 0
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
bool lstm_use_matrix = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = false
 
bool hocr_font_info = false
 
bool hocr_char_boxes = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = false
 
bool tessedit_create_hocr = false
 
bool tessedit_create_alto = false
 
bool tessedit_create_lstmbox = false
 
bool tessedit_create_tsv = false
 
bool tessedit_create_wordstrbox = false
 
bool tessedit_create_pdf = false
 
bool textonly_pdf = false
 
int jpg_quality = 85
 
int user_defined_dpi = 0
 
int min_characters_to_try = 50
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = false
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
char * page_separator = "\f"
 
int lstm_choice_mode = 0
 
bool pageseg_apply_music_mask = true
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = true
 
bool wordrec_enable_assoc = true
 
bool force_word_assoc = false
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
bool assume_fixed_pitch_char_segment = false
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_ambigs_for_adaption = false
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 174 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 52 of file tesseractclass.cpp.

54 "Take segmentation and labeling from box file",
55 this->params()),
57 "Conversion of word/line box file to char box file",
58 this->params()),
60 "Generate training data from boxed chars", this->params()),
62 "Generate more boxes from boxed chars", this->params()),
64 "Break input into lines and remap boxes if present",
65 this->params()),
67 "Dump intermediate images made during page segmentation",
68 this->params()),
70 "Try inverting the image in `LSTMRecognizeWord`", this->params()),
71 // The default for pageseg_mode is the old behaviour, so as not to
72 // upset anything that relies on that.
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
76 " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
77 "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
78 " (Values from PageSegMode enum in publictypes.h)",
79 this->params()),
81 "Which OCR engine(s) to run (Tesseract, LSTM, both)."
82 " Defaults to loading and running the most accurate"
83 " available.",
84 this->params()),
86 "Blacklist of chars not to recognize", this->params()),
88 "Whitelist of chars to recognize", this->params()),
90 "List of chars to override tessedit_char_blacklist",
91 this->params()),
93 "Perform training for ambiguities", this->params()),
96 "Whether to use the top-line splitting process for Devanagari "
97 "documents while performing page-segmentation.",
98 this->params()),
101 "Whether to use the top-line splitting process for Devanagari "
102 "documents while performing ocr.",
103 this->params()),
105 "Write all parameters to the given file.", this->params()),
107 "Generate and print debug"
108 " information for adaption",
109 this->params()),
110 INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
111 INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
112 INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
113 this->params()),
115 "Exposure value follows"
116 " this pattern in the image filename. The name of the image"
117 " files are expected to be in the form"
118 " [lang].[fontname].exp[num].tif",
119 this->params()),
121 "Learn both character fragments (as is done in the"
122 " special low exposure mode) as well as unfragmented"
123 " characters.",
124 this->params()),
126 "Each bounding box"
127 " is assumed to contain ngrams. Only learn the ngrams"
128 " whose outlines overlap horizontally.",
129 this->params()),
130 BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
131 this->params()),
132 BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
133 this->params()),
134 BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
135 this->params()),
137 "Try to improve fuzzy spaces", this->params()),
139 "Don't bother with word plausibility", this->params()),
140 BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
141 this->params()),
143 "Add words to the document dictionary", this->params()),
144 BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
145 this->params()),
146 BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
147 this->params()),
149 "Enable correction based on the word bigram dictionary.",
150 this->params()),
152 "Enable single word correction based on the dictionary.",
153 this->params()),
155 "Amount of debug output for bigram correction.",
156 this->params()),
158 "Remove and conditionally reassign small outlines when they"
159 " confuse layout analysis, determining diacritics vs noise",
160 this->params()),
161 INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
162 this->params()),
163 // Worst (min) certainty, for which a diacritic is allowed to make the
164 // base
165 // character worse and still be included.
167 "Hingepoint for base char certainty", this->params()),
168 // Worst (min) certainty, for which a non-overlapping diacritic is allowed
169 // to make the base character worse and still be included.
171 "Hingepoint for disjoint certainty", this->params()),
172 // Worst (min) certainty, for which a diacritic is allowed to make a new
173 // stand-alone blob.
175 "Threshold for new punc char certainty", this->params()),
176 // Factor of certainty margin for adding diacritics to not count as worse.
178 "Scaling on certainty diff from Hingepoint",
179 this->params()),
180 INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
181 this->params()),
182 INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
183 this->params()),
184 INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
185 STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
186 this->params()),
187 STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
188 this->params()),
189 STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
190 this->params()),
192 "good_quality_doc lte rejection limit", this->params()),
194 "good_quality_doc gte good blobs limit", this->params()),
196 "good_quality_doc lte outline error limit", this->params()),
198 "good_quality_doc gte good char limit", this->params()),
199 INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
200 this->params()),
202 "Adaptation decision algorithm for tess", this->params()),
204 "Do minimal rejection on pass 1 output", this->params()),
205 BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
206 this->params()),
207 BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
208 double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
209 double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
210 INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
211 this->params()),
212 INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
213 this->params()),
215 "Run paragraph detection on the post-text-recognition "
216 "(more accurate)",
217 this->params()),
219 "Use ratings matrix/beam search with lstm", this->params()),
220 STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
221 this->params()),
222 STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
223 this->params()),
225 "Reduce rejection on good docs", this->params()),
226 BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
227 this->params()),
229 "%rej allowed before rej whole doc", this->params()),
231 "%rej allowed before rej whole block", this->params()),
233 "%rej allowed before rej whole row", this->params()),
235 "Number of row rejects in whole word rejects"
236 " which prevents whole row rejection",
237 this->params()),
239 "Only rej partially rejected words in block rejection",
240 this->params()),
242 "Only rej partially rejected words in row rejection",
243 this->params()),
245 "Use word segmentation quality metric", this->params()),
247 "Use word segmentation quality metric", this->params()),
249 "Only preserve wds longer than this", this->params()),
251 "Apply row rejection to good docs", this->params()),
253 "rej good doc wd if more than this fraction rejected",
254 this->params()),
256 "Reject all bad quality wds", this->params()),
257 BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
258 this->params()),
260 "Output data to debug file", this->params()),
261 BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
262 this->params()),
264 "good_quality_doc gte good char limit", this->params()),
266 "Mark v.bad words for tilde crunch", this->params()),
267 BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
268 this->params()),
269 BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
270 this->params()),
271 BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
272 this->params()),
274 "Take out ~^ early?", this->params()),
275 double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
276 this->params()),
277 BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
279 "crunch garbage cert lt this", this->params()),
281 "crunch garbage rating lt this", this->params()),
282 double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
283 this->params()),
284 double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
285 this->params()),
286 double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
287 this->params()),
288 double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
289 this->params()),
290 double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
291 this->params()),
292 double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
293 this->params()),
295 "Del if word width lt xht x this", this->params()),
297 "Del if word gt xht x this above bl", this->params()),
299 "Del if word gt xht x this below bl", this->params()),
300 double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
301 this->params()),
302 INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
303 this->params()),
305 "How many potential indicators needed", this->params()),
306 BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
307 this->params()),
308 BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
309 this->params()),
311 "Don't pot crunch sensible strings", this->params()),
312 BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
313 this->params()),
315 "Don't crunch words with long lower case strings",
316 this->params()),
318 "Don't crunch words with long lower case strings",
319 this->params()),
321 "Crunch words with long repetitions", this->params()),
322 INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
324 "How many non-noise blbs either side?", this->params()),
325 double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
326 this->params()),
328 "Reward punctuation joins", this->params()),
329 INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
330 this->params()),
331 INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
332 this->params()),
334 "Punct. chs expected WITHIN numbers", this->params()),
336 "Max allowed deviation of blob top outside of font data",
337 this->params()),
339 "Min change in xht before actually trying it", this->params()),
341 "Debug level for sub & superscript fixer", this->params()),
344 "How many times worse "
345 "certainty does a superscript position glyph need to be for "
346 "us to try classifying it as a char with a different "
347 "baseline?",
348 this->params()),
351 "What reduction in "
352 "badness do we think sufficient to choose a superscript "
353 "over what we'd thought. For example, a value of 0.6 means "
354 "we want to reduce badness of certainty by at least 40%",
355 this->params()),
357 "A superscript scaled down more than this is unbelievably "
358 "small. For example, 0.3 means we expect the font size to "
359 "be no smaller than 30% of the text line font size.",
360 this->params()),
362 "Maximum top of a character measured as a multiple of "
363 "x-height above the baseline for us to reconsider whether "
364 "it's a subscript.",
365 this->params()),
367 "Minimum bottom of a character measured as a multiple of "
368 "x-height above the baseline for us to reconsider whether "
369 "it's a superscript.",
370 this->params()),
372 "Write block separators in output", this->params()),
373 BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
374 this->params()),
375 BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
376 this->params()),
377 BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
378 this->params()),
379 BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
380 this->params()),
381 BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
382 this->params()),
383 BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
384 this->params()),
385 BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
386 this->params()),
387 BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
388 this->params()),
389 BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
390 this->params()),
392 "Create PDF with only one invisible text layer",
393 this->params()),
394 INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
395 INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
396 this->params()),
398 "Specify minimum characters to try during OSD",
399 this->params()),
401 "Output char for unidentified blobs", this->params()),
402 INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
404 "Don't suspect dict wds longer than this", this->params()),
405 BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
406 this->params()),
408 "Don't touch bad rating limit", this->params()),
409 double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
410 this->params()),
412 "Only reject tess failures", this->params()),
413 BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
414 this->params()),
416 "Make output have exactly one word per WERD", this->params()),
418 "Don't reject ANYTHING AT ALL", this->params()),
419 INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
420 this->params()),
421 BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
422 this->params()),
423 BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
424 this->params()),
426 "Aspect ratio dot/hyphen test", this->params()),
428 "Aspect ratio dot/hyphen test", this->params()),
430 "Use DOC dawg in 11l conf. detector", this->params()),
431 BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
432 this->params()),
433 BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
434 this->params()),
435 BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
436 this->params()),
437 BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
438 this->params()),
439 BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
440 this->params()),
441 BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
442 this->params()),
443 BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
444 this->params()),
446 "if >this fract", this->params()),
447 INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
448 this->params()),
450 "Allow NN to unrej", this->params()),
451 STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
452 this->params()),
453 INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
454 this->params()),
455 BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
456 this->params()),
458 "-1 -> All pages, else specific page to process",
459 this->params()),
461 "Capture the image from the IPE", this->params()),
462 BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
463 this->params()),
464 STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
465 BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
466 this->params()),
468 "List of languages to load with this one", this->params()),
470 "In multilingual mode use params model of the"
471 " primary language",
472 this->params()),
474 "Min acceptable orientation margin", this->params()),
475 BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
476 this->params()),
477 BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model",
478 this->params()),
480 "Allow feature extractors to see the original outline",
481 this->params()),
483 "Only initialize with the config file. Useful if the "
484 "instance is not going to be used for OCR but say only "
485 "for layout analysis.",
486 this->params()),
487 BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
488 this->params()),
490 "Enable vertical detection", this->params()),
492 "Force using vertical text page mode", this->params()),
495 "Fraction of textlines deemed vertical to use vertical page "
496 "mode",
497 this->params()),
500 "Fraction of height used as a minimum gap for aligned blobs.",
501 this->params()),
502 INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
503 this->params()),
505 "Preserve multiple interword spaces", this->params()),
507 "Page separator (default is form feed control character)",
508 this->params()),
510 "Allows to include alternative symbols choices in the hOCR output. "
511 "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
512 "With 1 the alternative symbol choices per timestep are included. "
513 "With 2 the alternative symbol choices are accumulated per "
514 "character. ",
515 this->params()),
517 "Detect music staff and remove intersecting components", this->params()),
518
519 backup_config_file_(nullptr),
520 pix_binary_(nullptr),
521 pix_grey_(nullptr),
522 pix_original_(nullptr),
523 pix_thresholds_(nullptr),
524 source_resolution_(0),
525 textord_(this),
526 right_to_left_(false),
527 scaled_color_(nullptr),
528 scaled_factor_(-1),
529 deskew_(1.0f, 0.0f),
530 reskew_(1.0f, 0.0f),
531 most_recently_used_(this),
532 font_table_size_(0),
533 equ_detect_(nullptr),
534#ifndef ANDROID_BUILD
535 lstm_recognizer_(nullptr),
536#endif
537 train_line_page_num_(0) {
538}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
double tessedit_reject_doc_percent
bool tessedit_preserve_row_rej_perfect_wds
double superscript_bettered_certainty
double superscript_worse_certainty
double textord_tabfind_vertical_text_ratio
bool tessedit_enable_bigram_correction
bool applybox_learn_chars_and_char_frags_mode
bool crunch_early_convert_bad_unlv_chs
double tessedit_whole_wd_rej_row_percent
double tessedit_reject_row_percent
bool tessedit_resegment_from_line_boxes
double rej_whole_of_mostly_reject_word_fract
double textord_tabfind_aligned_gap_fraction
char * tessedit_write_params_to_file
double superscript_scaledown_ratio
bool textord_tabfind_force_vertical_text
char * ok_repeated_ch_non_alphanum_wds
bool tessedit_preserve_blk_rej_perfect_wds
double tessedit_reject_block_percent
double tessedit_good_doc_still_rowrej_wd
ParamsVectors * params()
Definition: ccutil.h:67

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )
override

Definition at line 540 of file tesseractclass.cpp.

540 {
541 Clear();
542 pixDestroy(&pix_original_);
544 sub_langs_.delete_data_pointers();
545#ifndef ANDROID_BUILD
546 delete lstm_recognizer_;
547 lstm_recognizer_ = nullptr;
548#endif
549}

Member Function Documentation

◆ acceptable_number_string()

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 387 of file output.cpp.

388 {
389 bool prev_digit = false;
390
391 if (*lengths == 1 && *s == '(')
392 s++;
393
394 if (*lengths == 1 &&
395 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
396 s++;
397
398 for (; *s != '\0'; s += *(lengths++)) {
399 if (unicharset.get_isdigit(s, *lengths))
400 prev_digit = true;
401 else if (prev_digit &&
402 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
403 prev_digit = false;
404 else if (prev_digit && *lengths == 1 &&
405 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
406 return true;
407 else if (prev_digit &&
408 *lengths == 1 && (*s == '%') &&
409 (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
410 (*(s + *lengths + *(lengths + 1)) == '\0'))
411 return true;
412 else
413 return false;
414 }
415 return true;
416}
UNICHARSET unicharset
Definition: ccutil.h:73
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1745 of file control.cpp.

1746 {
1747 int i = 0;
1748 int offset = 0;
1749 int leading_punct_count;
1750 int upper_count = 0;
1751 int hyphen_pos = -1;
1753
1754 if (strlen (lengths) > 20)
1755 return word_type;
1756
1757 /* Single Leading punctuation char*/
1758
1759 if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1760 offset += lengths[i++];
1761 leading_punct_count = i;
1762
1763 /* Initial cap */
1764 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1765 offset += lengths[i++];
1766 upper_count++;
1767 }
1768 if (upper_count > 1) {
1769 word_type = AC_UPPER_CASE;
1770 } else {
1771 /* Lower case word, possibly with an initial cap */
1772 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1773 offset += lengths[i++];
1774 }
1775 if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1776 goto not_a_word;
1777 /*
1778 Allow a single hyphen in a lower case word
1779 - don't trust upper case - I've seen several cases of "H" -> "I-I"
1780 */
1781 if (lengths[i] == 1 && s[offset] == '-') {
1782 hyphen_pos = i;
1783 offset += lengths[i++];
1784 if (s[offset] != '\0') {
1785 while ((s[offset] != '\0') &&
1786 char_set.get_islower(s + offset, lengths[i])) {
1787 offset += lengths[i++];
1788 }
1789 if (i < hyphen_pos + 3)
1790 goto not_a_word;
1791 }
1792 } else {
1793 /* Allow "'s" in NON hyphenated lower case words */
1794 if (lengths[i] == 1 && (s[offset] == '\'') &&
1795 lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1796 offset += lengths[i++];
1797 offset += lengths[i++];
1798 }
1799 }
1800 if (upper_count > 0)
1801 word_type = AC_INITIAL_CAP;
1802 else
1803 word_type = AC_LOWER_CASE;
1804 }
1805
1806 /* Up to two different, constrained trailing punctuation chars */
1807 if (lengths[i] == 1 && s[offset] != '\0' &&
1808 STRING(chs_trailing_punct1).contains(s[offset]))
1809 offset += lengths[i++];
1810 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1811 s[offset - lengths[i - 1]] != s[offset] &&
1812 STRING(chs_trailing_punct2).contains (s[offset]))
1813 offset += lengths[i++];
1814
1815 if (s[offset] != '\0')
1816 word_type = AC_UNACCEPTABLE;
1817
1818 not_a_word:
1819
1820 if (word_type == AC_UNACCEPTABLE) {
1821 /* Look for abbreviation string */
1822 i = 0;
1823 offset = 0;
1824 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1825 word_type = AC_UC_ABBREV;
1826 while (s[offset] != '\0' &&
1827 char_set.get_isupper(s + offset, lengths[i]) &&
1828 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1829 offset += lengths[i++];
1830 offset += lengths[i++];
1831 }
1832 }
1833 else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1834 word_type = AC_LC_ABBREV;
1835 while (s[offset] != '\0' &&
1836 char_set.get_islower(s + offset, lengths[i]) &&
1837 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1838 offset += lengths[i++];
1839 offset += lengths[i++];
1840 }
1841 }
1842 if (s[offset] != '\0')
1843 word_type = AC_UNACCEPTABLE;
1844 }
1845
1846 return word_type;
1847}
ACCEPTABLE_WERD_TYPE
Definition: control.h:29
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:35
@ AC_INITIAL_CAP
All lower case except the initial letter.
Definition: control.h:33
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:34
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:32
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:31
Definition: strngs.h:45
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498

◆ alpha_count()

int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 496 of file reject.cpp.

497 {
498 int16_t i;
499 int16_t offset;
500 int16_t count = 0;
501
502 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503 if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504 count++;
505 }
506 return count;
507}
int count(LIST var_list)
Definition: oldlist.cpp:95
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT pr_it,
FILE *  output_file 
)

Definition at line 211 of file recogtraining.cpp.

213 {
214 // Classify word.
215 fflush(stdout);
216 WordData word_data(*pr_it);
217 SetupWordPassN(1, &word_data);
218 classify_word_and_language(1, pr_it, &word_data);
219 WERD_RES* werd_res = word_data.word;
220 WERD_CHOICE* best_choice = werd_res->best_choice;
221 ASSERT_HOST(best_choice != nullptr);
222
223 // Compute the number of unichars in the label.
225 if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
226 tprintf("Not outputting illegal unichar %s\n", label);
227 return;
228 }
229
230 // Dump all paths through the ratings matrix (which is normally small).
231 int dim = werd_res->ratings->dimension();
232 const auto** blob_choices = new const BLOB_CHOICE*[dim];
233 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
234 label, output_file);
235 delete[] blob_choices;
236}
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1319
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
int dimension() const
Definition: matrix.h:536
WERD_CHOICE * best_choice
Definition: pageres.h:241
MATRIX * ratings
Definition: pageres.h:237
WERD * word
Definition: pageres.h:186
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 295 of file tesseractclass.h.

295 {
297 return true;
298 for (int i = 0; i < sub_langs_.size(); ++i) {
299 if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
300 return true;
301 }
302 }
303 return false;
304 }
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:269

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 285 of file tesseractclass.h.

285 {
287 return true;
288 for (int i = 0; i < sub_langs_.size(); ++i) {
289 if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
290 return true;
291 }
292 return false;
293 }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 109 of file applybox.cpp.

111 {
113 GenericVector<STRING> texts, full_texts;
114 if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
115 nullptr)) {
116 return nullptr; // Can't do it.
117 }
118
119 const int box_count = boxes.size();
120 int box_failures = 0;
121
122 // In word mode, we use the boxes to make a word for each box, but
123 // in blob mode we use the existing words and maximally chop them first.
124 PAGE_RES* page_res = find_segmentation ?
125 nullptr : SetupApplyBoxes(boxes, block_list);
126 clear_any_old_text(block_list);
127
128 for (int i = 0; i < box_count; i++) {
129 bool foundit = false;
130 if (page_res != nullptr) {
131 foundit = ResegmentCharBox(page_res,
132 (i == 0) ? nullptr : &boxes[i - 1],
133 boxes[i],
134 (i == box_count - 1) ? nullptr : &boxes[i + 1],
135 full_texts[i].string());
136 } else {
137 foundit = ResegmentWordBox(block_list, boxes[i],
138 (i == box_count - 1) ? nullptr : &boxes[i + 1],
139 texts[i].string());
140 }
141 if (!foundit) {
142 box_failures++;
143 ReportFailedBox(i, boxes[i], texts[i].string(),
144 "FAILURE! Couldn't find a matching blob");
145 }
146 }
147
148 if (page_res == nullptr) {
149 // In word/line mode, we now maximally chop all the words and resegment
150 // them with the classifier.
151 page_res = SetupApplyBoxes(boxes, block_list);
153 }
154 if (applybox_debug > 0) {
155 tprintf("APPLY_BOXES:\n");
156 tprintf(" Boxes read from boxfile: %6d\n", box_count);
157 if (box_failures > 0)
158 tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
159 }
160 TidyUp(page_res);
161 return page_res;
162}
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:53
int size() const
Definition: genericvector.h:72
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:431
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:708
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:329
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:506
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:207
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:768

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const STRING fontname,
PAGE_RES page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 803 of file applybox.cpp.

803 {
804 PAGE_RES_IT pr_it(page_res);
805 int word_count = 0;
806 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
807 word_res = pr_it.forward()) {
808 LearnWord(fontname.string(), word_res);
809 ++word_count;
810 }
811 tprintf("Generated training data for %d words\n", word_count);
812}
const char * string() const
Definition: strngs.cpp:194
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD real_word,
PAGE_RES_IT pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1064 of file control.cpp.

1067 {
1068 GenericVector<bool> blob_wanted;
1069 word_wanted->init_to_size(outlines.size(), false);
1070 target_blobs->init_to_size(outlines.size(), nullptr);
1071 // Check for outlines that need to be turned into stand-alone blobs.
1072 for (int i = 0; i < outlines.size(); ++i) {
1073 if (outlines[i] == nullptr) continue;
1074 // Get a set of adjacent outlines that don't overlap any existing blob.
1075 blob_wanted.init_to_size(outlines.size(), false);
1076 int num_blob_outlines = 0;
1077 TBOX total_ol_box(outlines[i]->bounding_box());
1078 while (i < outlines.size() && outlines[i] != nullptr) {
1079 blob_wanted[i] = true;
1080 total_ol_box += outlines[i]->bounding_box();
1081 ++i;
1082 ++num_blob_outlines;
1083 }
1084 // Find the insertion point.
1085 C_BLOB_IT blob_it(real_word->cblob_list());
1086 while (!blob_it.at_last() &&
1087 blob_it.data_relative(1)->bounding_box().left() <=
1088 total_ol_box.left()) {
1089 blob_it.forward();
1090 }
1091 // Choose which combination of them we actually want and where to put
1092 // them.
1094 tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1095 C_BLOB* left_blob = blob_it.data();
1096 TBOX left_box = left_blob->bounding_box();
1097 C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1098 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1099 !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1100 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1101 outlines, num_blob_outlines,
1102 &blob_wanted)) {
1103 if (debug_noise_removal) tprintf("Added to left blob\n");
1104 for (int j = 0; j < blob_wanted.size(); ++j) {
1105 if (blob_wanted[j]) {
1106 (*word_wanted)[j] = true;
1107 (*target_blobs)[j] = left_blob;
1108 }
1109 }
1110 } else if (right_blob != nullptr &&
1111 (!left_box.x_overlap(total_ol_box) ||
1112 right_blob->bounding_box().x_overlap(total_ol_box)) &&
1114 right_blob, outlines,
1115 num_blob_outlines, &blob_wanted)) {
1116 if (debug_noise_removal) tprintf("Added to right blob\n");
1117 for (int j = 0; j < blob_wanted.size(); ++j) {
1118 if (blob_wanted[j]) {
1119 (*word_wanted)[j] = true;
1120 (*target_blobs)[j] = right_blob;
1121 }
1122 }
1123 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1124 outlines, num_blob_outlines,
1125 &blob_wanted)) {
1126 if (debug_noise_removal) tprintf("Fitted between blobs\n");
1127 for (int j = 0; j < blob_wanted.size(); ++j) {
1128 if (blob_wanted[j]) {
1129 (*word_wanted)[j] = true;
1130 (*target_blobs)[j] = nullptr;
1131 }
1132 }
1133 }
1134 }
1135}
void init_to_size(int size, const T &t)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1140
Definition: rect.h:34
bool x_overlap(const TBOX &box) const
Definition: rect.h:401
TBOX bounding_box() const
Definition: stepblob.cpp:253
C_BLOB_LIST * cblob_list()
Definition: werd.h:95

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD real_word,
PAGE_RES_IT pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1011 of file control.cpp.

1015 {
1016 GenericVector<bool> blob_wanted;
1017 word_wanted->init_to_size(outlines.size(), false);
1018 overlapped_any_blob->init_to_size(outlines.size(), false);
1019 target_blobs->init_to_size(outlines.size(), nullptr);
1020 // For each real blob, find the outlines that seriously overlap it.
1021 // A single blob could be several merged characters, so there can be quite
1022 // a few outlines overlapping, and the full engine needs to be used to chop
1023 // and join to get a sensible result.
1024 C_BLOB_IT blob_it(real_word->cblob_list());
1025 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1026 C_BLOB* blob = blob_it.data();
1027 const TBOX blob_box = blob->bounding_box();
1028 blob_wanted.init_to_size(outlines.size(), false);
1029 int num_blob_outlines = 0;
1030 for (int i = 0; i < outlines.size(); ++i) {
1031 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1032 !(*word_wanted)[i]) {
1033 blob_wanted[i] = true;
1034 (*overlapped_any_blob)[i] = true;
1035 ++num_blob_outlines;
1036 }
1037 }
1038 if (debug_noise_removal) {
1039 tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1040 blob_box.print();
1041 }
1042 // If any outlines overlap the blob, and not too many, classify the blob
1043 // (using the full engine, languages and all), and choose the maximal
1044 // combination of outlines that doesn't hurt the end-result classification
1045 // by too much. Mark them as wanted.
1046 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1047 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1048 outlines, num_blob_outlines,
1049 &blob_wanted)) {
1050 for (int i = 0; i < blob_wanted.size(); ++i) {
1051 if (blob_wanted[i]) {
1052 // Claim the outline and record where it is going.
1053 (*word_wanted)[i] = true;
1054 (*target_blobs)[i] = blob;
1055 }
1056 }
1057 }
1058 }
1059 }
1060}
void print() const
Definition: rect.h:278
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract osd_tess,
OSResults osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritic/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and their noise/diacritic status determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 200 of file pagesegmain.cpp.

203 {
204 Pix* photomask_pix = nullptr;
205 Pix* musicmask_pix = nullptr;
206 // The blocks made by the ColumnFinder. Moved to blocks before return.
207 BLOCK_LIST found_blocks;
208 TO_BLOCK_LIST temp_blocks;
209
210 ColumnFinder* finder = SetupPageSegAndDetectOrientation(
211 pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
212 pageseg_apply_music_mask ? &musicmask_pix : nullptr);
213 int result = 0;
214 if (finder != nullptr) {
215 TO_BLOCK_IT to_block_it(&temp_blocks);
216 TO_BLOCK* to_block = to_block_it.data();
217 if (musicmask_pix != nullptr) {
218 // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
219 // blocks separately. For now combine with photomask_pix.
220 pixOr(photomask_pix, photomask_pix, musicmask_pix);
221 }
222 if (equ_detect_) {
223 finder->SetEquationDetect(equ_detect_);
224 }
225 result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
226 to_block, photomask_pix, pix_thresholds_,
227 pix_grey_, &pixa_debug_, &found_blocks,
228 diacritic_blobs, to_blocks);
229 if (result >= 0)
230 finder->GetDeskewVectors(&deskew_, &reskew_);
231 delete finder;
232 }
233 pixDestroy(&photomask_pix);
234 pixDestroy(&musicmask_pix);
235 if (result < 0) return result;
236
237 blocks->clear();
238 BLOCK_IT block_it(blocks);
239 // Move the found blocks to the input/output blocks.
240 block_it.add_list_after(&found_blocks);
241 return result;
242}
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output
[in] word: The word whose best_choice we're evaluating
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 522 of file superscript.cpp.

526 {
527 int initial_ok_run_count = 0;
528 int ok_run_count = 0;
529 float worst_certainty = 0.0f;
530 const WERD_CHOICE &wc = *word.best_choice;
531
532 const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
533 for (int i = 0; i < wc.length(); i++) {
534 TBLOB *blob = word.rebuild_word->blobs[i];
535 UNICHAR_ID unichar_id = wc.unichar_id(i);
536 float char_certainty = wc.certainty(i);
537 bool bad_certainty = char_certainty < certainty_threshold;
538 bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
539 bool is_italic = word.fontinfo && word.fontinfo->is_italic();
540 BLOB_CHOICE *choice = word.GetBlobChoice(i);
541 if (choice && fontinfo_table.size() > 0) {
542 // Get better information from the specific choice, if available.
543 int font_id1 = choice->fontinfo_id();
544 bool font1_is_italic = font_id1 >= 0
545 ? fontinfo_table.get(font_id1).is_italic() : false;
546 int font_id2 = choice->fontinfo_id2();
547 is_italic = font1_is_italic &&
548 (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
549 }
550
551 float height_fraction = 1.0f;
552 float char_height = blob->bounding_box().height();
553 float normal_height = char_height;
554 if (wc.unicharset()->top_bottom_useful()) {
555 int min_bot, max_bot, min_top, max_top;
556 wc.unicharset()->get_top_bottom(unichar_id,
557 &min_bot, &max_bot,
558 &min_top, &max_top);
559 float hi_height = max_top - max_bot;
560 float lo_height = min_top - min_bot;
561 normal_height = (hi_height + lo_height) / 2;
562 if (normal_height >= kBlnXHeight) {
563 // Only ding characters that we have decent information for because
564 // they're supposed to be normal sized, not tiny specks or dashes.
565 height_fraction = char_height / normal_height;
566 }
567 }
568 bool bad_height = height_fraction < superscript_scaledown_ratio;
569
570 if (debug) {
571 if (is_italic) {
572 tprintf(" Rejecting: superscript is italic.\n");
573 }
574 if (is_punc) {
575 tprintf(" Rejecting: punctuation present.\n");
576 }
577 const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
578 if (bad_certainty) {
579 tprintf(" Rejecting: don't believe character %s with certainty %.2f "
580 "which is less than threshold %.2f\n", char_str,
581 char_certainty, certainty_threshold);
582 }
583 if (bad_height) {
584 tprintf(" Rejecting: character %s seems too small @ %.2f versus "
585 "expected %.2f\n", char_str, char_height, normal_height);
586 }
587 }
588 if (bad_certainty || bad_height || is_punc || is_italic) {
589 if (ok_run_count == i) {
590 initial_ok_run_count = ok_run_count;
591 }
592 ok_run_count = 0;
593 } else {
594 ok_run_count++;
595 }
596 if (char_certainty < worst_certainty) {
597 worst_certainty = char_certainty;
598 }
599 }
600 bool all_ok = ok_run_count == wc.length();
601 if (all_ok && debug) {
602 tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
603 }
604 if (!all_ok) {
605 if (left_ok) *left_ok = initial_ok_run_count;
606 if (right_ok) *right_ok = ok_run_count;
607 }
608 return all_ok;
609}
const int kBlnXHeight
Definition: normalis.h:24
int UNICHAR_ID
Definition: unichar.h:34
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int size() const
Return the size used.
const T & get(int id) const
Return the object from an id.
bool is_italic() const
Definition: fontinfo.h:111
TWERD * rebuild_word
Definition: pageres.h:266
const FontInfo * fontinfo
Definition: pageres.h:309
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:750
int16_t fontinfo_id2() const
Definition: ratngs.h:89
int16_t fontinfo_id() const
Definition: ratngs.h:86
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
int16_t height() const
Definition: rect.h:108
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
bool top_bottom_useful() const
Definition: unicharset.h:537
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386

◆ BestPix()

Pix * tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 233 of file tesseractclass.h.

233 {
234 if (pixGetWidth(pix_original_) == ImageWidth()) {
235 return pix_original_;
236 } else if (pix_grey_ != nullptr) {
237 return pix_grey_;
238 } else {
239 return pix_binary_;
240 }
241 }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES page_res)

Definition at line 467 of file control.cpp.

467 {
468 PAGE_RES_IT word_it(page_res);
469
470 WERD_RES *w_prev = nullptr;
471 WERD_RES *w = word_it.word();
472 while (true) {
473 w_prev = w;
474 while (word_it.forward() != nullptr &&
475 (!word_it.word() || word_it.word()->part_of_combo)) {
476 // advance word_it, skipping over parts of combos
477 }
478 if (!word_it.word()) break;
479 w = word_it.word();
480 if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
481 continue;
482 }
483 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
485 tprintf("Skipping because one of the words is W_REP_CHAR\n");
486 }
487 continue;
488 }
489 // Two words sharing the same language model, excellent!
490 GenericVector<WERD_CHOICE *> overrides_word1;
491 GenericVector<WERD_CHOICE *> overrides_word2;
492
493 const STRING orig_w1_str = w_prev->best_choice->unichar_string();
494 const STRING orig_w2_str = w->best_choice->unichar_string();
495 WERD_CHOICE prev_best(w->uch_set);
496 {
497 int w1start, w1end;
498 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
499 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
500 }
501 WERD_CHOICE this_best(w->uch_set);
502 {
503 int w2start, w2end;
504 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
505 this_best = w->best_choice->shallow_copy(w2start, w2end);
506 }
507
508 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
510 tprintf("Top choice \"%s %s\" verified by bigram model.\n",
511 orig_w1_str.string(), orig_w2_str.string());
512 }
513 continue;
514 }
515 if (tessedit_bigram_debug > 2) {
516 tprintf("Examining alt choices for \"%s %s\".\n",
517 orig_w1_str.string(), orig_w2_str.string());
518 }
519 if (tessedit_bigram_debug > 1) {
520 if (!w_prev->best_choices.singleton()) {
521 w_prev->PrintBestChoices();
522 }
523 if (!w->best_choices.singleton()) {
524 w->PrintBestChoices();
525 }
526 }
527 float best_rating = 0.0;
528 int best_idx = 0;
529 WERD_CHOICE_IT prev_it(&w_prev->best_choices);
530 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
531 WERD_CHOICE *p1 = prev_it.data();
532 WERD_CHOICE strip1(w->uch_set);
533 {
534 int p1start, p1end;
535 p1->GetNonSuperscriptSpan(&p1start, &p1end);
536 strip1 = p1->shallow_copy(p1start, p1end);
537 }
538 WERD_CHOICE_IT w_it(&w->best_choices);
539 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
540 WERD_CHOICE *p2 = w_it.data();
541 WERD_CHOICE strip2(w->uch_set);
542 {
543 int p2start, p2end;
544 p2->GetNonSuperscriptSpan(&p2start, &p2end);
545 strip2 = p2->shallow_copy(p2start, p2end);
546 }
547 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
548 overrides_word1.push_back(p1);
549 overrides_word2.push_back(p2);
550 if (overrides_word1.size() == 1 ||
551 p1->rating() + p2->rating() < best_rating) {
552 best_rating = p1->rating() + p2->rating();
553 best_idx = overrides_word1.size() - 1;
554 }
555 }
556 }
557 }
558 if (!overrides_word1.empty()) {
559 // Excellent, we have some bigram matches.
561 *overrides_word1[best_idx]) &&
563 *overrides_word2[best_idx])) {
564 if (tessedit_bigram_debug > 1) {
565 tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
566 "model.\n", orig_w1_str.string(), orig_w2_str.string());
567 }
568 continue;
569 }
570 const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571 const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572 if (new_w1_str != orig_w1_str) {
573 w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
574 }
575 if (new_w2_str != orig_w2_str) {
576 w->ReplaceBestChoice(overrides_word2[best_idx]);
577 }
578 if (tessedit_bigram_debug > 0) {
579 STRING choices_description;
580 int num_bigram_choices
581 = overrides_word1.size() * overrides_word2.size();
582 if (num_bigram_choices == 1) {
583 choices_description = "This was the unique bigram choice.";
584 } else {
585 if (tessedit_bigram_debug > 1) {
586 STRING bigrams_list;
587 const int kMaxChoicesToPrint = 20;
588 for (int i = 0; i < overrides_word1.size() &&
589 i < kMaxChoicesToPrint; i++) {
590 if (i > 0) { bigrams_list += ", "; }
591 WERD_CHOICE *p1 = overrides_word1[i];
592 WERD_CHOICE *p2 = overrides_word2[i];
593 bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
594 }
595 choices_description = "There were many choices: {";
596 choices_description += bigrams_list;
597 choices_description += "}";
598 } else {
599 choices_description.add_str_int("There were ", num_bigram_choices);
600 choices_description += " compatible bigrams.";
601 }
602 }
603 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
604 orig_w1_str.string(), orig_w2_str.string(),
605 new_w1_str.string(), new_w2_str.string(),
606 choices_description.string());
607 }
608 }
609 }
610}
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:809
@ W_REP_CHAR
repeated character
Definition: werd.h:38
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
Dict & getDict() override
const UNICHARSET * uch_set
Definition: pageres.h:203
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
tesseract::Tesseract * tesseract
Definition: pageres.h:280
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:795
void PrintBestChoices() const
Definition: pageres.cpp:717
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:418
const STRING & unichar_string() const
Definition: ratngs.h:531
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:401
float rating() const
Definition: ratngs.h:317
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES page_res)

Definition at line 710 of file control.cpp.

710 {
711 if (!wordrec_run_blamer) return;
712 PAGE_RES_IT page_res_it(page_res);
713 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
714 page_res_it.forward()) {
715 WERD_RES *word = page_res_it.word();
718 }
719 tprintf("Blame reasons:\n");
720 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
722 static_cast<IncorrectResultReason>(bl)),
723 page_res->blame_reasons[bl]);
724 }
725 if (page_res->misadaption_log.length() > 0) {
726 tprintf("Misadaption log:\n");
727 for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
728 tprintf("%s\n", page_res->misadaption_log[i].string());
729 }
730 }
731}
IncorrectResultReason
Definition: blamer.h:51
@ IRR_NUM_REASONS
Definition: blamer.h:98
int length() const
Definition: genericvector.h:86
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:560
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:64
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:120
GenericVector< int > blame_reasons
Definition: pageres.h:86
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
BlamerBundle * blamer_bundle
Definition: pageres.h:252
bool wordrec_run_blamer
Definition: wordrec.h:232
bool wordrec_debug_blamer
Definition: wordrec.h:231

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES page_res,
const TBOX selection_box 
)

Definition at line 943 of file pgedit.cpp.

944 {
945#ifndef DISABLED_LEGACY_ENGINE
946 PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
947 if (it != nullptr) {
948 WERD_RES* word_res = it->word();
949 word_res->x_height = it->row()->row->x_height();
950 word_res->SetupForRecognition(unicharset, this, BestPix(),
955 it->row()->row, it->block()->block);
956 TWERD* bln_word = word_res->chopped_word;
957 TBLOB* bln_blob = bln_word->blobs[0];
958 INT_FX_RESULT_STRUCT fx_info;
961 Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
962 &cn_features, &fx_info, nullptr);
963 // Display baseline features.
964 ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
966 for (int f = 0; f < bl_features.size(); ++f)
967 RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
968 bl_win->Update();
969 // Display cn features.
970 ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
972 for (int f = 0; f < cn_features.size(); ++f)
973 RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
974 cn_win->Update();
975
976 it->DeleteCurrentWord();
977 delete it;
978 }
979#endif // ndef DISABLED_LEGACY_ENGINE
980}
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1763
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1602
@ baseline
Definition: mfoutline.h:63
@ character
Definition: mfoutline.h:63
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
Pix * BestPix() const
Definition: blobs.h:418
float x_height() const
Definition: ocrrow.h:64
BLOCK * block
Definition: pageres.h:116
ROW * row
Definition: pageres.h:140
float x_height
Definition: pageres.h:316
TWERD * chopped_word
Definition: pageres.h:212
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:760
void DeleteCurrentWord()
Definition: pageres.cpp:1440
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:442
bool classify_nonlinear_norm
Definition: classify.h:452
bool classify_bln_numeric_mode
Definition: classify.h:508
static void Update()
Definition: scrollview.cpp:709

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 787 of file fixspace.cpp.

787 {
788 TBOX box; // BB of outline
789 int16_t outline_count = 0;
790 int16_t max_dimension;
791 int16_t largest_outline_dimension = 0;
792
793 for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
794 outline_count++;
795 box = ol->bounding_box();
796 if (box.height() > box.width()) {
797 max_dimension = box.height();
798 } else {
799 max_dimension = box.width();
800 }
801
802 if (largest_outline_dimension < max_dimension)
803 largest_outline_dimension = max_dimension;
804 }
805
806 if (outline_count > 5) {
807 // penalise LOTS of blobs
808 largest_outline_dimension *= 2;
809 }
810
811 box = blob->bounding_box();
812 if (box.bottom() > kBlnBaselineOffset * 4 ||
813 box.top() < kBlnBaselineOffset / 2) {
814 // Lax blob is if high or low
815 largest_outline_dimension /= 2;
816 }
817
818 return largest_outline_dimension;
819}
const int kBlnBaselineOffset
Definition: normalis.h:25
TESSLINE * next
Definition: blobs.h:281
TESSLINE * outlines
Definition: blobs.h:400
int16_t top() const
Definition: rect.h:58
int16_t width() const
Definition: rect.h:115
int16_t bottom() const
Definition: rect.h:65

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

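break_noisiest_blob_word(): Finds the word containing the blob that looks most like noise, then splits that word in two at that blob, deleting the noise blob. If no word contains a noise blob, the word list is cleared to signal termination.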

Definition at line 642 of file fixspace.cpp.

642 {
643 WERD_RES_IT word_it(&words);
644 WERD_RES_IT worst_word_it;
645 float worst_noise_score = 9999;
646 int worst_blob_index = -1; // Noisiest blob of noisiest wd
647 int blob_index; // of wds noisiest blob
648 float noise_score; // of wds noisiest blob
649 WERD_RES *word_res;
650 C_BLOB_IT blob_it;
651 C_BLOB_IT rej_cblob_it;
652 C_BLOB_LIST new_blob_list;
653 C_BLOB_IT new_blob_it;
654 C_BLOB_IT new_rej_cblob_it;
655 WERD *new_word;
656 int16_t start_of_noise_blob;
657 int16_t i;
658
659 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
660 blob_index = worst_noise_blob(word_it.data(), &noise_score);
661 if (blob_index > -1 && worst_noise_score > noise_score) {
662 worst_noise_score = noise_score;
663 worst_blob_index = blob_index;
664 worst_word_it = word_it;
665 }
666 }
667 if (worst_blob_index < 0) {
668 words.clear(); // signal termination
669 return;
670 }
671
672 /* Now split the worst_word_it */
673
674 word_res = worst_word_it.data();
675
676 /* Move blobs before noise blob to a new bloblist */
677
678 new_blob_it.set_to_list(&new_blob_list);
679 blob_it.set_to_list(word_res->word->cblob_list());
680 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681 new_blob_it.add_after_then_move(blob_it.extract());
682 }
683 start_of_noise_blob = blob_it.data()->bounding_box().left();
684 delete blob_it.extract(); // throw out noise blob
685
686 new_word = new WERD(&new_blob_list, word_res->word);
687 new_word->set_flag(W_EOL, false);
688 word_res->word->set_flag(W_BOL, false);
689 word_res->word->set_blanks(1); // After break
690
691 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
692 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
693 for (;
694 (!rej_cblob_it.empty() &&
695 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696 rej_cblob_it.forward()) {
697 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
698 }
699
700 auto* new_word_res = new WERD_RES(new_word);
701 new_word_res->combination = true;
702 worst_word_it.add_before_then_move(new_word_res);
703
704 word_res->ClearResults();
705}
@ W_EOL
end of line
Definition: werd.h:33
@ W_BOL
start of line
Definition: werd.h:32
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:707
void ClearResults()
Definition: pageres.cpp:1104
Definition: werd.h:56
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
void set_blanks(uint8_t new_blanks)
Definition: werd.h:102
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 298 of file pgedit.cpp.

298 {
299 SVMenuNode* parent_menu;
300 auto* root_menu_item = new SVMenuNode();
301
302 SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
303
304 modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
305 modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
306 modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
307 modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
308 modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
309 modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
310 modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
311 modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
312
313 parent_menu = root_menu_item->AddChild("DISPLAY");
314
315 parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
316 parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
317 parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
318 parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
319 parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
320 parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
321 parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
322 parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
323 parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
324 parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
325 parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
326 parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
327 parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
328 parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
329 parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
330
331
332 parent_menu = root_menu_item->AddChild("OTHER");
333
334 parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
335 parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
336 parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
337 parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
338 parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
339 parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
340
341 return root_menu_item;
342}
@ SHOW_DROPCAPS_CMD_EVENT
Definition: pgedit.cpp:77
@ SHOW_SMALLCAPS_CMD_EVENT
Definition: pgedit.cpp:76
@ UNIFORM_DISP_CMD_EVENT
Definition: pgedit.cpp:63
@ SHOW_SUPERSCRIPT_CMD_EVENT
Definition: pgedit.cpp:70
@ BLOCKS_CMD_EVENT
Definition: pgedit.cpp:61
@ SHOW_UNDERLINE_CMD_EVENT
Definition: pgedit.cpp:73
@ BL_NORM_CMD_EVENT
Definition: pgedit.cpp:58
@ SHOW_BOLD_CMD_EVENT
Definition: pgedit.cpp:72
@ QUIT_CMD_EVENT
Definition: pgedit.cpp:65
@ POLYGONAL_CMD_EVENT
Definition: pgedit.cpp:57
@ SHOW_SUBSCRIPT_CMD_EVENT
Definition: pgedit.cpp:69
@ IMAGE_CMD_EVENT
Definition: pgedit.cpp:60
@ BOUNDING_BOX_CMD_EVENT
Definition: pgedit.cpp:55
@ BASELINES_CMD_EVENT
Definition: pgedit.cpp:62
@ SHOW_BLN_WERD_CMD_EVENT
Definition: pgedit.cpp:52
@ DEBUG_WERD_CMD_EVENT
Definition: pgedit.cpp:53
@ SHOW_ITALIC_CMD_EVENT
Definition: pgedit.cpp:71
@ SHOW_FIXEDPITCH_CMD_EVENT
Definition: pgedit.cpp:74
@ CORRECT_TEXT_CMD_EVENT
Definition: pgedit.cpp:56
@ BLAMER_CMD_EVENT
Definition: pgedit.cpp:54
@ SHOW_SERIF_CMD_EVENT
Definition: pgedit.cpp:75
@ CHANGE_DISP_CMD_EVENT
Definition: pgedit.cpp:49
@ SHOW_POINT_CMD_EVENT
Definition: pgedit.cpp:51
@ REFRESH_CMD_EVENT
Definition: pgedit.cpp:64
@ BITMAP_CMD_EVENT
Definition: pgedit.cpp:59
@ SHOW_BLOB_FEATURES
Definition: pgedit.cpp:68
@ DUMP_WERD_CMD_EVENT
Definition: pgedit.cpp:50
@ RECOG_PSEUDO
Definition: tessedit.cpp:472
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:58

◆ check_debug_pt()

bool tesseract::Tesseract::check_debug_pt ( WERD_RES *  word,
int  location 
)

Definition at line 1849 of file control.cpp.

1849 {
1850 bool show_map_detail = false;
1851 int16_t i;
1852
1853 if (!test_pt)
1854 return false;
1855
1856 tessedit_rejection_debug.set_value (false);
1857 debug_x_ht_level.set_value(0);
1858
1860 if (location < 0)
1861 return true; // For breakpoint use
1862 tessedit_rejection_debug.set_value(true);
1863 debug_x_ht_level.set_value(2);
1864 tprintf ("\n\nTESTWD::");
1865 switch (location) {
1866 case 0:
1867 tprintf ("classify_word_pass1 start\n");
1868 word->word->print();
1869 break;
1870 case 10:
1871 tprintf ("make_reject_map: initial map");
1872 break;
1873 case 20:
1874 tprintf ("make_reject_map: after NN");
1875 break;
1876 case 30:
1877 tprintf ("classify_word_pass2 - START");
1878 break;
1879 case 40:
1880 tprintf ("classify_word_pass2 - Pre Xht");
1881 break;
1882 case 50:
1883 tprintf ("classify_word_pass2 - END");
1884 show_map_detail = true;
1885 break;
1886 case 60:
1887 tprintf ("fixspace");
1888 break;
1889 case 70:
1890 tprintf ("MM pass START");
1891 break;
1892 case 80:
1893 tprintf ("MM pass END");
1894 break;
1895 case 90:
1896 tprintf ("After Poor quality rejection");
1897 break;
1898 case 100:
1899 tprintf ("unrej_good_quality_words - START");
1900 break;
1901 case 110:
1902 tprintf ("unrej_good_quality_words - END");
1903 break;
1904 case 120:
1905 tprintf ("Write results pass");
1906 show_map_detail = true;
1907 break;
1908 }
1909 if (word->best_choice != nullptr) {
1910 tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1911 word->reject_map.print(debug_fp);
1912 tprintf("\n");
1913 if (show_map_detail) {
1914 tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1915 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1916 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1917 word->reject_map[i].full_print(debug_fp);
1918 }
1919 }
1920 } else {
1921 tprintf("null best choice\n");
1922 }
1923 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1924 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1925 return true;
1926 } else {
1927 return false;
1928 }
1929}
FILE * debug_fp
Definition: tessvars.cpp:24
bool done
Definition: pageres.h:305
bool tess_accepted
Definition: pageres.h:303
REJMAP reject_map
Definition: pageres.h:294
Definition: points.h:189
bool contains(const FCOORD pt) const
Definition: rect.h:333
void print(FILE *fp)
Definition: rejctmap.cpp:321
void full_print(FILE *fp)
Definition: rejctmap.cpp:333
void print()
Definition: werd.cpp:253
TBOX bounding_box() const
Definition: werd.cpp:148

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT *  pr_it,
WordData *  word_data 
)

Definition at line 1319 of file control.cpp.

1320 {
1321#ifdef DISABLED_LEGACY_ENGINE
1323#else
1324 WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1326#endif // def DISABLED_LEGACY_ENGINE
1327
1328 // Best result so far.
1329 PointerVector<WERD_RES> best_words;
1330 // Points to the best result. May be word or in lang_words.
1331 const WERD_RES* word = word_data->word;
1332 clock_t start_t = clock();
1333 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1334 if (debug) {
1335 tprintf("%s word with lang %s at:",
1336 word->done ? "Already done" : "Processing",
1337 most_recently_used_->lang.string());
1338 word->word->bounding_box().print();
1339 }
1340 if (word->done) {
1341 // If done on pass1, leave it as-is.
1342 if (!word->tess_failed)
1343 most_recently_used_ = word->tesseract;
1344 return;
1345 }
1346 int sub = sub_langs_.size();
1347 if (most_recently_used_ != this) {
1348 // Get the index of the most_recently_used_.
1349 for (sub = 0; sub < sub_langs_.size() &&
1350 most_recently_used_ != sub_langs_[sub]; ++sub) {}
1351 }
1352 most_recently_used_->RetryWithLanguage(
1353 *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1354 Tesseract* best_lang_tess = most_recently_used_;
1355 if (!WordsAcceptable(best_words)) {
1356 // Try all the other languages to see if they are any better.
1357 if (most_recently_used_ != this &&
1358 this->RetryWithLanguage(*word_data, recognizer, debug,
1359 &word_data->lang_words[sub_langs_.size()],
1360 &best_words) > 0) {
1361 best_lang_tess = this;
1362 }
1363 for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1364 ++i) {
1365 if (most_recently_used_ != sub_langs_[i] &&
1366 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1367 &word_data->lang_words[i],
1368 &best_words) > 0) {
1369 best_lang_tess = sub_langs_[i];
1370 }
1371 }
1372 }
1373 most_recently_used_ = best_lang_tess;
1374 if (!best_words.empty()) {
1375 if (best_words.size() == 1 && !best_words[0]->combination) {
1376 // Move the best single result to the main word.
1377 word_data->word->ConsumeWordResults(best_words[0]);
1378 } else {
1379 // Words came from LSTM, and must be moved to the PAGE_RES properly.
1380 word_data->word = best_words.back();
1381 pr_it->ReplaceCurrentWord(&best_words);
1382 }
1383 ASSERT_HOST(word_data->word->box_word != nullptr);
1384 } else {
1385 tprintf("no best words!!\n");
1386 }
1387 clock_t ocr_t = clock();
1389 tprintf("%s (ocr took %.2f sec)\n",
1390 word_data->word->best_choice->unichar_string().string(),
1391 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1392 }
1393}
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1401
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:904
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1572
bool tess_failed
Definition: pageres.h:295
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1333
STRING lang
Definition: ccutil.h:71

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1401 of file control.cpp.

1403 {
1404 ROW* row = word_data.row;
1405 BLOCK* block = word_data.block;
1406 prev_word_best_choice_ = word_data.prev_word != nullptr
1407 ? word_data.prev_word->word->best_choice : nullptr;
1408#ifndef ANDROID_BUILD
1409#ifdef DISABLED_LEGACY_ENGINE
1411#else
1414#endif // def DISABLED_LEGACY_ENGINE
1415 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1416 LSTMRecognizeWord(*block, row, *in_word, out_words);
1417 if (!out_words->empty())
1418 return; // Successful lstm recognition.
1419 }
1421 // No fallback allowed, so use a fake.
1422 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1423 return;
1424 }
1425
1426 #ifndef DISABLED_LEGACY_ENGINE
1427 // Fall back to tesseract for failed words or odd words.
1428 (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1429 OEM_TESSERACT_ONLY, nullptr,
1432 poly_allow_detailed_fx, row, block);
1433#endif // ndef DISABLED_LEGACY_ENGINE
1434 }
1435#endif // ndef ANDROID_BUILD
1436
1437#ifndef DISABLED_LEGACY_ENGINE
1438 WERD_RES* word = *in_word;
1439 match_word_pass_n(1, word, row, block);
1440 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1441 word->tess_would_adapt = AdaptableWord(word);
1442 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1443
1444 if (adapt_ok) {
1445 // Send word to adaptive classifier for training.
1447 LearnWord(nullptr, word);
1448 // Mark misadaptions if running blamer.
1449 if (word->blamer_bundle != nullptr) {
1452 }
1453 }
1454
1455 if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1457 }
1458#endif // ndef DISABLED_LEGACY_ENGINE
1459}
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:271
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:229
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1630
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:587
Definition: ocrblock.h:31
Definition: ocrrow.h:37
bool tess_would_adapt
Definition: pageres.h:304
bool IsAmbiguous()
Definition: pageres.cpp:452
void BestChoiceToCorrectText()
Definition: pageres.cpp:923
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
const UNICHARSET & GetUnicharset() const
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1572 of file control.cpp.

1574 {
1575 // Return if we do not want to run Tesseract.
1577 return;
1578 }
1579#ifndef DISABLED_LEGACY_ENGINE
1580 ROW* row = word_data.row;
1581 BLOCK* block = word_data.block;
1582 WERD_RES* word = *in_word;
1583 prev_word_best_choice_ = word_data.prev_word != nullptr
1584 ? word_data.prev_word->word->best_choice : nullptr;
1585
1587 check_debug_pt(word, 30);
1588 if (!word->done) {
1589 word->caps_height = 0.0;
1590 if (word->x_height == 0.0f)
1591 word->x_height = row->x_height();
1592 match_word_pass_n(2, word, row, block);
1593 check_debug_pt(word, 40);
1594 }
1595
1597
1598 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1600 block->classify_rotation().y() == 0.0f) {
1601 // Use the tops and bottoms since they are available.
1602 TrainedXheightFix(word, block, row);
1603 }
1604
1606 }
1607#ifndef GRAPHICS_DISABLED
1609 if (fx_win == nullptr)
1610 create_fx_win();
1611 clear_fx_win();
1612 word->rebuild_word->plot(fx_win);
1613 TBOX wbox = word->rebuild_word->bounding_box();
1614 fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1615 wbox.right(), wbox.bottom());
1617 }
1618#endif
1620 check_debug_pt(word, 50);
1621#endif // ndef DISABLED_LEGACY_ENGINE
1622}
#define SUBLOC_NORM
Definition: errcode.h:58
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:30
ScrollView * fx_win
Definition: drawfx.cpp:40
void clear_fx_win()
Definition: drawfx.cpp:62
void create_fx_win()
Definition: drawfx.cpp:49
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1485
TBOX bounding_box() const
Definition: blobs.cpp:861
void plot(ScrollView *window)
Definition: blobs.cpp:897
FCOORD classify_rotation() const
Definition: ocrblock.h:140
float caps_height
Definition: pageres.h:317
float y() const
Definition: points.h:210
int16_t left() const
Definition: rect.h:72
int16_t right() const
Definition: rect.h:79
bool script_has_xheight() const
Definition: unicharset.h:904
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str,
float *  c2 
)

Definition at line 1270 of file control.cpp.

1271 {
1272 WERD* real_word = pr_it->word()->word;
1273 WERD* word = real_word->ConstructFromSingleBlob(
1274 real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1275 WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1276 // Get a new iterator that points to the new word.
1277 PAGE_RES_IT it(pr_it->page_res);
1278 while (it.word() != word_res && it.word() != nullptr) it.forward();
1279 ASSERT_HOST(it.word() == word_res);
1280 WordData wd(it);
1281 // Force full initialization.
1282 SetupWordPassN(1, &wd);
1283 classify_word_and_language(pass_n, &it, &wd);
1284 if (debug_noise_removal) {
1285 if (wd.word->raw_choice != nullptr) {
1286 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1287 wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1288 wd.word->raw_choice->max_x_height());
1289 } else {
1290 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1291 wd.row->x_height());
1292 }
1293 }
1294 float cert = 0.0f;
1295 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1296 cert = wd.word->raw_choice->certainty();
1297 float rat = wd.word->raw_choice->rating();
1298 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1299 *best_str = wd.word->raw_choice->unichar_string();
1300 } else {
1301 *c2 = 0.0f;
1302 *best_str = "";
1303 }
1304 it.DeleteCurrentWord();
1305 pr_it->ResetWordIterator();
1306 return cert;
1307}
void ResetWordIterator()
Definition: pageres.cpp:1523
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1213
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:125

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str 
)

Definition at line 1226 of file control.cpp.

1229 {
1230 C_OUTLINE_IT ol_it;
1231 C_OUTLINE* first_to_keep = nullptr;
1232 C_BLOB* local_blob = nullptr;
1233 if (blob != nullptr) {
1234 // Add the required outlines to the blob.
1235 ol_it.set_to_list(blob->out_list());
1236 first_to_keep = ol_it.data();
1237 }
1238 for (int i = 0; i < ok_outlines.size(); ++i) {
1239 if (ok_outlines[i]) {
1240 // This outline is to be added.
1241 if (blob == nullptr) {
1242 local_blob = new C_BLOB(outlines[i]);
1243 blob = local_blob;
1244 ol_it.set_to_list(blob->out_list());
1245 } else {
1246 ol_it.add_before_stay_put(outlines[i]);
1247 }
1248 }
1249 }
1250 float c2;
1251 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1252 ol_it.move_to_first();
1253 if (first_to_keep == nullptr) {
1254 // We created blob. Empty its outlines and delete it.
1255 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1256 delete local_blob;
1257 cert = -c2;
1258 } else {
1259 // Remove the outlines that we put in.
1260 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1261 ol_it.extract();
1262 }
1263 }
1264 return cert;
1265}
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1270
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 564 of file tesseractclass.cpp.

564 {
565 STRING debug_name = imagebasename + "_debug.pdf";
566 pixa_debug_.WritePDF(debug_name.string());
567 pixDestroy(&pix_binary_);
568 pixDestroy(&pix_grey_);
569 pixDestroy(&pix_thresholds_);
570 pixDestroy(&scaled_color_);
571 deskew_ = FCOORD(1.0f, 0.0f);
572 reskew_ = FCOORD(1.0f, 0.0f);
573 splitter_.Clear();
574 scaled_factor_ = -1;
575 for (int i = 0; i < sub_langs_.size(); ++i)
576 sub_langs_[i]->Clear();
577}
void WritePDF(const char *filename)
Definition: debugpixa.h:36
STRING imagebasename
Definition: ccutil.h:70

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES *  word_res,
float *  baseline_shift 
)

Definition at line 102 of file fixxht.cpp.

103 {
104 STATS top_stats(0, UINT8_MAX);
105 STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106 int bottom_shift = 0;
107 int num_blobs = word_res->rebuild_word->NumBlobs();
108 do {
109 top_stats.clear();
110 shift_stats.clear();
111 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
112 TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
113 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
114 if (unicharset.get_isalpha(class_id) ||
115 unicharset.get_isdigit(class_id)) {
116 int top = blob->bounding_box().top() + bottom_shift;
117 // Clip the top to the limit of normalized feature space.
118 if (top >= INT_FEAT_RANGE)
119 top = INT_FEAT_RANGE - 1;
120 int bottom = blob->bounding_box().bottom() + bottom_shift;
121 int min_bottom, max_bottom, min_top, max_top;
122 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
123 &min_top, &max_top);
124 // Chars with a wild top range would mess up the result so ignore them.
125 if (max_top - min_top > kMaxCharTopRange)
126 continue;
127 int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
128 top - (max_top + x_ht_acceptance_tolerance));
129 int height = top - kBlnBaselineOffset;
130 if (debug_x_ht_level >= 2) {
131 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
132 unicharset.id_to_unichar(class_id),
133 height, min_bottom, max_bottom, min_top, max_top,
134 bottom, top);
135 }
136 // Use only chars that fit in the expected bottom range, and where
137 // the range of tops is sensibly near the xheight.
138 if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
139 bottom - x_ht_acceptance_tolerance <= max_bottom &&
140 min_top > kBlnBaselineOffset &&
141 max_top - kBlnBaselineOffset >= kBlnXHeight &&
142 misfit_dist > 0) {
143 // Compute the x-height position using proportionality between the
144 // actual height and expected height.
145 int min_xht = DivRounded(height * kBlnXHeight,
146 max_top - kBlnBaselineOffset);
147 int max_xht = DivRounded(height * kBlnXHeight,
148 min_top - kBlnBaselineOffset);
149 if (debug_x_ht_level >= 2) {
150 tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
151 }
152 // The range of expected heights gets a vote equal to the distance
153 // of the actual top from the expected top.
154 for (int y = min_xht; y <= max_xht; ++y)
155 top_stats.add(y, misfit_dist);
156 } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
157 bottom - x_ht_acceptance_tolerance > max_bottom) &&
158 bottom_shift == 0) {
159 // Get the range of required bottom shift.
160 int min_shift = min_bottom - bottom;
161 int max_shift = max_bottom - bottom;
162 if (debug_x_ht_level >= 2) {
163 tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
164 }
165 // The range of expected shifts gets a vote equal to the min distance
166 // of the actual bottom from the expected bottom, spread over the
167 // range of its acceptance.
168 int misfit_weight = abs(min_shift);
169 if (max_shift > min_shift)
170 misfit_weight /= max_shift - min_shift;
171 for (int y = min_shift; y <= max_shift; ++y)
172 shift_stats.add(y, misfit_weight);
173 } else {
174 if (bottom_shift == 0) {
175 // Things with bottoms that are already ok need to say so, on the
176 // 1st iteration only.
177 shift_stats.add(0, kBlnBaselineOffset);
178 }
179 if (debug_x_ht_level >= 2) {
180 tprintf(" already OK\n");
181 }
182 }
183 }
184 }
185 if (shift_stats.get_total() > top_stats.get_total()) {
186 bottom_shift = IntCastRounded(shift_stats.median());
187 if (debug_x_ht_level >= 2) {
188 tprintf("Applying bottom shift=%d\n", bottom_shift);
189 }
190 }
191 } while (bottom_shift != 0 &&
192 top_stats.get_total() < shift_stats.get_total());
193 // Baseline shift is opposite sign to the bottom shift.
194 *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
195 if (debug_x_ht_level >= 2) {
196 tprintf("baseline shift=%g\n", *baseline_shift);
197 }
198 if (top_stats.get_total() == 0)
199 return bottom_shift != 0 ? word_res->x_height : 0.0f;
200 // The new xheight is just the median vote, which is then scaled out
201 // of BLN space back to pixel space to get the x-height in pixel space.
202 float new_xht = top_stats.median();
203 if (debug_x_ht_level >= 2) {
204 tprintf("Median xht=%f\n", new_xht);
205 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
206 new_xht, new_xht / word_res->denorm.y_scale());
207 }
208 // The xheight must change by at least x_ht_min_change to be used.
209 if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
210 return new_xht / word_res->denorm.y_scale();
211 else
212 return bottom_shift != 0 ? word_res->x_height : 0.0f;
213}
int DivRounded(int a, int b)
Definition: helpers.h:167
int IntCastRounded(double x)
Definition: helpers.h:175
#define INT_FEAT_RANGE
Definition: float2int.h:27
const int kMaxCharTopRange
Definition: fixxht.cpp:67
int NumBlobs() const
Definition: blobs.h:448
float y_scale() const
Definition: normalis.h:270
DENORM denorm
Definition: pageres.h:201
Definition: statistc.h:31

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 659 of file docqual.cpp.

659 {
660 int i;
661 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
662 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
663 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
664 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
665 for (i = 0; i < word_res->reject_map.length(); ++i) {
666 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
667 word_res->best_choice->set_unichar_id(unichar_dash, i);
668 if (word_res->reject_map[i].accepted ())
669 word_res->reject_map[i].setrej_unlv_rej ();
670 }
671 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
672 word_res->best_choice->set_unichar_id(unichar_space, i);
673 if (word_res->reject_map[i].accepted ())
674 word_res->reject_map[i].setrej_unlv_rej ();
675 }
676 }
677}
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
int32_t length() const
Definition: rejctmap.h:223
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 534 of file applybox.cpp.

535 {
536 for (int step = 0; *utf8 != '\0'; utf8 += step) {
537 const char* next_space = strchr(utf8, ' ');
538 if (next_space == nullptr)
539 next_space = utf8 + strlen(utf8);
540 step = next_space - utf8;
541 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
542 if (class_id == INVALID_UNICHAR_ID) {
543 return false;
544 }
545 while (utf8[step] == ' ')
546 ++step;
547 class_ids->push_back(class_id);
548 }
549 return true;
550}

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES * page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 776 of file applybox.cpp.

776 {
777 PAGE_RES_IT pr_it(page_res);
778 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
779 word_res = pr_it.forward()) {
780 auto* choice = new WERD_CHOICE(word_res->uch_set,
781 word_res->correct_text.size());
782 for (int i = 0; i < word_res->correct_text.size(); ++i) {
783 // The part before the first space is the real ground truth, and the
784 // rest is the bounding box location and page number.
785 GenericVector<STRING> tokens;
786 word_res->correct_text[i].split(' ', &tokens);
787 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
788 choice->append_unichar_id_space_allocated(char_id,
789 word_res->best_state[i],
790 0.0f, 0.0f);
791 }
792 word_res->ClearWordChoices();
793 word_res->LogNewRawChoice(choice);
794 word_res->LogNewCookedChoice(1, false, choice);
795 }
796}

◆ count_alphanums() [1/2]

int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE & word)

Definition at line 376 of file output.cpp.

376 {
377 int count = 0;
378 for (int i = 0; i < word.length(); ++i) {
379 if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
380 word.unicharset()->get_isdigit(word.unichar_id(i)))
381 count++;
382 }
383 return count;
384}

◆ count_alphanums() [2/2]

int16_t tesseract::Tesseract::count_alphanums ( WERD_RES * word)

Definition at line 559 of file reject.cpp.

559 {
560 int count = 0;
561 const WERD_CHOICE *best_choice = word_res->best_choice;
562 for (int i = 0; i < word_res->reject_map.length(); ++i) {
563 if ((word_res->reject_map[i].accepted()) &&
564 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566 count++;
567 }
568 }
569 return count;
570}

◆ count_alphas()

int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE & word)

Definition at line 366 of file output.cpp.

366 {
367 int count = 0;
368 for (int i = 0; i < word.length(); ++i) {
369 if (word.unicharset()->get_isalpha(word.unichar_id(i)))
370 count++;
371 }
372 return count;
373}

◆ count_outline_errs()

int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 126 of file docqual.cpp.

126 {
127 int expected_outline_count;
128
129 if (STRING (outlines_odd).contains (c))
130 return 0; // Don't use this char
131 else if (STRING (outlines_2).contains (c))
132 expected_outline_count = 2;
133 else
134 expected_outline_count = 1;
135 return abs (outline_count - expected_outline_count);
136}

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES * word_res)

Definition at line 70 of file fixxht.cpp.

70 {
71 int bad_blobs = 0;
72 int num_blobs = word_res->rebuild_word->NumBlobs();
73 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
74 TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
75 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
76 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
77 int top = blob->bounding_box().top();
78 if (top >= INT_FEAT_RANGE)
79 top = INT_FEAT_RANGE - 1;
80 int min_bottom, max_bottom, min_top, max_top;
81 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
82 &min_top, &max_top);
83 if (max_top - min_top > kMaxCharTopRange)
84 continue;
85 bool bad = top < min_top - x_ht_acceptance_tolerance ||
86 top > max_top + x_ht_acceptance_tolerance;
87 if (bad)
88 ++bad_blobs;
89 if (debug_x_ht_level >= 1) {
90 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
91 unicharset.id_to_unichar(class_id),
92 bad ? "Misfit" : "OK", top, min_top, max_top,
93 static_cast<int>(x_ht_acceptance_tolerance));
94 }
95 }
96 }
97 return bad_blobs;
98}

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES * page_res,
const TBOX & selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 665 of file pgedit.cpp.

665 {
666#ifndef DISABLED_LEGACY_ENGINE
667 ResetAdaptiveClassifier();
668#endif
669 recog_all_words(page_res, nullptr, &selection_box, word_config_.string(), 0);
670}
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:302

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES * page_res)

Definition at line 2093 of file control.cpp.

2093 {
2094 PAGE_RES_IT word_it(page_res);
2095 for (WERD_RES* word = word_it.word(); word != nullptr;
2096 word = word_it.forward()) {
2097 if (word->best_choices.singleton())
2098 continue; // There are no alternates.
2099
2100 const WERD_CHOICE* best = word->best_choice;
2101 if (word->tesseract->getDict().valid_word(*best) != 0)
2102 continue; // The best choice is in the dictionary.
2103
2104 WERD_CHOICE_IT choice_it(&word->best_choices);
2105 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2106 choice_it.forward()) {
2107 WERD_CHOICE* alternate = choice_it.data();
2108 if (word->tesseract->getDict().valid_word(*alternate)) {
2109 // The alternate choice is in the dictionary.
2110 if (tessedit_bigram_debug) {
2111 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2112 best->unichar_string().string(),
2113 alternate->unichar_string().string());
2114 }
2115 // Replace the 'best' choice with a better choice.
2116 word->ReplaceBestChoice(alternate);
2117 break;
2118 }
2119 }
2120 }
2121}

◆ digit_or_numeric_punct()

bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES * word,
int  char_position 
)

Definition at line 370 of file fixspace.cpp.

370 {
371 int i;
372 int offset;
373
374 for (i = 0, offset = 0; i < char_position;
375 offset += word->best_choice->unichar_lengths()[i++]);
376 return (
377 word->uch_set->get_isdigit(
378 word->best_choice->unichar_string().string() + offset,
379 word->best_choice->unichar_lengths()[i]) ||
380 (word->best_choice->permuter() == NUMBER_PERM &&
381 STRING(numeric_punctuation).contains(
382 word->best_choice->unichar_string().string()[offset])));
383}
@ NUMBER_PERM
Definition: ratngs.h:239
uint8_t permuter() const
Definition: ratngs.h:336
const STRING & unichar_lengths() const
Definition: ratngs.h:538

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 349 of file pgedit.cpp.

350 {
351 int block_count = 1;
352
353 image_win->Clear();
354 if (display_image) {
355 image_win->Image(pix_binary_, 0, 0);
356 }
357
358 image_win->Brush(ScrollView::NONE);
359 PAGE_RES_IT pr_it(current_page_res);
360 for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
361 (this->*word_painter)(&pr_it);
362 if (display_baselines && pr_it.row() != pr_it.prev_row())
363 pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
364 if (display_blocks && pr_it.block() != pr_it.prev_block())
365 pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
366 }
367 image_win->Update();
368}
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:765
void Clear()
Definition: scrollview.cpp:589
void Brush(Color color)
Definition: scrollview.cpp:725

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT & page_res_it,
bool  good_quality_doc 
)

Definition at line 232 of file docqual.cpp.

234 {
235 int16_t block_no = 0;
236 int16_t row_no = 0;
237 BLOCK_RES *current_block;
238 ROW_RES *current_row;
239
240 bool rej_word;
241 bool prev_word_rejected;
242 int16_t char_quality = 0;
243 int16_t accepted_char_quality;
244
245 if (page_res_it.page_res->rej_count * 100.0 /
246 page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
247 reject_whole_page(page_res_it);
248 if (tessedit_debug_doc_rejection) {
249 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
250 page_res_it.page_res->char_count,
251 page_res_it.page_res->rej_count);
252 }
253 } else {
254 if (tessedit_debug_doc_rejection) {
255 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
256 page_res_it.page_res->char_count,
257 page_res_it.page_res->rej_count);
258 }
259
260 /* Walk blocks testing for block rejection */
261
262 page_res_it.restart_page();
263 WERD_RES* word;
264 while ((word = page_res_it.word()) != nullptr) {
265 current_block = page_res_it.block();
266 block_no = current_block->block->pdblk.index();
267 if (current_block->char_count > 0 &&
268 (current_block->rej_count * 100.0 / current_block->char_count) >
269 tessedit_reject_block_percent) {
270 if (tessedit_debug_block_rejection) {
271 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
272 block_no, current_block->char_count,
273 current_block->rej_count);
274 }
275 prev_word_rejected = false;
276 while ((word = page_res_it.word()) != nullptr &&
277 (page_res_it.block() == current_block)) {
278 if (tessedit_preserve_blk_rej_perfect_wds) {
279 rej_word = word->reject_map.reject_count() > 0 ||
280 word->reject_map.length () < tessedit_preserve_min_wd_len;
281 if (rej_word && tessedit_dont_blkrej_good_wds &&
282 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
283 acceptable_word_string(
284 *word->uch_set,
285 word->best_choice->unichar_string().string(),
286 word->best_choice->unichar_lengths().string()) !=
287 AC_UNACCEPTABLE) {
288 word_char_quality(word, page_res_it.row()->row,
289 &char_quality,
290 &accepted_char_quality);
291 rej_word = char_quality != word->reject_map.length();
292 }
293 } else {
294 rej_word = true;
295 }
296 if (rej_word) {
297 /*
298 Reject spacing if both current and prev words are rejected.
299 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
300 generated more space errors.
301 */
302 if (tessedit_use_reject_spaces &&
303 prev_word_rejected &&
304 page_res_it.prev_row() == page_res_it.row() &&
305 word->word->space() == 1)
306 word->reject_spaces = true;
307 word->reject_map.rej_word_block_rej();
308 }
309 prev_word_rejected = rej_word;
310 page_res_it.forward();
311 }
312 } else {
313 if (tessedit_debug_block_rejection) {
314 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
315 block_no, page_res_it.block()->char_count,
316 page_res_it.block()->rej_count);
317 }
318
319 /* Walk rows in block testing for row rejection */
320 row_no = 0;
321 while (page_res_it.word() != nullptr &&
322 page_res_it.block() == current_block) {
323 current_row = page_res_it.row();
324 row_no++;
325 /* Reject whole row if:
326 fraction of chars on row which are rejected exceed a limit AND
327 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
328 limit
329 */
330 if (current_row->char_count > 0 &&
331 (current_row->rej_count * 100.0 / current_row->char_count) >
332 tessedit_reject_row_percent &&
333 (current_row->whole_word_rej_count * 100.0 /
334 current_row->rej_count) <
335 tessedit_whole_wd_rej_row_percent) {
336 if (tessedit_debug_block_rejection) {
337 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
338 row_no, current_row->char_count,
339 current_row->rej_count);
340 }
341 prev_word_rejected = false;
342 while ((word = page_res_it.word()) != nullptr &&
343 page_res_it.row () == current_row) {
344 /* Preserve words on good docs unless they are mostly rejected*/
345 if (!tessedit_row_rej_good_docs && good_quality_doc) {
346 rej_word = word->reject_map.reject_count() /
347 static_cast<float>(word->reject_map.length()) >
348 tessedit_good_doc_still_rowrej_wd;
349 } else if (tessedit_preserve_row_rej_perfect_wds) {
350 /* Preserve perfect words anyway */
351 rej_word = word->reject_map.reject_count() > 0 ||
352 word->reject_map.length () < tessedit_preserve_min_wd_len;
353 if (rej_word && tessedit_dont_rowrej_good_wds &&
354 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
355 acceptable_word_string(*word->uch_set,
356 word->best_choice->unichar_string().string(),
357 word->best_choice->unichar_lengths().string()) !=
358 AC_UNACCEPTABLE) {
359 word_char_quality(word, page_res_it.row()->row,
360 &char_quality,
361 &accepted_char_quality);
362 rej_word = char_quality != word->reject_map.length();
363 }
364 } else {
365 rej_word = true;
366 }
367 if (rej_word) {
368 /*
369 Reject spacing if both current and prev words are rejected.
370 NOTE - this is NOT restricted to FUZZY spaces. - When tried
371 this generated more space errors.
372 */
373 if (tessedit_use_reject_spaces &&
374 prev_word_rejected &&
375 page_res_it.prev_row() == page_res_it.row() &&
376 word->word->space () == 1)
377 word->reject_spaces = true;
378 word->reject_map.rej_word_row_rej();
379 }
380 prev_word_rejected = rej_word;
381 page_res_it.forward();
382 }
383 } else {
384 if (tessedit_debug_block_rejection) {
385 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
386 row_no, current_row->char_count, current_row->rej_count);
387 }
388 while (page_res_it.word() != nullptr &&
389 page_res_it.row() == current_row)
390 page_res_it.forward();
391 }
392 }
393 }
394 }
395 }
396}
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:406
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
int32_t rej_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:78
int32_t rej_count
Definition: pageres.h:118
int32_t char_count
Definition: pageres.h:117
int32_t whole_word_rej_count
Definition: pageres.h:143
int32_t rej_count
Definition: pageres.h:142
int32_t char_count
Definition: pageres.h:141
bool reject_spaces
Definition: pageres.h:341
ROW_RES * prev_row() const
Definition: pageres.h:748
WERD_RES * restart_page()
Definition: pageres.h:701
WERD_RES * forward()
Definition: pageres.h:734
int index() const
Definition: pdblock.h:67
void rej_word_row_rej()
Definition: rejctmap.cpp:442
int16_t reject_count()
Definition: rejctmap.h:229
void rej_word_block_rej()
Definition: rejctmap.cpp:433
uint8_t space()
Definition: werd.h:99

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word)

Definition at line 527 of file reject.cpp.

527 {
528 int i = 0;
529 int offset;
530 int word_len = word->reject_map.length();
531 const char *s = word->best_choice->unichar_string().string();
532 const char *lengths = word->best_choice->unichar_lengths().string();
533 bool accepted_1Il = false;
534
535 for (i = 0, offset = 0; i < word_len;
536 offset += word->best_choice->unichar_lengths()[i++]) {
537 if (word->reject_map[i].accepted()) {
538 if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539 accepted_1Il = true;
540 } else {
541 if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542 word->uch_set->get_isdigit(s + offset, lengths[i]))
543 return; // >=1 non 1Il ch accepted
544 }
545 }
546 }
547 if (!accepted_1Il)
548 return; //Nothing to worry about
549
550 for (i = 0, offset = 0; i < word_len;
551 offset += word->best_choice->unichar_lengths()[i++]) {
552 if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553 word->reject_map[i].accepted())
554 word->reject_map[i].setrej_postNN_1Il();
555 }
556}

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 476 of file fixspace.cpp.

477 {
478 WERD_RES_IT word_res_it(&perm);
479
480 if (debug_fix_space_level > 0) {
481 if (mode == 1) {
482 stats_.dump_words_str = "";
483 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484 word_res_it.forward()) {
485 if (!word_res_it.data()->part_of_combo) {
486 stats_.dump_words_str +=
487 word_res_it.data()->best_choice->unichar_string();
488 stats_.dump_words_str += ' ';
489 }
490 }
491 }
492
493 if (debug_fix_space_level > 1) {
494 switch (mode) {
495 case 1:
496 tprintf("EXTRACTED (%d): \"", score);
497 break;
498 case 2:
499 tprintf("TESTED (%d): \"", score);
500 break;
501 case 3:
502 tprintf("RETURNED (%d): \"", score);
503 break;
504 }
505
506 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507 word_res_it.forward()) {
508 if (!word_res_it.data()->part_of_combo) {
509 tprintf("%s/%1d ",
510 word_res_it.data()->best_choice->unichar_string().string(),
511 static_cast<int>(word_res_it.data()->best_choice->permuter()));
512 }
513 }
514 tprintf("\"\n");
515 } else if (improved) {
516 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
517 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518 word_res_it.forward()) {
519 if (!word_res_it.data()->part_of_combo) {
520 tprintf("%s/%1d ",
521 word_res_it.data()->best_choice->unichar_string().string(),
522 static_cast<int>(word_res_it.data()->best_choice->permuter()));
523 }
524 }
525 tprintf("\"\n");
526 }
527 }
528}

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 465 of file tessedit.cpp.

465{ end_recog(); }

◆ eval_word_spacing()

int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 266 of file fixspace.cpp.

266 {
267 WERD_RES_IT word_res_it(&word_res_list);
268 int16_t total_score = 0;
269 int16_t word_count = 0;
270 int16_t done_word_count = 0;
271 int16_t word_len;
272 int16_t i;
273 int16_t offset;
274 WERD_RES *word; // current word
275 int16_t prev_word_score = 0;
276 bool prev_word_done = false;
277 bool prev_char_1 = false; // prev ch a "1/I/l"?
278 bool prev_char_digit = false; // prev ch 2..9 or 0
279 bool current_char_1 = false;
280 bool current_word_ok_so_far;
281 STRING punct_chars = "!\"`',.:;";
282 bool prev_char_punct = false;
283 bool current_char_punct = false;
284 bool word_done = false;
285
286 do {
287 word = word_res_it.data();
288 word_done = fixspace_thinks_word_done(word);
289 word_count++;
290 if (word->tess_failed) {
291 total_score += prev_word_score;
292 if (prev_word_done)
293 done_word_count++;
294 prev_word_score = 0;
295 prev_char_1 = false;
296 prev_char_digit = false;
297 prev_word_done = false;
298 } else {
299 /*
300 Can we add the prev word score and potentially count this word?
301 Yes IF it didn't end in a 1 when the first char of this word is a digit
302 AND it didn't end in a digit when the first char of this word is a 1
303 */
304 word_len = word->reject_map.length();
305 current_word_ok_so_far = false;
306 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
307 (prev_char_digit && (
308 (word_done &&
309 word->best_choice->unichar_lengths().string()[0] == 1 &&
310 word->best_choice->unichar_string()[0] == '1') ||
311 (!word_done && STRING(conflict_set_I_l_1).contains(
312 word->best_choice->unichar_string()[0])))))) {
313 total_score += prev_word_score;
314 if (prev_word_done)
315 done_word_count++;
316 current_word_ok_so_far = word_done;
317 }
318
319 if (current_word_ok_so_far) {
320 prev_word_done = true;
321 prev_word_score = word_len;
322 } else {
323 prev_word_done = false;
324 prev_word_score = 0;
325 }
326
327 /* Add 1 to total score for every joined 1 regardless of context and
328 rejtn */
329 for (i = 0, prev_char_1 = false; i < word_len; i++) {
330 current_char_1 = word->best_choice->unichar_string()[i] == '1';
331 if (prev_char_1 || (current_char_1 && (i > 0)))
332 total_score++;
333 prev_char_1 = current_char_1;
334 }
335
336 /* Add 1 to total score for every joined punctuation regardless of context
337 and rejtn */
338 if (tessedit_prefer_joined_punct) {
339 for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
340 offset += word->best_choice->unichar_lengths()[i++]) {
341 current_char_punct =
342 punct_chars.contains(word->best_choice->unichar_string()[offset]);
343 if (prev_char_punct || (current_char_punct && i > 0))
344 total_score++;
345 prev_char_punct = current_char_punct;
346 }
347 }
348 prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
349 for (i = 0, offset = 0; i < word_len - 1;
350 offset += word->best_choice->unichar_lengths()[i++]);
351 prev_char_1 =
352 ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
353 || (!word_done && STRING(conflict_set_I_l_1).contains(
354 word->best_choice->unichar_string()[offset])));
355 }
356 /* Find next word */
357 do {
358 word_res_it.forward();
359 } while (word_res_it.data()->part_of_combo);
360 } while (!word_res_it.at_first());
361 total_score += prev_word_score;
362 if (prev_word_done)
363 done_word_count++;
364 if (done_word_count == word_count)
365 return PERFECT_WERDS;
366 else
367 return total_score;
368}
#define PERFECT_WERDS
Definition: fixspace.cpp:44
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:530
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:370
bool contains(char c) const
Definition: strngs.cpp:185

◆ failure_count()

int16_t tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 968 of file docqual.cpp.

968 {
969 const char *str = word->best_choice->unichar_string().string();
970 int tess_rejs = 0;
971
972 for (; *str != '\0'; str++) {
973 if (*str == ' ')
974 tess_rejs++;
975 }
976 return tess_rejs;
977}

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES * word_res 
)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 561 of file applybox.cpp.

562 {
563 // Classify all required combinations of blobs and save results in choices.
564 const int word_length = word_res->box_word->length();
565 auto* choices =
566 new GenericVector<BLOB_CHOICE_LIST*>[word_length];
567 for (int i = 0; i < word_length; ++i) {
568 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
569 BLOB_CHOICE_LIST* match_result = classify_piece(
570 word_res->seam_array, i, i + j - 1, "Applybox",
571 word_res->chopped_word, word_res->blamer_bundle);
572 if (applybox_debug > 2) {
573 tprintf("%d+%d:", i, j);
574 print_ratings_list("Segment:", match_result, unicharset);
575 }
576 choices[i].push_back(match_result);
577 }
578 }
579 // Search the segmentation graph for the target text. Must be an exact
580 // match. Using wildcards makes it difficult to find the correct
581 // segmentation even when it is there.
582 word_res->best_state.clear();
583 GenericVector<int> search_segmentation;
584 float best_rating = 0.0f;
585 SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
586 &search_segmentation, &best_rating, &word_res->best_state);
587 for (int i = 0; i < word_length; ++i)
588 choices[i].delete_data_pointers();
589 delete [] choices;
590 if (word_res->best_state.empty()) {
591 // Build the original segmentation and if it is the same length as the
592 // truth, assume it will do.
593 int blob_count = 1;
594 for (int s = 0; s < word_res->seam_array.size(); ++s) {
595 SEAM* seam = word_res->seam_array[s];
596 if (!seam->HasAnySplits()) {
597 word_res->best_state.push_back(blob_count);
598 blob_count = 1;
599 } else {
600 ++blob_count;
601 }
602 }
603 word_res->best_state.push_back(blob_count);
604 if (word_res->best_state.size() != target_text.size()) {
605 word_res->best_state.clear(); // No good. Original segmentation bad size.
606 return false;
607 }
608 }
609 word_res->correct_text.clear();
610 for (int i = 0; i < target_text.size(); ++i) {
611 word_res->correct_text.push_back(
612 STRING(unicharset.id_to_unichar(target_text[i])));
613 }
614 return true;
615}
const int kMaxGroupSize
Definition: applybox.cpp:31
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:631
int length() const
Definition: boxword.h:83
tesseract::BoxWord * box_word
Definition: pageres.h:272
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
GenericVector< int > best_state
Definition: pageres.h:285
GenericVector< STRING > correct_text
Definition: pageres.h:289
Definition: seam.h:38
bool HasAnySplits() const
Definition: seam.h:61
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50

◆ first_alphanum_index()

int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 470 of file reject.cpp.

471 {
472 int16_t i;
473 int16_t offset;
474
475 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477 unicharset.get_isdigit(word + offset, word_lengths[i]))
478 return i;
479 }
480 return -1;
481}

◆ first_alphanum_offset()

int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 483 of file reject.cpp.

484 {
485 int16_t i;
486 int16_t offset;
487
488 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490 unicharset.get_isdigit(word + offset, word_lengths[i]))
491 return offset;
492 }
493 return -1;
494}

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 172 of file fixspace.cpp.

174 {
175 int16_t best_score;
176 WERD_RES_LIST current_perm;
177 int16_t current_score;
178 bool improved = false;
179
180 best_score = eval_word_spacing(best_perm); // default score
181 dump_words(best_perm, best_score, 1, improved);
182
183 if (best_score != PERFECT_WERDS)
184 initialise_search(best_perm, current_perm);
185
186 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
187 match_current_words(current_perm, row, block);
188 current_score = eval_word_spacing(current_perm);
189 dump_words(current_perm, current_score, 2, improved);
190 if (current_score > best_score) {
191 best_perm.clear();
192 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
193 best_score = current_score;
194 improved = true;
195 }
196 if (current_score < PERFECT_WERDS)
197 transform_to_next_perm(current_perm);
198 }
199 dump_words(best_perm, best_score, 3, improved);
200}
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:399
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:204
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:476
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:266
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:223
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
int32_t  word_count,
PAGE_RES * page_res 
)

Definition at line 75 of file fixspace.cpp.

77 {
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
82 WERD_RES *word_res;
83 WERD_RES_LIST fuzzy_space_words;
84 int16_t new_length;
85 bool prevent_null_wd_fixsp; // DON'T process blobless wds
86 int32_t word_index; // current word
87
88 block_res_it.set_to_list(&page_res->block_res_list);
89 word_index = 0;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91 block_res_it.forward()) {
92 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94 row_res_it.forward()) {
95 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96 while (!word_res_it_from.at_last()) {
97 word_res = word_res_it_from.data();
98 while (!word_res_it_from.at_last() &&
99 !(word_res->combination ||
100 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
101 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
102 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
103 block_res_it.data()->block);
104 word_res = word_res_it_from.forward();
105 word_index++;
106 if (monitor != nullptr) {
107 monitor->ocr_alive = true;
108 monitor->progress = 90 + 5 * word_index / word_count;
109 if (monitor->deadline_exceeded() ||
110 (monitor->cancel != nullptr &&
111 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
112 return;
113 }
114 }
115
116 if (!word_res_it_from.at_last()) {
117 word_res_it_to = word_res_it_from;
118 prevent_null_wd_fixsp =
119 word_res->word->cblob_list()->empty();
120 if (check_debug_pt(word_res, 60))
121 debug_fix_space_level.set_value(10);
122 word_res_it_to.forward();
123 word_index++;
124 if (monitor != nullptr) {
125 monitor->ocr_alive = true;
126 monitor->progress = 90 + 5 * word_index / word_count;
127 if (monitor->deadline_exceeded() ||
128 (monitor->cancel != nullptr &&
129 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
130 return;
131 }
132 while (!word_res_it_to.at_last () &&
133 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
134 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
135 if (check_debug_pt(word_res, 60))
136 debug_fix_space_level.set_value(10);
137 if (word_res->word->cblob_list()->empty())
138 prevent_null_wd_fixsp = true;
139 word_res = word_res_it_to.forward();
140 }
141 if (check_debug_pt(word_res, 60))
142 debug_fix_space_level.set_value(10);
143 if (word_res->word->cblob_list()->empty())
144 prevent_null_wd_fixsp = true;
145 if (prevent_null_wd_fixsp) {
146 word_res_it_from = word_res_it_to;
147 } else {
148 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
149 &word_res_it_to);
150 fix_fuzzy_space_list(fuzzy_space_words,
151 row_res_it.data()->row,
152 block_res_it.data()->block);
153 new_length = fuzzy_space_words.length();
154 word_res_it_from.add_list_before(&fuzzy_space_words);
155 for (;
156 !word_res_it_from.at_last() && new_length > 0;
157 new_length--) {
158 word_res_it_from.forward();
159 }
160 }
161 if (test_pt)
162 debug_fix_space_level.set_value(0);
163 }
164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
165 block_res_it.data()->block);
166 // Last word in row
167 }
168 }
169 }
170}
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:562
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:172
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
bool combination
Definition: pageres.h:339
volatile int8_t ocr_alive
ocr sets to 1, HP 0
Definition: ocrclass.h:110
int16_t progress
percent complete increasing (0-100)
Definition: ocrclass.h:105
void * cancel_this
this or other data for cancel
Definition: ocrclass.h:116
bool deadline_exceeded() const
Definition: ocrclass.h:138
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:112

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 596 of file fixspace.cpp.

597 {
598 int16_t best_score;
599 WERD_RES_IT best_perm_it(&best_perm);
600 WERD_RES_LIST current_perm;
601 WERD_RES_IT current_perm_it(&current_perm);
602 WERD_RES *old_word_res;
603 int16_t current_score;
604 bool improved = false;
605
606 best_score = fp_eval_word_spacing(best_perm); // default score
607
608 dump_words(best_perm, best_score, 1, improved);
609
610 old_word_res = best_perm_it.data();
611 // Even deep_copy doesn't copy the underlying WERD unless its combination
612 // flag is true!.
613 old_word_res->combination = true; // Kludge to force deep copy
614 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
615 old_word_res->combination = false; // Undo kludge
616
617 break_noisiest_blob_word(current_perm);
618
619 while (best_score != PERFECT_WERDS && !current_perm.empty()) {
620 match_current_words(current_perm, row, block);
621 current_score = fp_eval_word_spacing(current_perm);
622 dump_words(current_perm, current_score, 2, improved);
623 if (current_score > best_score) {
624 best_perm.clear();
625 best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
626 best_score = current_score;
627 improved = true;
628 }
629 if (current_score < PERFECT_WERDS) {
630 break_noisiest_blob_word(current_perm);
631 }
632 }
633 dump_words(best_perm, best_score, 3, improved);
634}
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:642
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:857

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT *  page_res_it)

fix_rep_char() The word is a repeated char (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces in between blobs, and correct the classifications where some of the characters disagree with the majority. Note: in the listing below, the gaps between blobs are measured but the result is not used; only the existing classifications are corrected.

Definition at line 1706 of file control.cpp.

1706 {
1707 WERD_RES *word_res = page_res_it->word();
1708 const WERD_CHOICE &word = *(word_res->best_choice);
1709
1710 // Find the frequency of each unique character in the word.
1711 SortHelper<UNICHAR_ID> rep_ch(word.length());
1712 for (int i = 0; i < word.length(); ++i) {
1713 rep_ch.Add(word.unichar_id(i), 1);
1714 }
1715
1716 // Find the most frequent result.
1717 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1718 int max_count = rep_ch.MaxCount(&maxch_id);
1719 // Find the best exemplar of a classifier result for maxch_id.
1720 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1721 if (best_choice == nullptr) {
1722 tprintf("Failed to find a choice for %s, occurring %d times\n",
1723 word_res->uch_set->debug_str(maxch_id).string(), max_count);
1724 return;
1725 }
1726 word_res->done = true;
1727
1728 // Measure the mean space.
1729 int gap_count = 0;
1730 WERD* werd = word_res->word;
1731 C_BLOB_IT blob_it(werd->cblob_list());
1732 C_BLOB* prev_blob = blob_it.data();
1733 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1734 C_BLOB* blob = blob_it.data();
1735 int gap = blob->bounding_box().left();
1736 gap -= prev_blob->bounding_box().right();
1737 ++gap_count;
1738 prev_blob = blob;
1739 }
1740 // Just correct existing classification.
1741 CorrectRepcharChoices(best_choice, word_res);
1742 word_res->reject_map.initialise(word.length());
1743}
void initialise(int16_t length)
Definition: rejctmap.cpp:273
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW *  row,
BLOCK *  block 
)

Definition at line 562 of file fixspace.cpp.

563 {
564 WERD_RES *word_res;
565 WERD_RES_LIST sub_word_list;
566 WERD_RES_IT sub_word_list_it(&sub_word_list);
567 int16_t blob_index;
568 int16_t new_length;
569 float junk;
570
571 word_res = word_res_it.data();
572 if (word_res->word->flag(W_REP_CHAR) ||
573 word_res->combination ||
574 word_res->part_of_combo ||
575 !word_res->word->flag(W_DONT_CHOP))
576 return;
577
578 blob_index = worst_noise_blob(word_res, &junk);
579 if (blob_index < 0)
580 return;
581
582 if (debug_fix_space_level > 1) {
583 tprintf("FP fixspace working on \"%s\"\n",
584 word_res->best_choice->unichar_string().string());
585 }
586 word_res->word->rej_cblob_list()->sort(c_blob_comparator);
587 sub_word_list_it.add_after_stay_put(word_res_it.extract());
588 fix_noisy_space_list(sub_word_list, row, block);
589 new_length = sub_word_list.length();
590 word_res_it.add_list_before(&sub_word_list);
591 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592 word_res_it.forward();
593 }
594}
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:37
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:596
bool part_of_combo
Definition: pageres.h:340

◆ fixspace_thinks_word_done()

bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES *  word)

Definition at line 530 of file fixspace.cpp.

530 {
531 if (word->done)
532 return true;
533
534 /*
535 Use all the standard pass 2 conditions for mode 5 in set_done() in
536 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
537 CARE WHETHER WE HAVE of/at on/an etc.
538 */
539 if (fixsp_done_mode > 0 &&
540 (word->tess_accepted ||
541 (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
542 fixsp_done_mode == 3) &&
543 (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr) &&
544 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
545 (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
546 (word->best_choice->permuter() == USER_DAWG_PERM) ||
547 (word->best_choice->permuter() == NUMBER_PERM))) {
548 return true;
549 } else {
550 return false;
551 }
552}
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES *  word)

Definition at line 674 of file reject.cpp.

674 {
675 WERD_CHOICE *best_choice = word_res->best_choice;
676 int i;
677 TBOX out_box;
678
679 if (!tessedit_flip_0O)
680 return;
681
682 int num_blobs = word_res->rebuild_word->NumBlobs();
683 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684 TBLOB* blob = word_res->rebuild_word->blobs[i];
685 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687 out_box = blob->bounding_box();
688 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690 return; //Beware words with sub/superscripts
691 }
692 }
693 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695 if (unichar_0 == INVALID_UNICHAR_ID ||
696 !word_res->uch_set->get_enabled(unichar_0) ||
697 unichar_O == INVALID_UNICHAR_ID ||
698 !word_res->uch_set->get_enabled(unichar_O)) {
699 return; // 0 or O are not present/enabled in unicharset
700 }
701 for (i = 1; i < best_choice->length(); ++i) {
702 if (best_choice->unichar_id(i) == unichar_0 ||
703 best_choice->unichar_id(i) == unichar_O) {
704 /* A0A */
705 if ((i+1) < best_choice->length() &&
706 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708 best_choice->set_unichar_id(unichar_O, i);
709 }
710 /* A00A */
711 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712 (i+1) < best_choice->length() &&
713 (best_choice->unichar_id(i+1) == unichar_0 ||
714 best_choice->unichar_id(i+1) == unichar_O) &&
715 (i+2) < best_choice->length() &&
716 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717 best_choice->set_unichar_id(unichar_O, i);
718 i++;
719 }
720 /* AA0<non digit or end of word> */
721 if ((i > 1) &&
722 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724 (((i+1) < best_choice->length() &&
725 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728 (i == best_choice->length() - 1))) {
729 best_choice->set_unichar_id(unichar_O, i);
730 }
731 /* 9O9 */
732 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733 (i+1) < best_choice->length() &&
734 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735 best_choice->set_unichar_id(unichar_0, i);
736 }
737 /* 9OOO */
738 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739 (i+2) < best_choice->length() &&
740 (best_choice->unichar_id(i+1) == unichar_0 ||
741 best_choice->unichar_id(i+1) == unichar_O) &&
742 (best_choice->unichar_id(i+2) == unichar_0 ||
743 best_choice->unichar_id(i+2) == unichar_O)) {
744 best_choice->set_unichar_id(unichar_0, i);
745 best_choice->set_unichar_id(unichar_0, i+1);
746 best_choice->set_unichar_id(unichar_0, i+2);
747 i += 2;
748 }
749 /* 9OO<non upper> */
750 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751 (i+2) < best_choice->length() &&
752 (best_choice->unichar_id(i+1) == unichar_0 ||
753 best_choice->unichar_id(i+1) == unichar_O) &&
754 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755 best_choice->set_unichar_id(unichar_0, i);
756 best_choice->set_unichar_id(unichar_0, i+1);
757 i++;
758 }
759 /* 9O<non upper> */
760 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761 (i+1) < best_choice->length() &&
762 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763 best_choice->set_unichar_id(unichar_0, i);
764 }
765 /* 9[.,]OOO.. */
766 if ((i > 1) &&
767 (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768 word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770 best_choice->unichar_id(i-2) == unichar_O)) {
771 if (best_choice->unichar_id(i-2) == unichar_O) {
772 best_choice->set_unichar_id(unichar_0, i-2);
773 }
774 while (i < best_choice->length() &&
775 (best_choice->unichar_id(i) == unichar_O ||
776 best_choice->unichar_id(i) == unichar_0)) {
777 best_choice->set_unichar_id(unichar_0, i);
778 i++;
779 }
780 i--;
781 }
782 }
783 }
784}
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:786
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES *  word)

Definition at line 617 of file reject.cpp.

617 {
618 WERD_CHOICE *best_choice = word_res->best_choice;
619 int i;
620 int prev_right = -9999;
621 int next_left;
622 TBOX out_box;
623 float aspect_ratio;
624
625 if (tessedit_lower_flip_hyphen <= 1)
626 return;
627
628 int num_blobs = word_res->rebuild_word->NumBlobs();
629 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
630 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631 TBLOB* blob = word_res->rebuild_word->blobs[i];
632 out_box = blob->bounding_box();
633 if (i + 1 == num_blobs)
634 next_left = 9999;
635 else
636 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637 // Don't touch small or touching blobs - it is too dangerous.
638 if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639 (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640 aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
642 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643 word_res->uch_set->contains_unichar_id(unichar_dash) &&
644 word_res->uch_set->get_enabled(unichar_dash)) {
645 /* Certain HYPHEN */
646 best_choice->set_unichar_id(unichar_dash, i);
647 if (word_res->reject_map[i].rejected())
648 word_res->reject_map[i].setrej_hyphen_accept();
649 }
650 if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651 word_res->reject_map[i].accepted())
652 //Suspected HYPHEN
653 word_res->reject_map[i].setrej_hyphen ();
654 }
655 else if (best_choice->unichar_id(i) == unichar_dash) {
656 if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657 (word_res->reject_map[i].rejected()))
658 word_res->reject_map[i].setrej_hyphen_accept();
659 //Certain HYPHEN
660
661 if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662 (word_res->reject_map[i].accepted()))
663 //Suspected HYPHEN
664 word_res->reject_map[i].setrej_hyphen();
665 }
666 }
667 prev_right = out_box.right();
668 }
669}

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2037 of file control.cpp.

2037 {
2038 PAGE_RES_IT page_res_it(page_res);
2039 WERD_RES *word; // current word
2040 STATS doc_fonts(0, font_table_size_); // font counters
2041
2042 // Gather font id statistics.
2043 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2044 page_res_it.forward()) {
2045 word = page_res_it.word();
2046 if (word->fontinfo != nullptr) {
2047 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2048 }
2049 if (word->fontinfo2 != nullptr) {
2050 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2051 }
2052 }
2053 int16_t doc_font; // modal font
2054 int8_t doc_font_count; // modal font
2055 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2056 if (doc_font_count == 0)
2057 return;
2058 // Get the modal font pointer.
2059 const FontInfo* modal_font = nullptr;
2060 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2061 page_res_it.forward()) {
2062 word = page_res_it.word();
2063 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2064 modal_font = word->fontinfo;
2065 break;
2066 }
2067 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2068 modal_font = word->fontinfo2;
2069 break;
2070 }
2071 }
2072 ASSERT_HOST(modal_font != nullptr);
2073
2074 // Assign modal font to weak words.
2075 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2076 page_res_it.forward()) {
2077 word = page_res_it.word();
2078 const int length = word->best_choice->length();
2079
2080 const int count = word->fontinfo_id_count;
2081 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2082 word->fontinfo = modal_font;
2083 // Counts only get 1 as it came from the doc.
2084 word->fontinfo_id_count = 1;
2085 }
2086 }
2087}
int32_t universal_id
Definition: fontinfo.h:123
int8_t fontinfo_id2_count
Definition: pageres.h:312
const FontInfo * fontinfo2
Definition: pageres.h:310
int8_t fontinfo_id_count
Definition: pageres.h:311

◆ fp_eval_word_spacing()

int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 857 of file fixspace.cpp.

857 {
858 WERD_RES_IT word_it(&word_res_list);
859 WERD_RES *word;
860 int16_t score = 0;
861 int16_t i;
862 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
863
864 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865 word = word_it.data();
866 if (word->rebuild_word == nullptr)
867 continue; // Can't handle cube words.
868 if (word->done ||
869 word->tess_accepted ||
870 word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
871 word->best_choice->permuter() == FREQ_DAWG_PERM ||
872 word->best_choice->permuter() == USER_DAWG_PERM ||
873 safe_dict_word(word) > 0) {
874 int num_blobs = word->rebuild_word->NumBlobs();
875 UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
876 for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
877 TBLOB* blob = word->rebuild_word->blobs[i];
878 if (word->best_choice->unichar_id(i) == space ||
879 blob_noise_score(blob) < small_limit) {
880 score -= 1; // penalise possibly erroneous non-space
881 } else if (word->reject_map[i].accepted()) {
882 score++;
883 }
884 }
885 }
886 }
887 if (score < 0)
888 score = 0;
889 return score;
890}
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:787
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
bool  ok_dict_word 
)

Definition at line 679 of file docqual.cpp.

679 {
680 enum STATES
681 {
682 JUNK,
683 FIRST_UPPER,
684 FIRST_LOWER,
685 FIRST_NUM,
686 SUBSEQUENT_UPPER,
687 SUBSEQUENT_LOWER,
688 SUBSEQUENT_NUM
689 };
690 const char *str = word->best_choice->unichar_string().string();
691 const char *lengths = word->best_choice->unichar_lengths().string();
692 STATES state = JUNK;
693 int len = 0;
694 int isolated_digits = 0;
695 int isolated_alphas = 0;
696 int bad_char_count = 0;
697 int tess_rejs = 0;
698 int dodgy_chars = 0;
699 int ok_chars;
700 UNICHAR_ID last_char = -1;
701 int alpha_repetition_count = 0;
702 int longest_alpha_repetition_count = 0;
703 int longest_lower_run_len = 0;
704 int lower_string_count = 0;
705 int longest_upper_run_len = 0;
706 int upper_string_count = 0;
707 int total_alpha_count = 0;
708 int total_digit_count = 0;
709
710 for (; *str != '\0'; str += *(lengths++)) {
711 len++;
712 if (word->uch_set->get_isupper (str, *lengths)) {
713 total_alpha_count++;
714 switch (state) {
715 case SUBSEQUENT_UPPER:
716 case FIRST_UPPER:
717 state = SUBSEQUENT_UPPER;
718 upper_string_count++;
719 if (longest_upper_run_len < upper_string_count)
720 longest_upper_run_len = upper_string_count;
721 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
722 alpha_repetition_count++;
723 if (longest_alpha_repetition_count < alpha_repetition_count) {
724 longest_alpha_repetition_count = alpha_repetition_count;
725 }
726 }
727 else {
728 last_char = word->uch_set->unichar_to_id(str, *lengths);
729 alpha_repetition_count = 1;
730 }
731 break;
732 case FIRST_NUM:
733 isolated_digits++;
734 // Fall through.
735 default:
736 state = FIRST_UPPER;
737 last_char = word->uch_set->unichar_to_id(str, *lengths);
738 alpha_repetition_count = 1;
739 upper_string_count = 1;
740 break;
741 }
742 }
743 else if (word->uch_set->get_islower (str, *lengths)) {
744 total_alpha_count++;
745 switch (state) {
746 case SUBSEQUENT_LOWER:
747 case FIRST_LOWER:
748 state = SUBSEQUENT_LOWER;
749 lower_string_count++;
750 if (longest_lower_run_len < lower_string_count)
751 longest_lower_run_len = lower_string_count;
752 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753 alpha_repetition_count++;
754 if (longest_alpha_repetition_count < alpha_repetition_count) {
755 longest_alpha_repetition_count = alpha_repetition_count;
756 }
757 }
758 else {
759 last_char = word->uch_set->unichar_to_id(str, *lengths);
760 alpha_repetition_count = 1;
761 }
762 break;
763 case FIRST_NUM:
764 isolated_digits++;
765 // Fall through.
766 default:
767 state = FIRST_LOWER;
768 last_char = word->uch_set->unichar_to_id(str, *lengths);
769 alpha_repetition_count = 1;
770 lower_string_count = 1;
771 break;
772 }
773 }
774 else if (word->uch_set->get_isdigit (str, *lengths)) {
775 total_digit_count++;
776 switch (state) {
777 case FIRST_NUM:
778 state = SUBSEQUENT_NUM;
779 case SUBSEQUENT_NUM:
780 break;
781 case FIRST_UPPER:
782 case FIRST_LOWER:
783 isolated_alphas++;
784 // Fall through.
785 default:
786 state = FIRST_NUM;
787 break;
788 }
789 }
790 else {
791 if (*lengths == 1 && *str == ' ')
792 tess_rejs++;
793 else
794 bad_char_count++;
795 switch (state) {
796 case FIRST_NUM:
797 isolated_digits++;
798 break;
799 case FIRST_UPPER:
800 case FIRST_LOWER:
801 isolated_alphas++;
802 default:
803 break;
804 }
805 state = JUNK;
806 }
807 }
808
809 switch (state) {
810 case FIRST_NUM:
811 isolated_digits++;
812 break;
813 case FIRST_UPPER:
814 case FIRST_LOWER:
815 isolated_alphas++;
816 default:
817 break;
818 }
819
820 if (crunch_include_numerals) {
821 total_alpha_count += total_digit_count - isolated_digits;
822 }
823
824 if (crunch_leave_ok_strings && len >= 4 &&
825 2 * (total_alpha_count - isolated_alphas) > len &&
826 longest_alpha_repetition_count < crunch_long_repetitions) {
827 if ((crunch_accept_ok &&
828 acceptable_word_string(*word->uch_set, str, lengths) !=
829 AC_UNACCEPTABLE) ||
830 longest_lower_run_len > crunch_leave_lc_strings ||
831 longest_upper_run_len > crunch_leave_uc_strings)
832 return G_NEVER_CRUNCH;
833 }
834 if (word->reject_map.length() > 1 &&
835 strpbrk(str, " ") == nullptr &&
836 (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
837 word->best_choice->permuter() == FREQ_DAWG_PERM ||
838 word->best_choice->permuter() == USER_DAWG_PERM ||
839 word->best_choice->permuter() == NUMBER_PERM ||
840 acceptable_word_string(*word->uch_set, str, lengths) !=
841 AC_UNACCEPTABLE || ok_dict_word))
842 return G_OK;
843
844 ok_chars = len - bad_char_count - isolated_digits -
845 isolated_alphas - tess_rejs;
846
847 if (crunch_debug > 3) {
848 tprintf("garbage_word: \"%s\"\n",
849 word->best_choice->unichar_string().string());
850 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
851 len,
852 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
853 }
854 if (bad_char_count == 0 &&
855 tess_rejs == 0 &&
856 (len > isolated_digits + isolated_alphas || len <= 2))
857 return G_OK;
858
859 if (tess_rejs > ok_chars ||
860 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
861 return G_TERRIBLE;
862
863 if (len > 4) {
864 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
865 isolated_alphas;
866 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
867 return G_DODGY;
868 else
869 return G_OK;
870 } else {
871 dodgy_chars = 2 * tess_rejs + bad_char_count;
872 if ((len == 4 && dodgy_chars > 2) ||
873 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
874 return G_DODGY;
875 else
876 return G_OK;
877 }
878}
@ G_DODGY
Definition: docqual.h:33
@ G_TERRIBLE
Definition: docqual.h:34
@ G_OK
Definition: docqual.h:32
@ G_NEVER_CRUNCH
Definition: docqual.h:31

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 251 of file output.cpp.

251 { // what char is repeated?
252 int i;
253 for (i = 0; ((i < word->reject_map.length()) &&
254 (word->reject_map[i].rejected())); ++i);
255
256 if (i < word->reject_map.length()) {
257 return word->best_choice->unichar_id(i);
258 } else {
259 return word->uch_set->unichar_to_id(unrecognised_char.string());
260 }
261}

◆ get_sub_lang()

Tesseract * tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 281 of file tesseractclass.h.

281 {
282 return sub_langs_[index];
283 }

◆ getDict()

Dict & tesseract::Tesseract::getDict ( )
override virtual

Reimplemented from tesseract::Classify.

Definition at line 551 of file tesseractclass.cpp.

552{
553 if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang())
554 {
555 if (lstm_recognizer_ && lstm_recognizer_->GetDict())
556 {
557 return *const_cast<Dict*>(lstm_recognizer_->GetDict());
558 }
559 }
560 return Classify::getDict();
561 }
bool AnyLSTMLang() const
virtual Dict & getDict()
Definition: classify.h:107
const Dict * GetDict() const

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX &  line_box,
const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
int  start_box,
int  end_box,
const BLOCK &  block 
)

Definition at line 136 of file linerec.cpp.

140 {
141 TBOX revised_box;
142 ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
143 &revised_box);
144 if (image_data == nullptr) return nullptr;
145 image_data->set_page_number(applybox_page);
146 // Copy the boxes and shift them so they are relative to the image.
147 FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
148 ICOORD shift = -revised_box.botleft();
149 GenericVector<TBOX> line_boxes;
150 GenericVector<STRING> line_texts;
151 for (int b = start_box; b < end_box; ++b) {
152 TBOX box = boxes[b];
153 box.rotate(block_rotation);
154 box.move(shift);
155 line_boxes.push_back(box);
156 line_texts.push_back(texts[b]);
157 }
158 GenericVector<int> page_numbers;
159 page_numbers.init_to_size(line_boxes.size(), applybox_page);
160 image_data->AddBoxes(line_boxes, line_texts, page_numbers);
161 return image_data;
162}
const int kImagePadding
Definition: imagedata.h:39
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:170
FCOORD re_rotation() const
Definition: ocrblock.h:134
integer coordinate
Definition: points.h:32
float x() const
Definition: points.h:207
const ICOORD & botleft() const
Definition: rect.h:92
void rotate(const FCOORD &vec)
Definition: rect.h:197
void move(const ICOORD vec)
Definition: rect.h:157

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX &  box,
const BLOCK &  block,
int  padding,
TBOX *  revised_box 
) const

Definition at line 170 of file linerec.cpp.

171 {
172 TBOX wbox = box;
173 wbox.pad(padding, padding);
174 *revised_box = wbox;
175 // Number of clockwise 90 degree rotations needed to get back to tesseract
176 // coords from the clipped image.
177 int num_rotations = 0;
178 if (block.re_rotation().y() > 0.0f)
179 num_rotations = 1;
180 else if (block.re_rotation().x() < 0.0f)
181 num_rotations = 2;
182 else if (block.re_rotation().y() < 0.0f)
183 num_rotations = 3;
184 // Handle two cases automatically: 1 the box came from the block, 2 the box
185 // came from a box file, and refers to the image, which the block may not.
186 if (block.pdblk.bounding_box().major_overlap(*revised_box))
187 revised_box->rotate(block.re_rotation());
188 // Now revised_box always refers to the image.
189 // BestPix is never colormapped, but may be of any depth.
190 Pix* pix = BestPix();
191 int width = pixGetWidth(pix);
192 int height = pixGetHeight(pix);
193 TBOX image_box(0, 0, width, height);
194 // Clip to image bounds;
195 *revised_box &= image_box;
196 if (revised_box->null_box()) return nullptr;
197 Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
198 revised_box->width(), revised_box->height());
199 Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
200 if (box_pix == nullptr) return nullptr;
201 boxDestroy(&clip_box);
202 if (num_rotations > 0) {
203 Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
204 pixDestroy(&box_pix);
205 box_pix = rot_pix;
206 }
207 // Convert sub-8-bit images to 8 bit.
208 int depth = pixGetDepth(box_pix);
209 if (depth < 8) {
210 Pix* grey;
211 grey = pixConvertTo8(box_pix, false);
212 pixDestroy(&box_pix);
213 box_pix = grey;
214 }
215 bool vertical_text = false;
216 if (num_rotations > 0) {
217 // Rotated the clipped revised box back to internal coordinates.
218 FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
219 revised_box->rotate(rotation);
220 if (num_rotations != 2)
221 vertical_text = true;
222 }
223 return new ImageData(vertical_text, box_pix);
224}
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
void pad(int xpad, int ypad)
Definition: rect.h:131
bool null_box() const
Definition: rect.h:50

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES *  word,
int *  num_rebuilt_leading,
ScriptPos *  leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos *  trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging)
[out] leading_certainty: the worst certainty in the leading blobs.
[out] num_rebuilt_trailing: the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging)
[out] trailing_certainty: the worst certainty in the trailing blobs.
[out] avg_certainty: the average certainty of "normal" blobs in the word.
[out] unlikely_threshold: the threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 254 of file superscript.cpp.

262 {
263 *avg_certainty = *unlikely_threshold = 0.0f;
264 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
265 *leading_certainty = *trailing_certainty = 0.0f;
266
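267 int super_y_bottom =
268 kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
269 int sub_y_top =
270 kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;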
271
272 // Step one: Get an average certainty for "normally placed" characters.
273
274 // Counts here are of blobs in the rebuild_word / unichars in best_choice.
275 *leading_pos = *trailing_pos = SP_NORMAL;
276 int leading_outliers = 0;
277 int trailing_outliers = 0;
278 int num_normal = 0;
279 float normal_certainty_total = 0.0f;
280 float worst_normal_certainty = 0.0f;
281 ScriptPos last_pos = SP_NORMAL;
282 int num_blobs = word->rebuild_word->NumBlobs();
283 for (int b = 0; b < num_blobs; ++b) {
284 TBOX box = word->rebuild_word->blobs[b]->bounding_box();
285 ScriptPos pos = SP_NORMAL;
286 if (box.bottom() >= super_y_bottom) {
287 pos = SP_SUPERSCRIPT;
288 } else if (box.top() <= sub_y_top) {
289 pos = SP_SUBSCRIPT;
290 }
291 if (pos == SP_NORMAL) {
292 if (word->best_choice->unichar_id(b) != 0) {
293 float char_certainty = word->best_choice->certainty(b);
294 if (char_certainty < worst_normal_certainty) {
295 worst_normal_certainty = char_certainty;
296 }
297 num_normal++;
298 normal_certainty_total += char_certainty;
299 }
300 if (trailing_outliers == b) {
301 leading_outliers = trailing_outliers;
302 *leading_pos = last_pos;
303 }
304 trailing_outliers = 0;
305 } else {
306 if (last_pos == pos) {
307 trailing_outliers++;
308 } else {
309 trailing_outliers = 1;
310 }
311 }
312 last_pos = pos;
313 }
314 *trailing_pos = last_pos;
315 if (num_normal >= 3) { // throw out the worst as an outlier.
316 num_normal--;
317 normal_certainty_total -= worst_normal_certainty;
318 }
319 if (num_normal > 0) {
320 *avg_certainty = normal_certainty_total / num_normal;
321 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
322 }
323 if (num_normal == 0 ||
324 (leading_outliers == 0 && trailing_outliers == 0)) {
325 return;
326 }
327
328 // Step two: Try to split off bits of the word that are both outliers
329 // and have much lower certainty than average
330 // Calculate num_leading and leading_certainty.
331 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
332 *num_rebuilt_leading < leading_outliers;
333 (*num_rebuilt_leading)++) {
334 float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
335 if (char_certainty > *unlikely_threshold) {
336 break;
337 }
338 if (char_certainty < *leading_certainty) {
339 *leading_certainty = char_certainty;
340 }
341 }
342
343 // Calculate num_trailing and trailing_certainty.
344 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
345 *num_rebuilt_trailing < trailing_outliers;
346 (*num_rebuilt_trailing)++) {
347 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
348 float char_certainty = word->best_choice->certainty(blob_idx);
349 if (char_certainty > *unlikely_threshold) {
350 break;
351 }
352 if (char_certainty < *trailing_certainty) {
353 *trailing_certainty = char_certainty;
354 }
355 }
356}
@ SP_SUBSCRIPT
Definition: ratngs.h:254
@ SP_NORMAL
Definition: ratngs.h:253
@ SP_SUPERSCRIPT
Definition: ratngs.h:255

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 255 of file tesseractclass.h.

255 {
256 return pixGetHeight(pix_binary_);
257 }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 252 of file tesseractclass.h.

252 {
253 return pixGetWidth(pix_binary_);
254 }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 36 of file recogtraining.cpp.

36 {
37 if (tessedit_ambigs_training) {
38 tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39 tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40 // Explore all segmentations.
41 getDict().stopper_no_acceptable_choices.set_value(1);
42 }
43
44 STRING output_fname = fname;
45 const char* lastdot = strrchr(output_fname.string(), '.');
46 if (lastdot != nullptr)
47 output_fname[lastdot - output_fname.string()] = '\0';
48 output_fname += ".txt";
49 FILE* output_file = fopen(output_fname.string(), "a+");
50 if (output_file == nullptr) {
51 tprintf("Error: Could not open file %s\n", output_fname.string());
52 ASSERT_HOST(output_file);
53 }
54 return output_file;
55}
bool stopper_no_acceptable_choices
Definition: dict.h:641

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 286 of file tessedit.cpp.

292 {
293 GenericVector<STRING> langs_to_load;
294 GenericVector<STRING> langs_not_to_load;
295 ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
297 sub_langs_.delete_data_pointers();
298 sub_langs_.clear();
299 // Find the first loadable lang and load into this.
300 // Add any languages that this language requires
301 bool loaded_primary = false;
302 // Load the rest into sub_langs_.
303 for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
304 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
305 const char* lang_str = langs_to_load[lang_index].string();
306 Tesseract* tess_to_init;
307 if (!loaded_primary) {
308 tess_to_init = this;
309 } else {
310 tess_to_init = new Tesseract;
311 }
312
313 int result = tess_to_init->init_tesseract_internal(
314 arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
315 vars_values, set_only_non_debug_params, mgr);
316 // Forget that language, but keep any reader we were given.
317 mgr->Clear();
318
319 if (!loaded_primary) {
320 if (result < 0) {
321 tprintf("Failed loading language '%s'\n", lang_str);
322 } else {
323 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
324 &langs_to_load, &langs_not_to_load);
325 loaded_primary = true;
326 }
327 } else {
328 if (result < 0) {
329 tprintf("Failed loading language '%s'\n", lang_str);
330 delete tess_to_init;
331 } else {
332 sub_langs_.push_back(tess_to_init);
333 // Add any languages that this language requires
334 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
335 &langs_to_load, &langs_not_to_load);
336 }
337 }
338 }
339 }
340 if (!loaded_primary) {
341 tprintf("Tesseract couldn't load any languages!\n");
342 return -1; // Couldn't load any language!
343 }
344#ifndef DISABLED_LEGACY_ENGINE
345 if (!sub_langs_.empty()) {
346 // In multilingual mode word ratings have to be directly comparable,
347 // so use the same language model weights for all languages:
348 // use the primary language's params model if
349 // tessedit_use_primary_params_model is set,
350 // otherwise use default language model weights.
352 for (int s = 0; s < sub_langs_.size(); ++s) {
353 sub_langs_[s]->language_model_->getParamsModel().Copy(
354 this->language_model_->getParamsModel());
355 }
356 tprintf("Using params model of the primary language\n");
357 } else {
358 this->language_model_->getParamsModel().Clear();
359 for (int s = 0; s < sub_langs_.size(); ++s) {
360 sub_langs_[s]->language_model_->getParamsModel().Clear();
361 }
362 }
363 }
364
366#endif // ndef DISABLED_LEGACY_ENGINE
367 return 0;
368}
void SetupUniversalFontIds()
Definition: tessedit.cpp:431
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:254
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:471

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 512 of file tesseractclass.h.

513 {
514 TessdataManager mgr;
515 return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
516 nullptr, false, &mgr);
517 }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:286

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 386 of file tessedit.cpp.

392 {
393 if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
394 configs_size, vars_vec, vars_values,
395 set_only_non_debug_params, mgr)) {
396 return -1;
397 }
399 return 0;
400 }
401 // If only LSTM will be used, skip loading Tesseract classifier's
402 // pre-trained templates and dictionary.
404 program_editup(textbase, init_tesseract ? mgr : nullptr,
405 init_tesseract ? mgr : nullptr);
406 return 0; // Normal exit
407}
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:79
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 79 of file tessedit.cpp.

84 {
85 // Set the basename, compute the data directory.
86 main_setup(arg0, textbase);
87
88 // Set the language data path prefix
89 lang = language != nullptr ? language : "eng";
93
94 // Initialize TessdataManager.
95 STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
96 if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
97 tprintf("Error opening data file %s\n", tessdata_path.string());
98 tprintf(
99 "Please make sure the TESSDATA_PREFIX environment variable is set"
100 " to your \"tessdata\" directory.\n");
101 return false;
102 }
103#ifndef DISABLED_LEGACY_ENGINE
104 if (oem == OEM_DEFAULT) {
105 // Set the engine mode from availability, which can then be overridden by
106 // the config file when we read it below.
107 if (!mgr->IsLSTMAvailable()) {
109 } else if (!mgr->IsBaseAvailable()) {
111 } else {
113 }
114 }
115#endif // ndef DISABLED_LEGACY_ENGINE
116
117 // If a language specific config file (lang.config) exists, load it in.
118 TFile fp;
119 if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
121 this->params());
122 }
123
124 SetParamConstraint set_params_constraint =
125 set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
127 // Load tesseract variables from config files. This is done after loading
128 // language-specific variables from [lang].traineddata file, so that custom
129 // config files can override values in [lang].traineddata file.
130 for (int i = 0; i < configs_size; ++i) {
131 read_config_file(configs[i], set_params_constraint);
132 }
133
134 // Set params specified in vars_vec (done after setting params from config
135 // files, so that params in vars_vec can override those from files).
136 if (vars_vec != nullptr && vars_values != nullptr) {
137 for (int i = 0; i < vars_vec->size(); ++i) {
138 if (!ParamUtils::SetParam((*vars_vec)[i].string(),
139 (*vars_values)[i].string(),
140 set_params_constraint, this->params())) {
141 tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].string());
142 }
143 }
144 }
145
146 if (!tessedit_write_params_to_file.empty()) {
147 FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb");
148 if (params_file != nullptr) {
149 ParamUtils::PrintParams(params_file, this->params());
150 fclose(params_file);
151 } else {
152 tprintf("Failed to open %s for writing params.\n",
154 }
155 }
156
157 // Determine which ocr engine(s) should be loaded and used for recognition.
158 if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
159
160 // If we are only loading the config file (and so not planning on doing any
161 // recognition) then there's nothing else to do here.
163 return true;
164 }
165
166// The various OcrEngineMode settings (see publictypes.h) determine which
167// engine-specific data files need to be loaded.
168// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
169#ifndef ANDROID_BUILD
170# ifdef DISABLED_LEGACY_ENGINE
172# else
175# endif // ndef DISABLED_LEGACY_ENGINE
176 if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
177 lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix);
178 ASSERT_HOST(lstm_recognizer_->Load(
179 this->params(), lstm_use_matrix ? language : nullptr, mgr));
180 } else {
181 tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
183 }
184 }
185#endif // ndef ANDROID_BUILD
186
187 // Load the unicharset
189 // Avoid requiring a unicharset when we aren't running base tesseract.
190#ifndef ANDROID_BUILD
191 unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
192#endif // ndef ANDROID_BUILD
193 }
194#ifndef DISABLED_LEGACY_ENGINE
195 else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
196 !unicharset.load_from_file(&fp, false)) {
197 tprintf("Error: Tesseract (legacy) engine requested, but components are "
198 "not present in %s!!\n", tessdata_path.c_str());
199 return false;
200 }
201#endif // ndef DISABLED_LEGACY_ENGINE
203 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204 return false;
205 }
206 right_to_left_ = unicharset.major_right_to_left();
207
208#ifndef DISABLED_LEGACY_ENGINE
209
210 // Setup initial unichar ambigs table and read universal ambigs.
211 UNICHARSET encoder_unicharset;
212 encoder_unicharset.CopyFrom(unicharset);
214 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
215
216 if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
217 unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
220 }
221
222 // Init ParamsModel.
223 // Load pass1 and pass2 weights (for now these two sets are the same, but in
224 // the future separate sets of weights can be generated).
226 ++p) {
227 language_model_->getParamsModel().SetPass(
228 static_cast<ParamsModel::PassEnum>(p));
229 if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
230 if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
231 return false;
232 }
233 }
234 }
235#endif // ndef DISABLED_LEGACY_ENGINE
236
237 return true;
238}
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
SetParamConstraint
Definition: params.h:35
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:38
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:36
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:48
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:68
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:54
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:75
STRING language_data_path_prefix
Definition: ccutil.h:72
STRING datadir
Definition: ccutil.h:69
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:44
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:75
int ambigs_debug_level
Definition: ccutil.h:87
bool use_ambigs_for_adaption
Definition: ccutil.h:89
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:50
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:168
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:79
const char * c_str() const
Definition: strngs.cpp:205
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
bool major_right_to_left() const
Definition: unicharset.cpp:992
int size() const
Definition: unicharset.h:341
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)

◆ init_tesseract_lm()

int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language,
TessdataManager *  mgr 
)

Definition at line 452 of file tessedit.cpp.

453 {
454 if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
455 nullptr, 0, nullptr, nullptr, false, mgr))
456 return -1;
458 getDict().Load(lang, mgr);
460 return 0;
461}
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
bool FinishLoad()
Definition: dict.cpp:351
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES *  word,
WERD_RES *  word2,
BlamerBundle *  orig_bb 
) const

Definition at line 234 of file tfacepp.cpp.

236 {
237 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
238 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
239 // Tack the word2 outputs onto the end of the word outputs.
240 word->chopped_word->blobs += word2->chopped_word->blobs;
241 word->rebuild_word->blobs += word2->rebuild_word->blobs;
242 word2->chopped_word->blobs.clear();
243 word2->rebuild_word->blobs.clear();
244 TPOINT split_pt;
245 split_pt.x = (prev_box.right() + blob_box.left()) / 2;
246 split_pt.y = (prev_box.top() + prev_box.bottom() +
247 blob_box.top() + blob_box.bottom()) / 4;
248 // Move the word2 seams onto the end of the word1 seam_array.
249 // Since the seam list is one element short, an empty seam marking the
250 // end of the last blob in the first word is needed first.
251 word->seam_array.push_back(new SEAM(0.0f, split_pt));
252 word->seam_array += word2->seam_array;
253 word2->seam_array.truncate(0);
254 // Fix widths and gaps.
255 word->blob_widths += word2->blob_widths;
256 word->blob_gaps += word2->blob_gaps;
257 // Fix the ratings matrix.
258 int rat1 = word->ratings->dimension();
259 int rat2 = word2->ratings->dimension();
260 word->ratings->AttachOnCorner(word2->ratings);
261 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
262 word->best_state += word2->best_state;
263 // Append the word choices.
264 *word->raw_choice += *word2->raw_choice;
265
266 // How many alt choices from each should we try to get?
267 const int kAltsPerPiece = 2;
268 // When do we start throwing away extra alt choices?
269 const int kTooManyAltChoices = 100;
270
271 // Construct the cartesian product of the best_choices of word(1) and word2.
272 WERD_CHOICE_LIST joined_choices;
273 WERD_CHOICE_IT jc_it(&joined_choices);
274 WERD_CHOICE_IT bc1_it(&word->best_choices);
275 WERD_CHOICE_IT bc2_it(&word2->best_choices);
276 int num_word1_choices = word->best_choices.length();
277 int total_joined_choices = num_word1_choices;
278 // Nota Bene: For the main loop here, we operate only on the 2nd and greater
279 // word2 choices, and put them in the joined_choices list. The 1st word2
280 // choice gets added to the original word1 choices in-place after we have
281 // finished with them.
282 int bc2_index = 1;
283 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
284 if (total_joined_choices >= kTooManyAltChoices &&
285 bc2_index > kAltsPerPiece)
286 break;
287 int bc1_index = 0;
288 for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
289 ++bc1_index, bc1_it.forward()) {
290 if (total_joined_choices >= kTooManyAltChoices &&
291 bc1_index > kAltsPerPiece)
292 break;
293 auto *wc = new WERD_CHOICE(*bc1_it.data());
294 *wc += *bc2_it.data();
295 jc_it.add_after_then_move(wc);
296 ++total_joined_choices;
297 }
298 }
299 // Now that we've filled in as many alternates as we want, paste the best
300 // choice for word2 onto the original word alt_choices.
301 bc1_it.move_to_first();
302 bc2_it.move_to_first();
303 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
304 *bc1_it.data() += *bc2_it.data();
305 }
306 bc1_it.move_to_last();
307 bc1_it.add_list_after(&joined_choices);
308
309 // Restore the pointer to original blamer bundle and combine blamer
310 // information recorded in the splits.
311 if (orig_bb != nullptr) {
312 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
314 delete word->blamer_bundle;
315 word->blamer_bundle = orig_bb;
316 }
317 word->SetupBoxWord();
318 word->reject_map.initialise(word->box_word->length());
319 delete word2;
320}
T & back() const
void truncate(int size)
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:233
Definition: blobs.h:51
int16_t x
Definition: blobs.h:93
int16_t y
Definition: blobs.h:94
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:553
GenericVector< int > blob_widths
Definition: pageres.h:216
void SetupBoxWord()
Definition: pageres.cpp:849
WERD_CHOICE * raw_choice
Definition: pageres.h:246
GenericVector< int > blob_gaps
Definition: pageres.h:219

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK &  block,
ROW *  row,
WERD_RES *  word,
PointerVector< WERD_RES > *  words 
)

Definition at line 229 of file linerec.cpp.

230 {
231 TBOX word_box = word->word->bounding_box();
232 // Get the word image - no frills.
235 // In single word mode, use the whole image without any other row/word
236 // interpretation.
237 word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238 } else {
239 float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240 if (baseline + row->descenders() < word_box.bottom())
241 word_box.set_bottom(baseline + row->descenders());
242 if (baseline + row->x_height() + row->ascenders() > word_box.top())
243 word_box.set_top(baseline + row->x_height() + row->ascenders());
244 }
245 ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
246 if (im_data == nullptr) return;
247
248 bool do_invert = tessedit_do_invert;
249 lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
251 word_box, words, lstm_choice_mode);
252 delete im_data;
253 SearchWords(words);
254}
const float kWorstDictCertainty
Definition: linerec.cpp:38
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:174
@ PSM_RAW_LINE
Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
Definition: publictypes.h:179
const float kCertaintyScale
Definition: linerec.cpp:36
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:259
float descenders() const
Definition: ocrrow.h:85
float base_line(float xpos) const
Definition: ocrrow.h:59
float ascenders() const
Definition: ocrrow.h:82
void set_bottom(int y)
Definition: rect.h:68
void set_top(int y)
Definition: rect.h:61
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES *  word,
ROW *  row,
int16_t  pass 
)

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW *  row,
BLOCK *  block 
)

Definition at line 223 of file fixspace.cpp.

224 {
225 WERD_RES_IT word_it(&words);
226 WERD_RES *word;
227 // Since we are not using PAGE_RES to iterate over words, we need to update
228 // prev_word_best_choice_ before calling classify_word_pass2().
229 prev_word_best_choice_ = nullptr;
230 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231 word = word_it.data();
232 if ((!word->part_of_combo) && (word->box_word == nullptr)) {
233 WordData word_data(block, row, word);
234 SetupWordPassN(2, &word_data);
235 classify_word_and_language(2, nullptr, &word_data);
236 }
238 }
239}

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES *  word,
ROW *  row,
BLOCK *  block 
)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1630 of file control.cpp.

1631 {
1632 if (word->tess_failed) return;
1633 tess_segment_pass_n(pass_n, word);
1634
1635 if (!word->tess_failed) {
1636 if (!word->word->flag (W_REP_CHAR)) {
1637 word->fix_quotes();
1639 word->fix_hyphens();
1640 /* Don't trust fix_quotes! - though I think I've fixed the bug */
1641 if (word->best_choice->length() != word->box_word->length()) {
1642 tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1643 " #Blobs=%d\n",
1644 word->best_choice->debug_string().string(),
1645 word->best_choice->length(),
1646 word->box_word->length());
1647
1648 }
1649 word->tess_accepted = tess_acceptable_word(word);
1650
1651 // Also sets word->done flag
1652 make_reject_map(word, row, pass_n);
1653 }
1654 }
1655 set_word_fonts(word);
1656
1657 ASSERT_HOST(word->raw_choice != nullptr);
1658}
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1962
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
void fix_hyphens()
Definition: pageres.cpp:1047
void fix_quotes()
Definition: pageres.cpp:1018
const STRING debug_string() const
Definition: ratngs.h:495

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK *  block,
ROW *  row,
WERD_RES *  word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 243 of file applybox.cpp.

245 {
246 if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
251 row, block)) {
252 word_res->CloneChoppedToRebuild();
253 return;
254 }
255 if (chop_debug) {
256 tprintf("Maximally chopping word at:");
257 word_res->word->bounding_box().print();
258 }
259 GenericVector<BLOB_CHOICE*> blob_choices;
260 ASSERT_HOST(!word_res->chopped_word->blobs.empty());
261 auto rating = static_cast<float>(INT8_MAX);
262 for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
263 // The rating and certainty are not quite arbitrary. Since
264 // select_blob_to_chop uses the worst certainty to choose, they all have
265 // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
266 // in here, and then divide by e each time they are chopped, which
267 // should guarantee a set of unequal values for the whole tree of blobs
268 // produced, however much chopping is required. The chops are thus only
269 // limited by the ability of the chopper to find suitable chop points,
270 // and not by the value of the certainties.
271 auto* choice =
272 new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
273 blob_choices.push_back(choice);
274 rating -= 0.125f;
275 }
276 const double e = exp(1.0); // The base of natural logs.
277 int blob_number;
278 int right_chop_index = 0;
280 // We only chop if the language is not fixed pitch like CJK.
281 SEAM* seam = nullptr;
282 while ((seam = chop_one_blob(boxes, blob_choices, word_res,
283 &blob_number)) != nullptr) {
284 word_res->InsertSeam(blob_number, seam);
285 BLOB_CHOICE* left_choice = blob_choices[blob_number];
286 rating = left_choice->rating() / e;
287 left_choice->set_rating(rating);
288 left_choice->set_certainty(-rating);
289 // combine confidence w/ serial #
290 auto* right_choice = new BLOB_CHOICE(++right_chop_index,
291 rating - 0.125f, -rating, -1,
292 0.0f, 0.0f, 0.0f, BCC_FAKE);
293 blob_choices.insert(right_choice, blob_number + 1);
294 }
295 }
296 word_res->CloneChoppedToRebuild();
297 word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
298}
@ BCC_FAKE
Definition: ratngs.h:48
void insert(const T &t, int index)
void CloneChoppedToRebuild()
Definition: pageres.cpp:835
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:877
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
void set_rating(float newrat)
Definition: ratngs.h:144
float rating() const
Definition: ratngs.h:80
void set_certainty(float newrat)
Definition: ratngs.h:147
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:371
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:225

◆ mutable_pix_binary()

Pix ** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 198 of file tesseractclass.h.

198 {
199 pixDestroy(&pix_binary_);
200 return &pix_binary_;
201 }

◆ mutable_textord()

Textord * tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 271 of file tesseractclass.h.

271 {
272 return &textord_;
273 }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES *  word,
ROW *  row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES *  word,
ROW *  row 
)

◆ noise_outlines()

bool tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 980 of file docqual.cpp.

980 {
981 TBOX box; // BB of outline
982 int16_t outline_count = 0;
983 int16_t small_outline_count = 0;
984 int16_t max_dimension;
985 float small_limit = kBlnXHeight * crunch_small_outlines_size;
986
987 for (int b = 0; b < word->NumBlobs(); ++b) {
988 TBLOB* blob = word->blobs[b];
989 for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
990 outline_count++;
991 box = ol->bounding_box();
992 if (box.height() > box.width())
993 max_dimension = box.height();
994 else
995 max_dimension = box.width();
996 if (max_dimension < small_limit)
997 small_outline_count++;
998 }
999 }
1000 return small_outline_count >= outline_count;
1001}

◆ non_0_digit()

bool tesseract::Tesseract::non_0_digit ( const UNICHARSET &  ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 790 of file reject.cpp.

790 {
791 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
792}
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687

◆ non_O_upper()

bool tesseract::Tesseract::non_O_upper ( const UNICHARSET &  ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 786 of file reject.cpp.

786 {
787 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
788}

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 278 of file tesseractclass.h.

278 {
279 return sub_langs_.size();
280 }

◆ one_ell_conflict()

bool tesseract::Tesseract::one_ell_conflict ( WERD_RES *  word_res,
bool  update_map 
)

Definition at line 293 of file reject.cpp.

293 {
294 const char *word;
295 const char *lengths;
296 int16_t word_len; //its length
297 int16_t first_alphanum_index_;
298 int16_t first_alphanum_offset_;
299 int16_t i;
300 int16_t offset;
301 bool non_conflict_set_char; //non conf set a/n?
302 bool conflict = false;
303 bool allow_1s;
304 ACCEPTABLE_WERD_TYPE word_type;
305 bool dict_perm_type;
306 bool dict_word_ok;
307 int dict_word_type;
308
309 word = word_res->best_choice->unichar_string().string ();
310 lengths = word_res->best_choice->unichar_lengths().string();
311 word_len = strlen(lengths);
312 /*
313 If there are no occurrences of the conflict set characters then the word
314 is OK.
315 */
316 if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
317 return false;
318
319 /*
320 There is a conflict if there are NO other (confirmed) alphanumerics apart
321 from those in the conflict set.
322 */
323
324 for (i = 0, offset = 0, non_conflict_set_char = false;
325 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326 non_conflict_set_char =
327 (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328 word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329 !STRING (conflict_set_I_l_1).contains (word[offset]);
330 if (!non_conflict_set_char) {
331 if (update_map)
332 reject_I_1_L(word_res);
333 return true;
334 }
335
336 /*
337 If the word is accepted by a dawg permuter, and the first alpha character
338 is "I" or "l", check to see if the alternative is also a dawg word. If it
339 is, then there is a potential error otherwise the word is ok.
340 */
341
342 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343 (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346 (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347 dict_word_type = dict_word(*(word_res->best_choice));
348 dict_word_ok = (dict_word_type > 0) &&
349 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350
351 if ((rej_1Il_use_dict_word && dict_word_ok) ||
352 (rej_1Il_trust_permuter_type && dict_perm_type) ||
353 (dict_perm_type && dict_word_ok)) {
354 first_alphanum_index_ = first_alphanum_index (word, lengths);
355 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
356 if (lengths[first_alphanum_index_] == 1 &&
357 word[first_alphanum_offset_] == 'I') {
358 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359 if (safe_dict_word(word_res) > 0) {
360 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361 if (update_map)
362 word_res->reject_map[first_alphanum_index_].
363 setrej_1Il_conflict();
364 return true;
365 }
366 else {
367 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368 return false;
369 }
370 }
371
372 if (lengths[first_alphanum_index_] == 1 &&
373 word[first_alphanum_offset_] == 'l') {
374 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375 if (safe_dict_word(word_res) > 0) {
376 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377 if (update_map)
378 word_res->reject_map[first_alphanum_index_].
379 setrej_1Il_conflict();
380 return true;
381 }
382 else {
383 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384 return false;
385 }
386 }
387 return false;
388 }
389
390 /*
391 NEW 1Il code. The old code relied on permuter types too much. In fact,
392 tess will use TOP_CHOICE permute for good things like "palette".
393 In this code the string is examined independently to see if it looks like
394 a well formed word.
395 */
396
397 /*
398 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399 dictionary word.
400 */
401 first_alphanum_index_ = first_alphanum_index (word, lengths);
402 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403 if (lengths[first_alphanum_index_] == 1 &&
404 word[first_alphanum_offset_] == 'l') {
405 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406 if (safe_dict_word(word_res) > 0)
407 return false;
408 else
409 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410 }
411 else if (lengths[first_alphanum_index_] == 1 &&
412 word[first_alphanum_offset_] == 'I') {
413 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414 if (safe_dict_word(word_res) > 0)
415 return false;
416 else
417 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418 }
419 /*
420 For strings containing digits:
421 If there are no alphas OR the numeric permuter liked the word,
422 reject any non 1 conflict chs
423 Else reject all conflict chs
424 */
425 if (word_contains_non_1_digit (word, lengths)) {
426 allow_1s = (alpha_count (word, lengths) == 0) ||
427 (word_res->best_choice->permuter () == NUMBER_PERM);
428
429 int16_t offset;
430 conflict = false;
431 for (i = 0, offset = 0; word[offset] != '\0';
432 offset += word_res->best_choice->unichar_lengths()[i++]) {
433 if ((!allow_1s || (word[offset] != '1')) &&
434 STRING (conflict_set_I_l_1).contains (word[offset])) {
435 if (update_map)
436 word_res->reject_map[i].setrej_1Il_conflict ();
437 conflict = true;
438 }
439 }
440 return conflict;
441 }
442 /*
443 For anything else. See if it conforms to an acceptable word type. If so,
444 treat accordingly.
445 */
446 word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
448 first_alphanum_index_ = first_alphanum_index (word, lengths);
449 first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451 if (update_map)
452 word_res->reject_map[first_alphanum_index_].
453 setrej_1Il_conflict ();
454 return true;
455 }
456 else
457 return false;
458 }
459 else if (word_type == AC_UPPER_CASE) {
460 return false;
461 }
462 else {
463 if (update_map)
464 reject_I_1_L(word_res);
465 return true;
466 }
467}
@ DOC_DAWG_PERM
Definition: ratngs.h:242
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:470
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:483
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:194
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:510
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 36 of file output.cpp.

38 {
39 BLOCK_RES *block_of_last_word;
40 bool force_eol; //During output
41 BLOCK *nextblock; //block of next word
42 WERD *nextword; //next word
43
44 page_res_it.restart_page ();
45 block_of_last_word = nullptr;
46 while (page_res_it.word () != nullptr) {
47 check_debug_pt (page_res_it.word (), 120);
48
49 if (target_word_box) {
50 TBOX current_word_box = page_res_it.word()->word->bounding_box();
51 FCOORD center_pt(
52 (current_word_box.right() + current_word_box.left()) / 2,
53 (current_word_box.bottom() + current_word_box.top()) / 2);
54 if (!target_word_box->contains(center_pt)) {
55 page_res_it.forward();
56 continue;
57 }
58 }
60 block_of_last_word != page_res_it.block ()) {
61 block_of_last_word = page_res_it.block ();
62 }
63
65 (page_res_it.block () != page_res_it.next_block ())) ||
66 (page_res_it.next_word () == nullptr);
67
68 if (page_res_it.next_word () != nullptr)
69 nextword = page_res_it.next_word ()->word;
70 else
71 nextword = nullptr;
72 if (page_res_it.next_block () != nullptr)
73 nextblock = page_res_it.next_block ()->block;
74 else
75 nextblock = nullptr;
76 //regardless of tilde crunching
77 write_results(page_res_it,
78 determine_newline_type(page_res_it.word()->word,
79 page_res_it.block()->block,
80 nextword, nextblock), force_eol);
81 page_res_it.forward();
82 }
83}
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:213
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:98
WERD_RES * next_word() const
Definition: pageres.h:763
BLOCK_RES * next_block() const
Definition: pageres.h:769

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 254 of file tessedit.cpp.

256 { // Splits lang_str on '+' into language codes: a code prefixed with '~' is appended to not_to_load, any other code to to_load.
257 STRING remains(lang_str); // unparsed tail of the input; replaced by a shorter tail on every iteration
258 while (remains.length() > 0) {
259 // Find the start of the lang code and which vector to add to.
260 const char* start = remains.string();
261 while (*start == '+') ++start; // skip any run of leading '+' separators
262 GenericVector<STRING>* target = to_load;
263 if (*start == '~') { // a leading '~' redirects this code to not_to_load
264 target = not_to_load;
265 ++start;
266 }
267 // Find the index of the end of the lang code in string start.
268 int end = strlen(start);
269 const char* plus = strchr(start, '+');
270 if (plus != nullptr && plus - start < end) end = plus - start;
271 STRING lang_code(start);
272 lang_code.truncate_at(end);
273 STRING next(start + end); // begins at the next '+' if there is one, otherwise empty
274 remains = next;
275 // Add lang_code to the target vector unless IsStrInList reports it is already there.
276 if (!IsStrInList(lang_code, *target)) { // NOTE(review): lang_code is empty when only separators were left (e.g. input ending in '+', or a lone '~'); it is still pushed here - confirm callers tolerate an empty entry
277 target->push_back(lang_code);
278 }
279 }
280}

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 378 of file pgedit.cpp.

378 {
379 current_page_res = page_res;
380 if (current_page_res->block_res_list.empty())
381 return;
382
383 recog_done = false;
384 stillRunning = true;
385
386 build_image_window(width, height);
387 word_display_mode.turn_on_bit(DF_EDGE_STEP);
389#ifndef GRAPHICS_DISABLED
390 pe = new ParamsEditor(this, image_win);
391#endif
392 PGEventHandler pgEventHandler(this);
393
394 image_win->AddEventHandler(&pgEventHandler);
395 image_win->AddMessageBox();
396
397 SVMenuNode* svMenuRoot = build_menu_new();
398
399 svMenuRoot->BuildMenu(image_win);
400 image_win->SetVisible(true);
401
402 image_win->AwaitEvent(SVET_DESTROY);
403 image_win->AddEventHandler(nullptr);
404}
@ DF_EDGE_STEP
Edge steps.
Definition: werd.h:49
@ SVET_DESTROY
Definition: scrollview.h:46
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:298
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:928
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:349
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:32
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
void AddMessageBox()
Definition: scrollview.cpp:578
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:414
void SetVisible(bool visible)
Definition: scrollview.cpp:549
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:120

◆ pix_binary()

Pix * tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 202 of file tesseractclass.h.

202 { // Accessor: returns the pix_binary_ member pointer as stored; nothing is copied or cloned here.
203 return pix_binary_;
204 }

◆ pix_grey()

Pix * tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 205 of file tesseractclass.h.

205 { // Accessor: returns the pix_grey_ member pointer as stored; nothing is copied or cloned here.
206 return pix_grey_;
207 }

◆ pix_original()

Pix * tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 212 of file tesseractclass.h.

212 { // Accessor: returns the pix_original_ member pointer as stored; nothing is copied or cloned here.
213 return pix_original_;
214 }

◆ potential_word_crunch()

bool tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 541 of file docqual.cpp.

543 {
544 float rating_per_ch;
545 int adjusted_len;
546 const char *str = word->best_choice->unichar_string().string();
547 const char *lengths = word->best_choice->unichar_lengths().string();
548 bool word_crunchable;
549 int poor_indicator_count = 0;
550
551 word_crunchable = !crunch_leave_accept_strings ||
552 word->reject_map.length() < 3 ||
554 str, lengths) == AC_UNACCEPTABLE &&
555 !ok_dict_word);
556
557 adjusted_len = word->reject_map.length();
558 if (adjusted_len > 10)
559 adjusted_len = 10;
560 rating_per_ch = word->best_choice->rating() / adjusted_len;
561
562 if (rating_per_ch > crunch_pot_poor_rate) {
563 if (crunch_debug > 2) {
564 tprintf("Potential poor rating on \"%s\"\n",
566 }
567 poor_indicator_count++;
568 }
569
570 if (word_crunchable &&
572 if (crunch_debug > 2) {
573 tprintf("Potential poor cert on \"%s\"\n",
575 }
576 poor_indicator_count++;
577 }
578
579 if (garbage_level != G_OK) {
580 if (crunch_debug > 2) {
581 tprintf("Potential garbage on \"%s\"\n",
583 }
584 poor_indicator_count++;
585 }
586 return poor_indicator_count >= crunch_pot_indicators;
587}

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 181 of file applybox.cpp.

181 { // Replaces the x-height of every outlying row in block_list with the median x-height.
182 const double median_xheight = MedianXHeight(block_list);
183 const double max_deviation = kMaxXHeightDeviationFraction * median_xheight; // allowed absolute distance from the median
184 // Visit every row of every block and overwrite x-heights that stray too far from the median.
185 BLOCK_IT b_it(block_list);
186 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187 BLOCK* block = b_it.data();
188 ROW_IT r_it(block->row_list());
189 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190 ROW* row = r_it.data();
191 const double diff = fabs(row->x_height() - median_xheight);
192 if (diff > max_deviation) { // strictly greater: a row exactly at the limit is left alone
193 if (applybox_debug) {
194 tprintf("row xheight=%g, but median xheight = %g\n",
195 row->x_height(), median_xheight);
196 }
197 row->set_x_height(static_cast<float>(median_xheight)); // median is computed as double; the setter takes float
198 }
199 }
200 }
201}
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:34
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:116
void set_x_height(float new_xheight)
Definition: ocrrow.h:67

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 631 of file tesseractclass.cpp.

631 {
633 // Find the max splitter strategy over all langs.
634 auto max_pageseg_strategy =
636 static_cast<int32_t>(pageseg_devanagari_split_strategy));
637 for (int i = 0; i < sub_langs_.size(); ++i) {
638 auto pageseg_strategy =
640 static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
641 if (pageseg_strategy > max_pageseg_strategy)
642 max_pageseg_strategy = pageseg_strategy;
643 pixDestroy(&sub_langs_[i]->pix_binary_);
644 sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
645 }
646 // Perform shiro-rekha (top-line) splitting and replace the current image by
647 // the newly split image.
648 splitter_.set_orig_pix(pix_binary());
649 splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
650 if (splitter_.Split(true, &pixa_debug_)) {
651 ASSERT_HOST(splitter_.splitted_image());
652 pixDestroy(&pix_binary_);
653 pix_binary_ = pixClone(splitter_.splitted_image());
654 }
655}
Pix * pix_binary() const
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract osd_tess,
OSResults osr 
)

Definition at line 662 of file tesseractclass.cpp.

663 {
664 // Find the max splitter strategy over all langs.
665 auto max_ocr_strategy =
667 static_cast<int32_t>(ocr_devanagari_split_strategy));
668 for (int i = 0; i < sub_langs_.size(); ++i) {
669 auto ocr_strategy =
671 static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
672 if (ocr_strategy > max_ocr_strategy)
673 max_ocr_strategy = ocr_strategy;
674 }
675 // Utilize the segmentation information available.
676 splitter_.set_segmentation_block_list(block_list);
677 splitter_.set_ocr_split_strategy(max_ocr_strategy);
678 // Run the splitter for OCR
679 bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
680 // Restore pix_binary to the binarized original pix for future reference.
681 ASSERT_HOST(splitter_.orig_pix());
682 pixDestroy(&pix_binary_);
683 pix_binary_ = pixClone(splitter_.orig_pix());
684 // If the pageseg and ocr strategies are different, refresh the block list
685 // (from the last SegmentImage call) with blobs from the real image to be used
686 // for OCR.
687 if (splitter_.HasDifferentSplitStrategies()) {
688 BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
689 pixGetHeight(pix_binary_));
690 Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
691 splitter_.orig_pix();
692 extract_edges(pix_for_ocr, &block);
693 splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
694 }
695 // The splitter isn't needed any more after this, so save memory by clearing.
696 splitter_.Clear();
697}
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:329
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void set_segmentation_block_list(BLOCK_LIST *block_list)
void set_ocr_split_strategy(SplitStrategy strategy)

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 38 of file par_control.cpp.

38 { // Collects one BlobData per chopped blob of the selected words, then runs classify_blob on each and stores the result through its choices pointer.
39 // Prepare all the blobs. (Listing line 40, which is not shown here, presumably declares the 'blobs' vector used below - confirm against par_control.cpp.)
41 for (int w = 0; w < words.size(); ++w) {
42 if (words[w].word->ratings != nullptr && // only words that have a ratings matrix...
43 words[w].word->ratings->get(0, 0) == nullptr) { // ...whose (0, 0) entry is still null
44 for (int s = 0; s < words[w].lang_words.size(); ++s) {
45 Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this; // index s picks sub_langs_[s]; an index past the end of sub_langs_ falls back to this instance
46 const WERD_RES& word = *words[w].lang_words[s];
47 for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
48 blobs.push_back(BlobData(b, sub, word));
49 }
50 }
51 }
52 }
53 // Pre-classify all the blobs.
54 if (tessedit_parallelize > 1) { // the pragma below only exists when _OPENMP is defined; otherwise this branch is the same plain loop as the else branch
55#ifdef _OPENMP
56#pragma omp parallel for num_threads(10)
57#endif // _OPENMP
58 for (int b = 0; b < blobs.size(); ++b) {
59 *blobs[b].choices =
60 blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
61 }
62 } else { // identical loop body to the branch above, without the OpenMP pragma
63 // TODO(AMD) parallelize this.
64 for (int b = 0; b < blobs.size(); ++b) {
65 *blobs[b].choices =
66 blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
67 }
68 }
69}
@ White
Definition: callcpp.h:29

◆ process_cmd_win_event()

bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

Definition at line 415 of file pgedit.cpp.

418 {
419 char msg[160];
420 bool exit = false;
421
422 color_mode = CM_RAINBOW;
423
424 // Run recognition on the full page if needed.
425 switch (cmd_event) {
426 case BLAMER_CMD_EVENT:
436 if (!recog_done) {
437 recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
438 recog_done = true;
439 }
440 break;
441 default:
442 break;
443 }
444
445 char* parameter;
446
447 switch (cmd_event) {
448 case NULL_CMD_EVENT:
449 break;
450
455 case RECOG_WERDS:
456 case RECOG_PSEUDO:
458 mode =static_cast<CMD_EVENTS>(cmd_event);
459 break;
462 parameter = image_win->ShowInputDialog("Config File Name");
463 word_config_ = parameter;
464 delete[] parameter;
465 break;
467 if (new_value[0] == 'T')
468 word_display_mode.turn_on_bit(DF_BOX);
469 else
470 word_display_mode.turn_off_bit(DF_BOX);
472 break;
473 case BLAMER_CMD_EVENT:
474 if (new_value[0] == 'T')
475 word_display_mode.turn_on_bit(DF_BLAMER);
476 else
477 word_display_mode.turn_off_bit(DF_BLAMER);
480 break;
482 if (new_value[0] == 'T')
483 word_display_mode.turn_on_bit(DF_TEXT);
484 else
485 word_display_mode.turn_off_bit(DF_TEXT);
487 break;
489 if (new_value[0] == 'T')
490 word_display_mode.turn_on_bit(DF_POLYGONAL);
491 else
492 word_display_mode.turn_off_bit(DF_POLYGONAL);
494 break;
496 if (new_value[0] == 'T')
497 word_display_mode.turn_on_bit(DF_BN_POLYGONAL);
498 else
499 word_display_mode.turn_off_bit(DF_BN_POLYGONAL);
501 break;
502 case BITMAP_CMD_EVENT:
503 if (new_value[0] == 'T')
504 word_display_mode.turn_on_bit(DF_EDGE_STEP);
505 else
506 word_display_mode.turn_off_bit(DF_EDGE_STEP);
508 break;
511 break;
512 case IMAGE_CMD_EVENT:
513 display_image =(new_value[0] == 'T');
515 break;
516 case BLOCKS_CMD_EVENT:
517 display_blocks =(new_value[0] == 'T');
519 break;
521 display_baselines =(new_value[0] == 'T');
523 break;
525 color_mode = CM_SUBSCRIPT;
527 break;
529 color_mode = CM_SUPERSCRIPT;
531 break;
533 color_mode = CM_ITALIC;
535 break;
537 color_mode = CM_BOLD;
539 break;
541 color_mode = CM_UNDERLINE;
543 break;
545 color_mode = CM_FIXEDPITCH;
547 break;
549 color_mode = CM_SERIF;
551 break;
553 color_mode = CM_SMALLCAPS;
555 break;
557 color_mode = CM_DROPCAPS;
559 break;
562 break;
563 case QUIT_CMD_EVENT:
564 exit = true;
566 break;
567
568 default:
569 snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
570 cmd_event, new_value);
571 image_win->AddMessage(msg);
572 break;
573 }
574 return exit;
575}
CMD_EVENTS
Definition: pgedit.cpp:47
@ NULL_CMD_EVENT
Definition: pgedit.cpp:48
@ CM_RAINBOW
Definition: pgedit.cpp:81
@ CM_DROPCAPS
Definition: pgedit.cpp:90
@ CM_BOLD
Definition: pgedit.cpp:85
@ CM_UNDERLINE
Definition: pgedit.cpp:86
@ CM_SUBSCRIPT
Definition: pgedit.cpp:82
@ CM_SUPERSCRIPT
Definition: pgedit.cpp:83
@ CM_ITALIC
Definition: pgedit.cpp:84
@ CM_SERIF
Definition: pgedit.cpp:88
@ CM_SMALLCAPS
Definition: pgedit.cpp:89
@ CM_FIXEDPITCH
Definition: pgedit.cpp:87
@ DF_BN_POLYGONAL
BL normalised polygonal approximation.
Definition: werd.h:50
@ DF_TEXT
Correct ascii.
Definition: werd.h:47
@ DF_BOX
Bounding box.
Definition: werd.h:46
@ DF_BLAMER
Blamer information.
Definition: werd.h:51
@ DF_POLYGONAL
Polygonal approximation.
Definition: werd.h:48
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:740
void turn_off_bit(uint8_t bit_num)
Definition: bits16.h:37
static void Exit()
Definition: scrollview.cpp:583
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:733
void AddMessage(const char *format,...)
Definition: scrollview.cpp:561

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 587 of file pgedit.cpp.

588 { // Handles an image-window event: on SVET_SELECTION it builds a selection box from the event and dispatches on the current 'mode'; every other event type is ignored.
589 // The following variable should remain static, since it is used by
590 // debug editor, which uses a single Tesseract instance.
591 static ICOORD down;
592 ICOORD up;
593 TBOX selection_box;
594 char msg[80]; // scratch buffer for the "not yet implemented" message in the inner default case
595
596 switch(event.type) {
597
598 case SVET_SELECTION:
599 if (event.type == SVET_SELECTION) { // always true inside this case label, so 'down' is rewritten on every selection event
600 down.set_x(event.x + event.x_size); // one corner: event position plus selection size
601 down.set_y(event.y + event.y_size);
602 if (mode == SHOW_POINT_CMD_EVENT)
603 show_point(current_page_res, event.x, event.y);
604 }
605
606 up.set_x(event.x); // other corner: the event position itself
607 up.set_y(event.y);
608
609 selection_box = TBOX(down, up);
610
611 switch(mode) { // NOTE: several case labels and arguments of this switch are missing from this listing (gaps in the line numbers)
614 current_page_res,
615 selection_box,
617 break;
619 process_selected_words(current_page_res,
620 selection_box,
622 break;
624 process_selected_words(current_page_res,
625 selection_box,
627 break;
629 debug_word(current_page_res, selection_box);
630 break;
632 break; // ignore up event
633
634 case RECOG_WERDS:
635 #ifndef DISABLED_LEGACY_ENGINE
636 image_win->AddMessage("Recogging selected words");
637 this->process_selected_words(current_page_res,
638 selection_box,
640 #endif // ndef DISABLED_LEGACY_ENGINE
641 break;
642 case RECOG_PSEUDO:
643 image_win->AddMessage("Recogging selected blobs");
644 recog_pseudo_word(current_page_res, selection_box);
645 break;
647 blob_feature_display(current_page_res, selection_box);
648 break;
649
650 default:
651 sprintf(msg, "Mode %d not yet implemented", mode);
652 image_win->AddMessage(msg);
653 break;
654 } // no break after the inner switch: control falls through into the outer default below, which only breaks
655 default:
656 break;
657 }
658}
@ SVET_SELECTION
Definition: scrollview.h:49
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:708
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:904
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:696
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:665
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:943
void set_x(int16_t xin)
rewrite function
Definition: points.h:61
void set_y(int16_t yin)
rewrite function
Definition: points.h:65
int x
Definition: scrollview.h:67
SVEventType type
Definition: scrollview.h:64
int x_size
Definition: scrollview.h:69
int y
Definition: scrollview.h:68
int y_size
Definition: scrollview.h:70

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES page_res,
TBOX selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

33 { // Calls word_processor on every word of page_res whose bounding box overlaps selection_box, stopping at the first call that returns false.
34 for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr;
35 page_res_it.forward()) {
36 WERD* word = page_res_it.word()->word;
37 if (word->bounding_box().overlap(selection_box)) {
38 if (!(this->*word_processor)(&page_res_it)) // member-function pointer invoked on this instance; a false result ends the whole walk
39 return;
40 }
41 }
42}
bool overlap(const TBOX &box) const
Definition: rect.h:355

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 120 of file control.cpp.

123 {
124 if (word_config != nullptr) {
125 if (word_box.major_overlap(target_word_box)) {
126 if (backup_config_file_ == nullptr) {
127 backup_config_file_ = kBackUpConfigFile;
128 FILE* config_fp = fopen(backup_config_file_, "wb");
129 if (config_fp == nullptr) {
130 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
131 } else {
132 ParamUtils::PrintParams(config_fp, params());
133 fclose(config_fp);
134 }
135 ParamUtils::ReadParamsFile(word_config,
137 params());
138 }
139 } else {
140 if (backup_config_file_ != nullptr) {
141 ParamUtils::ReadParamsFile(backup_config_file_,
143 params());
144 backup_config_file_ = nullptr;
145 }
146 }
147 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
148 return false;
149 }
150 return true;
151}
const char *const kBackUpConfigFile
Definition: control.cpp:48
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:37
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 138 of file docqual.cpp.

139 {
140 if ((tessedit_good_quality_unrej && good_quality_doc))
141 unrej_good_quality_words(page_res_it);
142 doc_and_block_rejection(page_res_it, good_quality_doc);
144 tilde_crunch(page_res_it);
145 tilde_delete(page_res_it);
146 }
147}
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:589
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:417
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:232
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:160

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 48 of file tessedit.cpp.

49 { // Picks a path for 'filename' (datadir + "configs/", then datadir + "tessconfigs/", then the name as given) and loads it with ParamUtils::ReadParamsFile.
50 STRING path = datadir;
51 path += "configs/";
52 path += filename;
53 FILE* fp; // used only to probe whether a candidate path can be opened; it is closed again immediately
54 if ((fp = fopen(path.string(), "rb")) != nullptr) { // first choice: <datadir>configs/<filename>
55 fclose(fp);
56 } else {
57 path = datadir;
58 path += "tessconfigs/";
59 path += filename;
60 if ((fp = fopen(path.string(), "rb")) != nullptr) { // second choice: <datadir>tessconfigs/<filename>
61 fclose(fp);
62 } else {
63 path = filename; // last resort: use the caller's string unchanged; it is not probed here
64 }
65 }
66 ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); // return value is not checked in this function
67}

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 945 of file control.cpp.

946 { // Tries to attach the word's rejected (noise) outlines back onto it as diacritics; returns true if either usage counter below ends up non-zero.
947 *make_next_word_fuzzy = false;
948 WERD* real_word = pr_it->word()->word;
949 if (real_word->rej_cblob_list()->empty() || // nothing rejected, so nothing to reassign
950 real_word->cblob_list()->empty() || // word has no blobs
951 real_word->rej_cblob_list()->length() > noise_maxperword) // more rejected blobs than noise_maxperword allows
952 return false;
953 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
954 // Get the noise outlines into a vector with matching bool map. (Listing line 955, not shown here, presumably declares 'outlines' - confirm against control.cpp.)
956 real_word->GetNoiseOutlines(&outlines);
957 GenericVector<bool> word_wanted;
958 GenericVector<bool> overlapped_any_blob;
959 GenericVector<C_BLOB*> target_blobs;
960 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
961 &word_wanted, &overlapped_any_blob,
962 &target_blobs);
963 // Filter the outlines that overlapped any blob and put them into the word
964 // now. This simplifies the remaining task and also makes it more accurate
965 // as it has more completed blobs to work on.
966 GenericVector<bool> wanted;
967 GenericVector<C_BLOB*> wanted_blobs;
968 GenericVector<C_OUTLINE*> wanted_outlines;
969 int num_overlapped = 0;
970 int num_overlapped_used = 0;
971 for (int i = 0; i < overlapped_any_blob.size(); ++i) {
972 if (overlapped_any_blob[i]) {
973 ++num_overlapped;
974 if (word_wanted[i]) ++num_overlapped_used;
975 wanted.push_back(word_wanted[i]);
976 wanted_blobs.push_back(target_blobs[i]);
977 wanted_outlines.push_back(outlines[i]);
978 outlines[i] = nullptr; // slot cleared once the pointer has been copied into wanted_outlines
979 }
980 }
981 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr); // return value ignored; no make_next_word_fuzzy output requested for this first batch
982 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
983 &target_blobs);
984 int non_overlapped = 0; // NOTE(review): not incremented on any line visible in this listing, so the debug message below would print 0 for it - confirm
985 int non_overlapped_used = 0;
986 for (int i = 0; i < word_wanted.size(); ++i) {
987 if (word_wanted[i]) ++non_overlapped_used;
988 if (outlines[i] != nullptr) ++non_overlapped_used; // NOTE(review): increments the same counter as the line above; possibly non_overlapped was intended - confirm, because the return value depends on non_overlapped_used
989 }
991 tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
992 num_overlapped_used, num_overlapped, non_overlapped_used,
993 non_overlapped);
994 real_word->bounding_box().print();
995 }
996 // Now we have decided which outlines we want, put them into the real_word.
997 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
998 make_next_word_fuzzy)) {
999 pr_it->MakeCurrentWordFuzzy();
1000 }
1001 // TODO(rays) Parts of combos have a deep copy of the real word, and need
1002 // to have their noise outlines moved/assigned in the same way!!
1003 return num_overlapped_used != 0 || non_overlapped_used != 0;
1004}
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1011
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1064
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1473
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:524
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:506

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES page_res,
ETEXT_DESC monitor,
const TBOX target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond, ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 302 of file control.cpp.

306 {
307 PAGE_RES_IT page_res_it(page_res);
308
310 tessedit_test_adaption.set_value (true);
311 tessedit_minimal_rejection.set_value (true);
312 }
313
314 if (dopasses==0 || dopasses==1) {
315 page_res_it.restart_page();
316 // ****************** Pass 1 *******************
317
318 #ifndef DISABLED_LEGACY_ENGINE
319 // If the adaptive classifier is full switch to one we prepared earlier,
320 // ie on the previous page. If the current adaptive classifier is non-empty,
321 // prepare a backup starting at this page, in case it fills up. Do all this
322 // independently for each language.
325 } else if (!AdaptiveClassifierIsEmpty()) {
327 }
328 // Now check the sub-langs as well.
329 for (int i = 0; i < sub_langs_.size(); ++i) {
330 if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
331 sub_langs_[i]->SwitchAdaptiveClassifier();
332 } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
333 sub_langs_[i]->StartBackupAdaptiveClassifier();
334 }
335 }
336
337 #endif // ndef DISABLED_LEGACY_ENGINE
338
339 // Set up all words ready for recognition, so that if parallelism is on
340 // all the input and output classes are ready to run the classifier.
342 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
343 #ifndef DISABLED_LEGACY_ENGINE
345 PrerecAllWordsPar(words);
346 }
347 #endif // ndef DISABLED_LEGACY_ENGINE
348
349 stats_.word_count = words.size();
350
351 stats_.dict_words = 0;
352 stats_.doc_blob_quality = 0;
353 stats_.doc_outline_errs = 0;
354 stats_.doc_char_quality = 0;
355 stats_.good_char_count = 0;
356 stats_.doc_good_char_quality = 0;
357
358 most_recently_used_ = this;
359 // Run pass 1 word recognition.
360 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
361 // Pass 1 post-processing.
362 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
363 page_res_it.forward()) {
364 if (page_res_it.word()->word->flag(W_REP_CHAR)) {
365 fix_rep_char(&page_res_it);
366 continue;
367 }
368
369 // Count dict words.
370 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
371 ++(stats_.dict_words);
372
373 // Update misadaption log (we only need to do it on pass 1, since
374 // adaption only happens on this pass).
375 if (page_res_it.word()->blamer_bundle != nullptr &&
376 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
377 page_res->misadaption_log.push_back(
378 page_res_it.word()->blamer_bundle->misadaption_debug());
379 }
380 }
381 }
382
383 if (dopasses == 1) return true;
384
385 #ifndef DISABLED_LEGACY_ENGINE
386
387 // ****************** Pass 2 *******************
389 AnyTessLang()) {
390 page_res_it.restart_page();
392 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
394 PrerecAllWordsPar(words);
395 }
396 most_recently_used_ = this;
397 // Run pass 2 word recognition.
398 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
399 }
400
401 // The next passes are only required for Tess-only.
402 if (AnyTessLang() && !AnyLSTMLang()) {
403 // ****************** Pass 3 *******************
404 // Fix fuzzy spaces.
406
409 fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
410
411 // ****************** Pass 4 *******************
414
415 // ****************** Pass 5,6 *******************
416 rejection_passes(page_res, monitor, target_word_box, word_config);
417
418 // ****************** Pass 8 *******************
419 font_recognition_pass(page_res);
420
421 // ****************** Pass 9 *******************
422 // Check the correctness of the final results.
423 blamer_pass(page_res);
424 script_pos_pass(page_res);
425 }
426
427 #endif // ndef DISABLED_LEGACY_ENGINE
428
429 // Write results pass.
431 // This is now redundant, but retained commented so show how to obtain
432 // bounding boxes and style information.
433
434 #ifndef DISABLED_LEGACY_ENGINE
435 // changed by jetsoft
436 // needed for dll to output memory structure
437 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
438 output_pass(page_res_it, target_word_box);
439 // end jetsoft
440 #endif //ndef DISABLED_LEGACY_ENGINE
441
442 const auto pageseg_mode = static_cast<PageSegMode>(
443 static_cast<int>(tessedit_pageseg_mode));
444 textord_.CleanupSingleRowResult(pageseg_mode, page_res);
445
446 // Remove empty words, as these mess up the result iterators.
447 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
448 page_res_it.forward()) {
449 const WERD_RES* word = page_res_it.word();
450 const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
451 ? page_res_it.block()->block->pdblk.poly_block()
452 : nullptr;
453 if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
454 (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
455 page_res_it.DeleteCurrentWord();
456 }
457 }
458
459 if (monitor != nullptr) {
460 monitor->progress = 100;
461 }
462 return true;
463}
#define LOC_FUZZY_SPACE
Definition: errcode.h:49
#define LOC_WRITE_RESULTS
Definition: errcode.h:53
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:467
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:154
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1706
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:612
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:36
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2093
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2037
bool AnyTessLang() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:75
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:734
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:213
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:38
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:710
bool IsText() const
Definition: polyblk.h:49
bool IsAllSpaces() const
Definition: ratngs.h:511
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:318

◆ recog_interactive()

bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT * pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 77 of file control.cpp.

77 {
78 int16_t char_qual;
79 int16_t good_char_qual;
80
81 WordData word_data(*pr_it);
82 SetupWordPassN(2, &word_data);
83 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
84 if (lstm_recognizer_ == nullptr) {
85#ifndef DISABLED_LEGACY_ENGINE
86 classify_word_and_language(2, pr_it, &word_data);
87#endif // ndef DISABLED_LEGACY_ENGINE
88 } else {
89 classify_word_and_language(1, pr_it, &word_data);
90 }
91#ifndef DISABLED_LEGACY_ENGINE
93 WERD_RES* word_res = pr_it->word();
94 word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
95 tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96 "char_quality: %d; good_char_quality: %d\n",
97 word_res->reject_map.length(),
98 word_blob_quality(word_res, pr_it->row()->row),
99 word_outline_errs(word_res), char_qual, good_char_qual);
100 }
101#endif // ndef DISABLED_LEGACY_ENGINE
102 return true;
103}
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 62 of file control.cpp.

63 {
64 PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
65 if (it != nullptr) {
68 delete it;
69 }
70}

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 84 of file recogtraining.cpp.

87 {
88 STRING box_fname = fname;
89 const char* lastdot = strrchr(box_fname.string(), '.');
90 if (lastdot != nullptr)
91 box_fname[lastdot - box_fname.string()] = '\0';
92 box_fname += ".box";
93 // ReadNextBox() will close box_file
94 FILE* box_file = fopen(box_fname.string(), "r");
95 if (box_file == nullptr) {
96 tprintf("Error: Could not open file %s\n", box_fname.string());
97 ASSERT_HOST(box_file);
98 }
99
100 PAGE_RES_IT page_res_it;
101 page_res_it.page_res = page_res;
102 page_res_it.restart_page();
103 STRING label;
104
105 // Process all the words on this page.
106 TBOX tbox; // tesseract-identified box
107 TBOX bbox; // box from the box file
108 bool keep_going;
109 int line_number = 0;
110 int examined_words = 0;
111 do {
112 keep_going = read_t(&page_res_it, &tbox);
113 keep_going &=
114 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
115 // Align bottom left points of the TBOXes.
116 while (keep_going &&
117 !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
118 if (bbox.bottom() < tbox.bottom()) {
119 page_res_it.forward();
120 keep_going = read_t(&page_res_it, &tbox);
121 } else {
122 keep_going =
123 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
124 }
125 }
126 while (keep_going &&
127 !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
128 if (bbox.left() > tbox.left()) {
129 page_res_it.forward();
130 keep_going = read_t(&page_res_it, &tbox);
131 } else {
132 keep_going =
133 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
134 }
135 }
136 // OCR the word if top right points of the TBOXes are similar.
137 if (keep_going &&
138 NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
139 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
140 ambigs_classify_and_output(label.string(), &page_res_it, output_file);
141 examined_words++;
142 }
143 page_res_it.forward();
144 } while (keep_going);
145
146 // Set up scripts on all of the words that did not get sent to
147 // ambigs_classify_and_output. They all should have, but if all the
148 // werd_res's don't get uch_sets, tesseract will crash when you try
149 // to iterate over them. :-(
150 int total_words = 0;
151 for (page_res_it.restart_page(); page_res_it.block() != nullptr;
152 page_res_it.forward()) {
153 if (page_res_it.word()) {
154 if (page_res_it.word()->uch_set == nullptr)
155 page_res_it.word()->SetupFake(unicharset);
156 total_words++;
157 }
158 }
159 if (examined_words < 0.85 * total_words) {
160 tprintf(
161 "TODO(antonova): clean up recog_training_segmented; "
162 " It examined only a small fraction of the ambigs image.\n");
163 }
164 tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
165 total_words);
166}
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:127
const int16_t kMaxBoxEdgeDiff
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES * word)

Definition at line 40 of file tfacepp.cpp.

40 {
41 if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
43 if (classify_debug_level) tprintf("No truth for word - skipping\n");
44 word->tess_failed = true;
45 return;
46 }
49 word->SetupBoxWord();
50 if (word->best_choice->length() != word->box_word->length()) {
51 tprintf("recog_word ASSERT FAIL String:\"%s\"; "
52 "Strlen=%d; #Blobs=%d\n",
54 word->best_choice->length(), word->box_word->length());
55 }
56 ASSERT_HOST(word->best_choice->length() == word->box_word->length());
57 // Check that the ratings matrix size matches the sum of all the
58 // segmentation states.
59 if (!word->StatesAllValid()) {
60 tprintf("Not all words have valid states relative to ratings matrix!!");
61 word->DebugWordChoices(true, nullptr);
63 }
65 /* Override the permuter type if a straight dictionary check disagrees. */
66 uint8_t perm_type = word->best_choice->permuter();
67 if ((perm_type != SYSTEM_DAWG_PERM) &&
68 (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
69 uint8_t real_dict_perm_type = dict_word(*word->best_choice);
70 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
71 (real_dict_perm_type == FREQ_DAWG_PERM) ||
72 (real_dict_perm_type == USER_DAWG_PERM)) &&
74 word->best_choice->unichar_lengths().string()) > 0)) {
75 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
76 }
77 }
79 perm_type != word->best_choice->permuter()) {
80 tprintf("Permuter Type Flipped from %d to %d\n",
81 perm_type, word->best_choice->permuter());
82 }
83 }
84 // Factored out from control.cpp
85 ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
86 if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
87 static_cast<int>(strspn(word->best_choice->unichar_string().string(),
88 " ")) == word->best_choice->length()) {
89 word->tess_failed = true;
90 word->reject_map.initialise(word->box_word->length());
92 } else {
93 word->tess_failed = false;
94 }
95}
@ IRR_NO_TRUTH
Definition: blamer.h:93
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
bool StatesAllValid()
Definition: pageres.cpp:458
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
void rej_word_tess_failure()
Definition: rejctmap.cpp:352
bool wordrec_skip_no_truth_words
Definition: wordrec.h:230

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word)

Definition at line 104 of file tfacepp.cpp.

104 {
105 int word_length = word->chopped_word->NumBlobs(); // no of blobs
106 if (word_length > MAX_UNDIVIDED_LENGTH) {
107 return split_and_recog_word(word);
108 }
109 cc_recog(word);
110 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
111
112 // Do sanity checks and minor fixes on best_choice.
113 if (word->best_choice->length() > word_length) {
114 word->best_choice->make_bad(); // should never happen
115 tprintf("recog_word: Discarded long string \"%s\""
116 " (%d characters vs %d blobs)\n",
118 word->best_choice->length(), word_length);
119 tprintf("Word is at:");
120 word->word->bounding_box().print();
121 }
122 if (word->best_choice->length() < word_length) {
123 UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
124 while (word->best_choice->length() < word_length) {
125 word->best_choice->append_unichar_id(space_id, 1, 0.0,
126 word->best_choice->certainty());
127 }
128 }
129}
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:29
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:138
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void cc_recog(WERD_RES *word)
Definition: tface.cpp:125

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC * monitor,
PAGE_RES_IT * pr_it,
GenericVector< WordData > *  words 
)

Definition at line 213 of file control.cpp.

215 {
216 // TODO(rays) Before this loop can be parallelized (it would yield a massive
217 // speed-up) all remaining member globals need to be converted to local/heap
218 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
219 // added. The results will be significantly different with adaption on, and
220 // deterioration will need investigation.
221 pr_it->restart_page();
222 for (int w = 0; w < words->size(); ++w) {
223 WordData* word = &(*words)[w];
224 if (w > 0) word->prev_word = &(*words)[w - 1];
225 if (monitor != nullptr) {
226 monitor->ocr_alive = true;
227 if (pass_n == 1) {
228 monitor->progress = 70 * w / words->size();
229 } else {
230 monitor->progress = 70 + 30 * w / words->size();
231 }
232 if (monitor->progress_callback2 != nullptr) {
233 TBOX box = pr_it->word()->word->bounding_box();
234 (*monitor->progress_callback2)(monitor, box.left(),
235 box.right(), box.top(), box.bottom());
236 }
237 if (monitor->deadline_exceeded() ||
238 (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
239 words->size()))) {
240 // Timeout. Fake out the rest of the words.
241 for (; w < words->size(); ++w) {
242 (*words)[w].word->SetupFake(unicharset);
243 }
244 return false;
245 }
246 }
247 if (word->word->tess_failed) {
248 int s;
249 for (s = 0; s < word->lang_words.size() &&
250 word->lang_words[s]->tess_failed; ++s) {}
251 // If all are failed, skip it. Image words are skipped by this test.
252 if (s > word->lang_words.size()) continue;
253 }
254 // Sync pr_it with the wth WordData.
255 while (pr_it->word() != nullptr && pr_it->word() != word->word)
256 pr_it->forward();
257 ASSERT_HOST(pr_it->word() != nullptr);
258 bool make_next_word_fuzzy = false;
259 #ifndef DISABLED_LEGACY_ENGINE
260 if (!AnyLSTMLang() &&
261 ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
262 // Needs to be setup again to see the new outlines in the chopped_word.
263 SetupWordPassN(pass_n, word);
264 }
265 #endif // ndef DISABLED_LEGACY_ENGINE
266
267 classify_word_and_language(pass_n, pr_it, word);
269 tprintf("Pass%d: %s [%s]\n", pass_n,
270 word->word->best_choice->unichar_string().string(),
271 word->word->best_choice->debug_string().string());
272 }
273 pr_it->forward();
274 if (make_next_word_fuzzy && pr_it->word() != nullptr) {
275 pr_it->MakeCurrentWordFuzzy();
276 }
277 }
278 return true;
279}
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:945
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:115

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( STRING image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES * word)

Definition at line 264 of file reject.cpp.

264 {
265 TBOX word_box = word->word->bounding_box();
266 // Use the box_word as it is already denormed back to image coordinates.
267 int blobcount = word->box_word->length();
268
269 if (word_box.left() < tessedit_image_border ||
270 word_box.bottom() < tessedit_image_border ||
271 word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272 word_box.top() + tessedit_image_border > ImageHeight() - 1) {
273 ASSERT_HOST(word->reject_map.length() == blobcount);
274 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275 TBOX blob_box = word->box_word->BlobBox(blobindex);
276 if (blob_box.left() < tessedit_image_border ||
277 blob_box.bottom() < tessedit_image_border ||
278 blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279 blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280 word->reject_map[blobindex].setrej_edge_char();
281 // Close to edge
282 }
283 }
284 }
285}
const TBOX & BlobBox(int index) const
Definition: boxword.h:84

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES * word)

Definition at line 194 of file reject.cpp.

194 {
195 int16_t i;
196 int16_t offset;
197
198 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199 offset += word->best_choice->unichar_lengths()[i], i += 1) {
201 contains (word->best_choice->unichar_string()[offset])) {
202 //rej 1Il conflict
203 word->reject_map[i].setrej_1Il_conflict ();
204 }
205 }
206}

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES * word)

Definition at line 574 of file reject.cpp.

574 {
575 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576
577 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
580}
void rej_word_mostly_rej()
Definition: rejctmap.cpp:406

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config 
)

Definition at line 612 of file control.cpp.

615 {
616 PAGE_RES_IT page_res_it(page_res);
617 // ****************** Pass 5 *******************
618 // Gather statistics on rejects.
619 int word_index = 0;
620 while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
622 WERD_RES* word = page_res_it.word();
623 word_index++;
624 if (monitor != nullptr) {
625 monitor->ocr_alive = true;
626 monitor->progress = 95 + 5 * word_index / stats_.word_count;
627 }
628 if (word->rebuild_word == nullptr) {
629 // Word was not processed by tesseract.
630 page_res_it.forward();
631 continue;
632 }
633 check_debug_pt(word, 70);
634
635 // changed by jetsoft
636 // specific to its needs to extract one word when need
637 if (target_word_box &&
639 *target_word_box, word_config, 4)) {
640 page_res_it.forward();
641 continue;
642 }
643 // end jetsoft
644
645 page_res_it.rej_stat_word();
646 const int chars_in_word = word->reject_map.length();
647 const int rejects_in_word = word->reject_map.reject_count();
648
649 const int blob_quality = word_blob_quality(word, page_res_it.row()->row);
650 stats_.doc_blob_quality += blob_quality;
651 const int outline_errs = word_outline_errs(word);
652 stats_.doc_outline_errs += outline_errs;
653 int16_t all_char_quality;
654 int16_t accepted_all_char_quality;
655 word_char_quality(word, page_res_it.row()->row,
656 &all_char_quality, &accepted_all_char_quality);
657 stats_.doc_char_quality += all_char_quality;
658 const uint8_t permuter_type = word->best_choice->permuter();
659 if ((permuter_type == SYSTEM_DAWG_PERM) ||
660 (permuter_type == FREQ_DAWG_PERM) ||
661 (permuter_type == USER_DAWG_PERM)) {
662 stats_.good_char_count += chars_in_word - rejects_in_word;
663 stats_.doc_good_char_quality += accepted_all_char_quality;
664 }
665 check_debug_pt(word, 80);
667 (blob_quality == 0) && (outline_errs >= chars_in_word))
669 check_debug_pt(word, 90);
670 page_res_it.forward();
671 }
672
674 tprintf
675 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
676 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
677 page_res->char_count, page_res->rej_count,
678 page_res->rej_count / static_cast<float>(page_res->char_count),
679 stats_.doc_blob_quality,
680 stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
681 stats_.doc_outline_errs,
682 stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
683 stats_.doc_char_quality,
684 stats_.doc_char_quality / static_cast<float>(page_res->char_count),
686 (stats_.good_char_count > 0) ?
687 (stats_.doc_good_char_quality /
688 static_cast<float>(stats_.good_char_count)) : 0.0);
689 }
690 bool good_quality_doc =
691 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
693 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
695 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
697 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
699
700 // ****************** Pass 6 *******************
701 // Do whole document or whole block rejection pass
704 quality_based_rejection(page_res_it, good_quality_doc);
705 }
706}
#define LOC_DOC_BLK_REJ
Definition: errcode.h:52
#define LOC_MM_ADAPT
Definition: errcode.h:51
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:138
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
void rej_word_bad_quality()
Definition: rejctmap.cpp:415

◆ repeated_nonalphanum_wd()

bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES * word,
ROW * row 
)

Definition at line 583 of file reject.cpp.

583 {
584 int16_t char_quality;
585 int16_t accepted_char_quality;
586
587 if (word->best_choice->unichar_lengths().length() <= 1)
588 return false;
589
591 contains(word->best_choice->unichar_string()[0]))
592 return false;
593
594 UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595 for (int i = 1; i < word->best_choice->length(); ++i) {
596 if (word->best_choice->unichar_id(i) != uch_id) return false;
597 }
598
599 word_char_quality(word, row, &char_quality, &accepted_char_quality);
600
601 if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602 (char_quality == accepted_char_quality))
603 return true;
604 else
605 return false;
606}
int32_t length() const
Definition: strngs.cpp:189

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Logs a bad box by line in the box file and box coords.

Definition at line 768 of file applybox.cpp.

769 {
770 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
771 boxfile_lineno + 1, box_ch,
772 box.left(), box.bottom(), box.right(), box.top(), err_msg);
773}

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES * word,
WERD_RES * new_word 
)

Definition at line 1462 of file control.cpp.

1463 {
1464 tprintf("New XHT Match:%s = %s ",
1466 word->best_choice->debug_string().string());
1467 word->reject_map.print(debug_fp);
1468 tprintf(" -> %s = %s ",
1469 new_word->best_choice->unichar_string().string(),
1470 new_word->best_choice->debug_string().string());
1471 new_word->reject_map.print(debug_fp);
1472 tprintf(" %s->%s %s %s\n",
1473 word->guessed_x_ht ? "GUESS" : "CERT",
1474 new_word->guessed_x_ht ? "GUESS" : "CERT",
1475 new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1476 accept_new_word ? "ACCEPTED" : "");
1477}
bool guessed_x_ht
Definition: pageres.h:313

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res)

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 506 of file applybox.cpp.

506 {
507 PAGE_RES_IT pr_it(page_res);
508 WERD_RES* word_res;
509 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
510 const WERD* word = word_res->word;
511 if (word->text() == nullptr || word->text()[0] == '\0')
512 continue; // Ignore words that have no text.
513 // Convert the correct text to a vector of UNICHAR_ID
514 GenericVector<UNICHAR_ID> target_text;
515 if (!ConvertStringToUnichars(word->text(), &target_text)) {
516 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
517 word->text());
518 pr_it.DeleteCurrentWord();
519 continue;
520 }
521 if (!FindSegmentation(target_text, word_res)) {
522 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
523 word->text());
524 pr_it.DeleteCurrentWord();
525 continue;
526 }
527 }
528}
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:534
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:561
const char * text() const
Definition: werd.h:114

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 329 of file applybox.cpp.

331 {
332 if (applybox_debug > 1) {
333 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
334 }
335 PAGE_RES_IT page_res_it(page_res);
336 WERD_RES* word_res;
337 for (word_res = page_res_it.word(); word_res != nullptr;
338 word_res = page_res_it.forward()) {
339 if (!word_res->box_word->bounding_box().major_overlap(box))
340 continue;
341 if (applybox_debug > 1) {
342 tprintf("Checking word box:");
343 word_res->box_word->bounding_box().print();
344 }
345 int word_len = word_res->box_word->length();
346 for (int i = 0; i < word_len; ++i) {
347 TBOX char_box = TBOX();
348 int blob_count = 0;
349 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
350 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
351 if (!blob_box.major_overlap(box))
352 break;
353 if (word_res->correct_text[i + blob_count].length() > 0)
354 break; // Blob is claimed already.
355 if (next_box != nullptr) {
356 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
357 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
358 if (applybox_debug > 2) {
359 tprintf("Checking blob:");
360 blob_box.print();
361 tprintf("Current miss metric = %g, next = %g\n",
362 current_box_miss_metric, next_box_miss_metric);
363 }
364 if (current_box_miss_metric > next_box_miss_metric)
365 break; // Blob is a better match for next box.
366 }
367 char_box += blob_box;
368 }
369 if (blob_count > 0) {
370 if (applybox_debug > 1) {
371 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
372 }
373 if (!char_box.almost_equal(box, 3) &&
374 ((next_box != nullptr && box.x_gap(*next_box) < -3)||
375 (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
376 return false;
377 }
378 // We refine just the box_word, best_state and correct_text here.
379 // The rebuild_word is made in TidyUp.
380 // blob_count blobs are put together to match the box. Merge the
381 // box_word boxes, save the blob_count in the state and the text.
382 word_res->box_word->MergeBoxes(i, i + blob_count);
383 word_res->best_state[i] = blob_count;
384 word_res->correct_text[i] = correct_text;
385 if (applybox_debug > 2) {
386 tprintf("%d Blobs match: blob box:", blob_count);
387 word_res->box_word->BlobBox(i).print();
388 tprintf("Matches box:");
389 box.print();
390 if (next_box != nullptr) {
391 tprintf("With next box:");
392 next_box->print();
393 }
394 }
395 // Eliminated best_state and correct_text entries for the consumed
396 // blobs.
397 for (int j = 1; j < blob_count; ++j) {
398 word_res->best_state.remove(i + 1);
399 word_res->correct_text.remove(i + 1);
400 }
401 // Assume that no box spans multiple source words, so we are done with
402 // this box.
403 if (applybox_debug > 1) {
404 tprintf("Best state = ");
405 for (int j = 0; j < word_res->best_state.size(); ++j) {
406 tprintf("%d ", word_res->best_state[j]);
407 }
408 tprintf("\n");
409 tprintf("Correct text = [[ ");
410 for (int j = 0; j < word_res->correct_text.size(); ++j) {
411 tprintf("%s ", word_res->correct_text[j].string());
412 }
413 tprintf("]]\n");
414 }
415 return true;
416 }
417 }
418 }
419 if (applybox_debug > 0) {
420 tprintf("FAIL!\n");
421 }
422 return false; // Failure.
423}
void remove(int index)
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
const TBOX & bounding_box() const
Definition: boxword.h:80
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
int x_gap(const TBOX &box) const
Definition: rect.h:225

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 431 of file applybox.cpp.

433 {
434 if (applybox_debug > 1) {
435 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
436 }
437 WERD* new_word = nullptr;
438 BLOCK_IT b_it(block_list);
439 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
440 BLOCK* block = b_it.data();
441 if (!box.major_overlap(block->pdblk.bounding_box()))
442 continue;
443 ROW_IT r_it(block->row_list());
444 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
445 ROW* row = r_it.data();
446 if (!box.major_overlap(row->bounding_box()))
447 continue;
448 WERD_IT w_it(row->word_list());
449 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
450 WERD* word = w_it.data();
451 if (applybox_debug > 2) {
452 tprintf("Checking word:");
453 word->bounding_box().print();
454 }
455 if (word->text() != nullptr && word->text()[0] != '\0')
456 continue; // Ignore words that are already done.
457 if (!box.major_overlap(word->bounding_box()))
458 continue;
459 C_BLOB_IT blob_it(word->cblob_list());
460 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
461 blob_it.forward()) {
462 C_BLOB* blob = blob_it.data();
463 TBOX blob_box = blob->bounding_box();
464 if (!blob_box.major_overlap(box))
465 continue;
466 if (next_box != nullptr) {
467 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
468 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
469 if (applybox_debug > 2) {
470 tprintf("Checking blob:");
471 blob_box.print();
472 tprintf("Current miss metric = %g, next = %g\n",
473 current_box_miss_metric, next_box_miss_metric);
474 }
475 if (current_box_miss_metric > next_box_miss_metric)
476 continue; // Blob is a better match for next box.
477 }
478 if (applybox_debug > 2) {
479 tprintf("Blob match: blob:");
480 blob_box.print();
481 tprintf("Matches box:");
482 box.print();
483 if (next_box != nullptr) {
484 tprintf("With next box:");
485 next_box->print();
486 }
487 }
488 if (new_word == nullptr) {
489 // Make a new word with a single blob.
490 new_word = word->shallow_copy();
491 new_word->set_text(correct_text);
492 w_it.add_to_end(new_word);
493 }
494 C_BLOB_IT new_blob_it(new_word->cblob_list());
495 new_blob_it.add_to_end(blob_it.extract());
496 }
497 }
498 }
499 }
500 if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
501 return new_word != nullptr;
502}
WERD_LIST * word_list()
Definition: ocrrow.h:55
TBOX bounding_box() const
Definition: ocrrow.h:88
WERD * shallow_copy()
Definition: werd.cpp:334
void set_text(const char *new_text)
Definition: werd.h:115

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 587 of file tesseractclass.cpp.

587 {
589 for (int i = 0; i < sub_langs_.size(); ++i) {
590 sub_langs_[i]->ResetAdaptiveClassifierInternal();
591 }
592}
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 597 of file tesseractclass.cpp.

597 {
599 for (int i = 0; i < sub_langs_.size(); ++i) {
600 sub_langs_[i]->getDict().ResetDocumentDictionary();
601 }
602}
void ResetDocumentDictionary()
Definition: dict.h:326

◆ reskew()

const FCOORD & tesseract::Tesseract::reskew ( ) const
inline

Definition at line 194 of file tesseractclass.h.

194 {
195 return reskew_;
196 }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 904 of file control.cpp.

907 {
908 if (debug) {
909 tprintf("Trying word using lang %s, oem %d\n",
910 lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
911 }
912 // Run the recognizer on the word.
913 PointerVector<WERD_RES> new_words;
914 (this->*recognizer)(word_data, in_word, &new_words);
915 if (new_words.empty()) {
916 // Transfer input word to new_words, as the classifier must have put
917 // the result back in the input.
918 new_words.push_back(*in_word);
919 *in_word = nullptr;
920 }
921 if (debug) {
922 for (int i = 0; i < new_words.size(); ++i)
923 new_words[i]->DebugTopChoice("Lang result");
924 }
925 // Initial version is a bit of a hack based on better certainty and rating
926 // or a dictionary vs non-dictionary word.
927 return SelectBestWords(classify_max_rating_ratio,
929 debug, &new_words, best_words);
930}
double classify_max_rating_ratio
Definition: classify.h:438
double classify_max_certainty_margin
Definition: classify.h:440

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 275 of file tesseractclass.h.

275 {
276 return right_to_left_;
277 }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

◆ safe_dict_word()

int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES *  werd_res)

Definition at line 608 of file reject.cpp.

608 {
609 const WERD_CHOICE &word = *werd_res->best_choice;
610 int dict_word_type = werd_res->tesseract->dict_word(word);
611 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
612}

◆ scaled_color()

Pix * tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 258 of file tesseractclass.h.

258 {
259 return scaled_color_;
260 }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 261 of file tesseractclass.h.

261 {
262 return scaled_factor_;
263 }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES *  page_res)

Definition at line 734 of file control.cpp.

734 {
735 PAGE_RES_IT page_res_it(page_res);
736 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
737 page_res_it.forward()) {
738 WERD_RES* word = page_res_it.word();
739 if (word->word->flag(W_REP_CHAR)) {
740 page_res_it.forward();
741 continue;
742 }
743 const float x_height = page_res_it.block()->block->x_height();
744 float word_x_height = word->x_height;
745 if (word_x_height < word->best_choice->min_x_height() ||
746 word_x_height > word->best_choice->max_x_height()) {
747 word_x_height = (word->best_choice->min_x_height() +
748 word->best_choice->max_x_height()) / 2.0f;
749 }
750 // Test for small caps. Word capheight must be close to block xheight,
751 // and word must contain no lower case letters, and at least one upper case.
752 const double small_cap_xheight = x_height * kXHeightCapRatio;
753 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
754 if (word->uch_set->script_has_xheight() &&
755 small_cap_xheight - small_cap_delta <= word_x_height &&
756 word_x_height <= small_cap_xheight + small_cap_delta) {
757 // Scan for upper/lower.
758 int num_upper = 0;
759 int num_lower = 0;
760 for (int i = 0; i < word->best_choice->length(); ++i) {
761 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
762 ++num_upper;
763 else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
764 ++num_lower;
765 }
766 if (num_upper > 0 && num_lower == 0)
767 word->small_caps = true;
768 }
769 word->SetScriptPositions();
770 }
771}
static const double kXHeightCapRatio
Definition: ccstruct.h:37
void SetScriptPositions()
Definition: pageres.cpp:858
bool small_caps
Definition: pageres.h:306
float min_x_height() const
Definition: ratngs.h:326
float max_x_height() const
Definition: ratngs.h:329

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters
choices is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
choices_pos
choices_length
target_text
text_index
rating
segmentation
best_rating
best_segmentation

Definition at line 631 of file applybox.cpp.

637 {
639 for (int length = 1; length <= choices[choices_pos].size(); ++length) {
640 // Rating of matching choice or worst choice if no match.
641 float choice_rating = 0.0f;
642 // Find the corresponding best BLOB_CHOICE.
643 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
644 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
645 choice_it.forward()) {
646 const BLOB_CHOICE* choice = choice_it.data();
647 choice_rating = choice->rating();
648 UNICHAR_ID class_id = choice->unichar_id();
649 if (class_id == target_text[text_index]) {
650 break;
651 }
652 // Search ambigs table.
653 if (class_id < table.size() && table[class_id] != nullptr) {
654 AmbigSpec_IT spec_it(table[class_id]);
655 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
656 spec_it.forward()) {
657 const AmbigSpec *ambig_spec = spec_it.data();
658 // We'll only do 1-1.
659 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
660 ambig_spec->correct_ngram_id == target_text[text_index])
661 break;
662 }
663 if (!spec_it.cycled_list())
664 break; // Found an ambig.
665 }
666 }
667 if (choice_it.cycled_list())
668 continue; // No match.
669 segmentation->push_back(length);
670 if (choices_pos + length == choices_length &&
671 text_index + 1 == target_text.size()) {
672 // This is a complete match. If the rating is good record a new best.
673 if (applybox_debug > 2) {
674 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
675 rating + choice_rating, *best_rating, segmentation->size(),
676 best_segmentation->size());
677 }
678 if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
679 *best_segmentation = *segmentation;
680 *best_rating = rating + choice_rating;
681 }
682 } else if (choices_pos + length < choices_length &&
683 text_index + 1 < target_text.size()) {
684 if (applybox_debug > 3) {
685 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
686 target_text[text_index],
687 unicharset.id_to_unichar(target_text[text_index]),
688 choice_it.data()->unichar_id() == target_text[text_index]
689 ? "Match" : "Ambig",
690 choices_pos, length);
691 }
692 SearchForText(choices, choices_pos + length, choices_length, target_text,
693 text_index + 1, rating + choice_rating, segmentation,
694 best_rating, best_segmentation);
695 if (applybox_debug > 3) {
696 tprintf("End recursion for %d=%s\n", target_text[text_index],
697 unicharset.id_to_unichar(target_text[text_index]));
698 }
699 }
700 segmentation->truncate(segmentation->size() - 1);
701 }
702}
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:134
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 259 of file linerec.cpp.

259 {
260 // Run the segmentation search on the network outputs and make a BoxWord
261 // for each of the output words.
262 // If we drop a word as junk, then there is always a space in front of the
263 // next.
264 const Dict* stopper_dict = lstm_recognizer_->GetDict();
265 if (stopper_dict == nullptr) stopper_dict = &getDict();
266 bool any_nonspace_delimited = false;
267 for (int w = 0; w < words->size(); ++w) {
268 WERD_RES* word = (*words)[w];
269 if (word->best_choice != nullptr &&
271 any_nonspace_delimited = true;
272 break;
273 }
274 }
275 for (int w = 0; w < words->size(); ++w) {
276 WERD_RES* word = (*words)[w];
277 if (word->best_choice == nullptr) {
278 // It is a dud.
279 word->SetupFake(lstm_recognizer_->GetUnicharset());
280 } else {
281 // Set the best state.
282 for (int i = 0; i < word->best_choice->length(); ++i) {
283 int length = word->best_choice->state(i);
284 word->best_state.push_back(length);
285 }
286 word->reject_map.initialise(word->best_choice->length());
287 word->tess_failed = false;
288 word->tess_accepted = true;
289 word->tess_would_adapt = false;
290 word->done = true;
291 word->tesseract = this;
292 float word_certainty = std::min(word->space_certainty,
293 word->best_choice->certainty());
294 word_certainty *= kCertaintyScale;
295 if (getDict().stopper_debug_level >= 1) {
296 tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
297 word->best_choice->certainty(), word->space_certainty,
298 std::min(word->space_certainty, word->best_choice->certainty()) *
300 word_certainty);
301 word->best_choice->print();
302 }
303 word->best_choice->set_certainty(word_certainty);
304
305 word->tess_accepted = stopper_dict->AcceptableResult(word);
306 }
307 }
308}
float space_certainty
Definition: pageres.h:321
void set_certainty(float new_val)
Definition: ratngs.h:362
int state(int index) const
Definition: ratngs.h:309
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:504
void print() const
Definition: ratngs.h:570

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const STRING *  input_file,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 99 of file pagesegmain.cpp.

100 {
101 ASSERT_HOST(pix_binary_ != nullptr);
102 int width = pixGetWidth(pix_binary_);
103 int height = pixGetHeight(pix_binary_);
104 // Get page segmentation mode.
105 auto pageseg_mode = static_cast<PageSegMode>(
106 static_cast<int>(tessedit_pageseg_mode));
107 // If a UNLV zone file can be found, use that instead of segmentation.
108 if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
109 input_file != nullptr && input_file->length() > 0) {
110 STRING name = *input_file;
111 const char* lastdot = strrchr(name.string(), '.');
112 if (lastdot != nullptr)
113 name[lastdot - name.string()] = '\0';
114 read_unlv_file(name, width, height, blocks);
115 }
116 if (blocks->empty()) {
117 // No UNLV file present. Work according to the PageSegMode.
118 // First make a single block covering the whole image.
119 BLOCK_IT block_it(blocks);
120 auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
121 block->set_right_to_left(right_to_left());
122 block_it.add_to_end(block);
123 } else {
124 // UNLV file present. Use PSM_SINGLE_BLOCK.
125 pageseg_mode = PSM_SINGLE_BLOCK;
126 }
127 // The diacritic_blobs holds noise blobs that may be diacritics. They
128 // are separated out on areas of the image that seem noisy and short-circuit
129 // the layout process, going straight from the initial partition creation
130 // right through to after word segmentation, where they are added to the
131 // rej_cblobs list of the most appropriate word. From there classification
132 // will determine whether they are used.
133 BLOBNBOX_LIST diacritic_blobs;
134 int auto_page_seg_ret_val = 0;
135 TO_BLOCK_LIST to_blocks;
136 if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
137 PSM_SPARSE(pageseg_mode)) {
138 auto_page_seg_ret_val = AutoPageSeg(
139 pageseg_mode, blocks, &to_blocks,
140 enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
141 if (pageseg_mode == PSM_OSD_ONLY)
142 return auto_page_seg_ret_val;
143 // To create blobs from the image region bounds uncomment this line:
144 // to_blocks.clear(); // Uncomment to go back to the old mode.
145 } else {
146 deskew_ = FCOORD(1.0f, 0.0f);
147 reskew_ = FCOORD(1.0f, 0.0f);
148 if (pageseg_mode == PSM_CIRCLE_WORD) {
149 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
150 if (pixcleaned != nullptr) {
151 pixDestroy(&pix_binary_);
152 pix_binary_ = pixcleaned;
153 }
154 }
155 }
156
157 if (auto_page_seg_ret_val < 0) {
158 return -1;
159 }
160
161 if (blocks->empty()) {
163 tprintf("Empty page\n");
164 return 0; // AutoPageSeg found an empty page.
165 }
166 bool splitting =
168 bool cjk_mode = textord_use_cjk_fp_model;
169
170 textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
171 pix_thresholds_, pix_grey_, splitting || cjk_mode,
172 &diacritic_blobs, blocks, &to_blocks);
173 return auto_page_seg_ret_val;
174}
bool read_unlv_file(STRING name, int32_t xsize, int32_t ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:32
int textord_debug_tabfind
Definition: alignedblob.cpp:27
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
@ PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
Definition: publictypes.h:175
@ PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:164
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:197
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:200
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:203
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:226

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
const GenericVector< C_OUTLINE * > &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1140 of file control.cpp.

1143 {
1144 STRING best_str;
1145 float target_cert = certainty_threshold;
1146 if (blob != nullptr) {
1147 float target_c2;
1148 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1149 if (debug_noise_removal) {
1150 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1151 target_cert, target_c2);
1152 blob->bounding_box().print();
1153 }
1154 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1155 }
1156 GenericVector<bool> test_outlines = *ok_outlines;
1157 // Start with all the outlines in.
1158 STRING all_str;
1159 GenericVector<bool> best_outlines = *ok_outlines;
1160 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1161 pr_it, blob, &all_str);
1162 if (debug_noise_removal) {
1163 TBOX ol_box;
1164 for (int i = 0; i < test_outlines.size(); ++i) {
1165 if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1166 }
1167 tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1168 all_str.string(), best_cert, best_cert - target_cert);
1169 ol_box.print();
1170 }
1171 // Iteratively zero out the bit that improves the certainty the most, until
1172 // we get past the threshold, have zero bits, or fail to improve.
1173 int best_index = 0; // To zero out.
1174 while (num_outlines > 1 && best_index >= 0 &&
1175 (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1176 // Find the best bit to zero out.
1177 best_index = -1;
1178 for (int i = 0; i < outlines.size(); ++i) {
1179 if (test_outlines[i]) {
1180 test_outlines[i] = false;
1181 STRING str;
1182 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1183 pr_it, blob, &str);
1184 if (debug_noise_removal) {
1185 TBOX ol_box;
1186 for (int j = 0; j < outlines.size(); ++j) {
1187 if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1188 tprintf("%d", test_outlines[j]);
1189 }
1190 tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1191 cert, cert - target_cert);
1192 ol_box.print();
1193 }
1194 if (cert > best_cert) {
1195 best_cert = cert;
1196 best_index = i;
1197 best_outlines = test_outlines;
1198 }
1199 test_outlines[i] = true;
1200 }
1201 }
1202 if (best_index >= 0) {
1203 test_outlines[best_index] = false;
1204 --num_outlines;
1205 }
1206 }
1207 if (best_cert >= target_cert) {
1208 // Save the best combination.
1209 *ok_outlines = best_outlines;
1210 if (debug_noise_removal) {
1211 tprintf("%s noise combination ", blob ? "Adding" : "New");
1212 for (int i = 0; i < best_outlines.size(); ++i) {
1213 tprintf("%d", best_outlines[i]);
1214 }
1215 tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1216 target_cert);
1217 }
1218 return true;
1219 }
1220
1221 return false;
1222}
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1226

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES *  word,
int16_t  pass 
)

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 208 of file tesseractclass.h.

208 {
209 pixDestroy(&pix_grey_);
210 pix_grey_ = grey_pix;
211 }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Pix *  original_pix)
inline

Definition at line 216 of file tesseractclass.h.

216 {
217 pixDestroy(&pix_original_);
218 pix_original_ = original_pix;
219 // Clone to sublangs as well.
220 for (int i = 0; i < sub_langs_.size(); ++i) {
221 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
222 : nullptr);
223 }
224 }

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 242 of file tesseractclass.h.

242 {
243 pixDestroy(&pix_thresholds_);
244 pix_thresholds_ = thresholds;
245 }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 249 of file tesseractclass.h.

249 {
250 source_resolution_ = ppi;
251 }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES *  word)

Definition at line 273 of file output.cpp.

273 {
274 int len = word_res->reject_map.length();
275 const WERD_CHOICE &word = *(word_res->best_choice);
276 const UNICHARSET &uchset = *word.unicharset();
277 int i;
278 float rating_per_ch;
279
280 if (suspect_level == 0) {
281 for (i = 0; i < len; i++) {
282 if (word_res->reject_map[i].rejected())
283 word_res->reject_map[i].setrej_minimal_rej_accept();
284 }
285 return;
286 }
287
288 if (suspect_level >= 3)
289 return; //Use defaults
290
291 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
292
293 if (safe_dict_word(word_res) &&
295 /* Unreject alphas in dictionary words */
296 for (i = 0; i < len; ++i) {
297 if (word_res->reject_map[i].rejected() &&
298 uchset.get_isalpha(word.unichar_id(i)))
299 word_res->reject_map[i].setrej_minimal_rej_accept();
300 }
301 }
302
303 rating_per_ch = word.rating() / word_res->reject_map.length();
304
305 if (rating_per_ch >= suspect_rating_per_ch)
306 return; // Don't touch bad ratings
307
308 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
309 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
310 for (i = 0; i < len; ++i) {
311 if (word_res->reject_map[i].rejected() &&
312 (!uchset.eq(word.unichar_id(i), " ")))
313 word_res->reject_map[i].setrej_minimal_rej_accept();
314 }
315 }
316
317 for (i = 0; i < len; i++) {
318 if (word_res->reject_map[i].rejected()) {
319 if (word_res->reject_map[i].flag(R_DOC_REJ))
320 word_res->reject_map[i].setrej_minimal_rej_accept();
321 if (word_res->reject_map[i].flag(R_BLOCK_REJ))
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 if (word_res->reject_map[i].flag(R_ROW_REJ))
324 word_res->reject_map[i].setrej_minimal_rej_accept();
325 }
326 }
327
328 if (suspect_level == 2)
329 return;
330
332 (word_res->reject_map.length() <= suspect_short_words)) {
333 for (i = 0; i < len; i++) {
334 if (word_res->reject_map[i].rejected()) {
335 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
336 word_res->reject_map[i].flag(R_POSTNN_1IL)))
337 word_res->reject_map[i].setrej_minimal_rej_accept();
338
340 word_res->reject_map[i].flag(R_MM_REJECT))
341 word_res->reject_map[i].setrej_minimal_rej_accept();
342 }
343 }
344 }
345
346 if (acceptable_word_string(*word_res->uch_set,
347 word.unichar_string().string(),
348 word.unichar_lengths().string()) !=
351 word.unichar_lengths().string())) {
352 if (word_res->reject_map.length() > suspect_short_words) {
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() &&
355 (!word_res->reject_map[i].perm_rejected() ||
356 word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
357 word_res->reject_map[i].flag (R_POSTNN_1IL) ||
358 word_res->reject_map[i].flag (R_MM_REJECT))) {
359 word_res->reject_map[i].setrej_minimal_rej_accept();
360 }
361 }
362 }
363 }
364}
@ R_MM_REJECT
Definition: rejctmap.h:57
@ R_ROW_REJ
Definition: rejctmap.h:79
@ R_BLOCK_REJ
Definition: rejctmap.h:78
@ R_1IL_CONFLICT
Definition: rejctmap.h:54
@ R_POSTNN_1IL
Definition: rejctmap.h:55
@ R_DOC_REJ
Definition: rejctmap.h:77
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:387
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:366

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES *  word)

set_word_fonts

Get the fonts for the word.

Definition at line 1962 of file control.cpp.

1962 {
1963 // Don't try to set the word fonts for an lstm word, as the configs
1964 // will be meaningless.
1965 if (word->chopped_word == nullptr) return;
1966 ASSERT_HOST(word->best_choice != nullptr);
1967
1968#ifndef DISABLED_LEGACY_ENGINE
1969 const int fontinfo_size = get_fontinfo_table().size();
1970 if (fontinfo_size == 0) return;
1971 GenericVector<int> font_total_score;
1972 font_total_score.init_to_size(fontinfo_size, 0);
1973
1974 // Compute the font scores for the word
1976 tprintf("Examining fonts in %s\n",
1977 word->best_choice->debug_string().string());
1978 }
1979 for (int b = 0; b < word->best_choice->length(); ++b) {
1980 const BLOB_CHOICE* choice = word->GetBlobChoice(b);
1981 if (choice == nullptr) continue;
1982 const GenericVector<ScoredFont>& fonts = choice->fonts();
1983 for (int f = 0; f < fonts.size(); ++f) {
1984 const int fontinfo_id = fonts[f].fontinfo_id;
1985 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1986 font_total_score[fontinfo_id] += fonts[f].score;
1987 }
1988 }
1989 }
1990 // Find the top and 2nd choice for the word.
1991 int score1 = 0, score2 = 0;
1992 int16_t font_id1 = -1, font_id2 = -1;
1993 for (int f = 0; f < fontinfo_size; ++f) {
1994 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1995 tprintf("Font %s, total score = %d\n",
1996 fontinfo_table_.get(f).name, font_total_score[f]);
1997 }
1998 if (font_total_score[f] > score1) {
1999 score2 = score1;
2000 font_id2 = font_id1;
2001 score1 = font_total_score[f];
2002 font_id1 = f;
2003 } else if (font_total_score[f] > score2) {
2004 score2 = font_total_score[f];
2005 font_id2 = f;
2006 }
2007 }
2008 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2009 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2010 // Each score has a limit of UINT16_MAX, so divide by that to get the number
2011 // of "votes" for that font, ie number of perfect scores.
2012 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2013 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2014 if (score1 > 0) {
2015 const FontInfo fi = fontinfo_table_.get(font_id1);
2017 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2018 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2019 fi.name, word->fontinfo_id_count,
2020 fontinfo_table_.get(font_id2).name,
2021 word->fontinfo_id2_count);
2022 } else {
2023 tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2024 fi.name, word->fontinfo_id_count);
2025 }
2026 }
2027 }
2028#endif // ndef DISABLED_LEGACY_ENGINE
2029}
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:93
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 604 of file tesseractclass.cpp.

604 {
605 // Set the white and blacklists (if any)
609 if (lstm_recognizer_) {
610 UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
611 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
614 }
615 // Black and white lists should apply to all loaded classifiers.
616 for (int i = 0; i < sub_langs_.size(); ++i) {
617 sub_langs_[i]->unicharset.set_black_and_whitelist(
620 if (sub_langs_[i]->lstm_recognizer_) {
621 UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
622 lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
625 }
626 }
627}
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect *  detector)

Definition at line 581 of file tesseractclass.cpp.

581 {
582 equ_detect_ = detector;
583 equ_detect_->SetLangTesseract(this);
584}
void SetLangTesseract(Tesseract *lang_tesseract)

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 264 of file tesseractclass.h.

264 {
265 scaled_factor_ = factor;
266 scaled_color_ = color;
267 }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX *  target_word_box,
const char *  word_config,
PAGE_RES *  page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 154 of file control.cpp.

158 {
159 // Prepare all the words.
160 PAGE_RES_IT page_res_it(page_res);
161 for (page_res_it.restart_page(); page_res_it.word() != nullptr;
162 page_res_it.forward()) {
163 if (target_word_box == nullptr ||
164 ProcessTargetWord(page_res_it.word()->word->bounding_box(),
165 *target_word_box, word_config, 1)) {
166 words->push_back(WordData(page_res_it));
167 }
168 }
169 // Setup all the words for recognition with polygonal approximation.
170 for (int w = 0; w < words->size(); ++w) {
171 SetupWordPassN(pass_n, &(*words)[w]);
172 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
173 }
174}

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 207 of file applybox.cpp.

208 {
209 PreenXHeights(block_list);
210 // Strip all fuzzy space markers to simplify the PAGE_RES.
211 BLOCK_IT b_it(block_list);
212 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213 BLOCK* block = b_it.data();
214 ROW_IT r_it(block->row_list());
215 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216 ROW* row = r_it.data();
217 WERD_IT w_it(row->word_list());
218 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219 WERD* word = w_it.data();
220 if (word->cblob_list()->empty()) {
221 delete w_it.extract();
222 } else {
223 word->set_flag(W_FUZZY_SP, false);
224 word->set_flag(W_FUZZY_NON, false);
225 }
226 }
227 }
228 }
229 auto* page_res = new PAGE_RES(false, block_list, nullptr);
230 PAGE_RES_IT pr_it(page_res);
231 WERD_RES* word_res;
232 while ((word_res = pr_it.word()) != nullptr) {
233 MaximallyChopWord(boxes, pr_it.block()->block,
234 pr_it.row()->row, word_res);
235 pr_it.forward();
236 }
237 return page_res;
238}
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:181
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:243

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the Leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 270 of file pagesegmain.cpp.

273 {
274 int vertical_x = 0;
275 int vertical_y = 1;
276 TabVector_LIST v_lines;
277 TabVector_LIST h_lines;
278 ICOORD bleft(0, 0);
279
280 ASSERT_HOST(pix_binary_ != nullptr);
282 pixa_debug_.AddPix(pix_binary_, "PageSegInput");
283 }
284 // Leptonica is used to find the rule/separator lines in the input.
285 LineFinder::FindAndRemoveLines(source_resolution_,
286 textord_tabfind_show_vlines, pix_binary_,
287 &vertical_x, &vertical_y, music_mask_pix,
288 &v_lines, &h_lines);
290 pixa_debug_.AddPix(pix_binary_, "NoLines");
291 }
292 // Leptonica is used to find a mask of the photo regions in the input.
293 *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
295 pixa_debug_.AddPix(pix_binary_, "NoImages");
296 }
297 if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
298
299 // The rest of the algorithm uses the usual connected components.
300 textord_.find_components(pix_binary_, blocks, to_blocks);
301
302 TO_BLOCK_IT to_block_it(to_blocks);
303 // There must be exactly one input block.
304 // TODO(rays) handle new textline finding with a UNLV zone file.
305 ASSERT_HOST(to_blocks->singleton());
306 TO_BLOCK* to_block = to_block_it.data();
307 TBOX blkbox = to_block->block->pdblk.bounding_box();
308 ColumnFinder* finder = nullptr;
309 int estimated_resolution = source_resolution_;
310 if (source_resolution_ == kMinCredibleResolution) {
311 // Try to estimate resolution from typical body text size.
313 if (res > estimated_resolution && res < kMaxCredibleResolution) {
314 estimated_resolution = res;
315 tprintf("Estimating resolution as %d\n", estimated_resolution);
316 }
317 }
318
319 if (to_block->line_size >= 2) {
320 finder = new ColumnFinder(static_cast<int>(to_block->line_size),
321 blkbox.botleft(), blkbox.topright(),
322 estimated_resolution, textord_use_cjk_fp_model,
324 &h_lines, vertical_x, vertical_y);
325
326 finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
327
328#ifndef DISABLED_LEGACY_ENGINE
329
330 if (equ_detect_) {
331 equ_detect_->LabelSpecialText(to_block);
332 }
333
334 BLOBNBOX_CLIST osd_blobs;
335 // osd_orientation is the number of 90 degree rotations to make the
336 // characters upright. (See osdetect.h for precise definition.)
337 // We want the text lines horizontal, (vertical text indicates vertical
338 // textlines) which may conflict (eg vertically written CJK).
339 int osd_orientation = 0;
340 bool vertical_text = textord_tabfind_force_vertical_text ||
341 pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
342 if (!vertical_text && textord_tabfind_vertical_text &&
343 PSM_ORIENTATION_ENABLED(pageseg_mode)) {
344 vertical_text =
345 finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
346 to_block, &osd_blobs);
347 }
348 if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
349 GenericVector<int> osd_scripts;
350 if (osd_tess != this) {
351 // We are running osd as part of layout analysis, so constrain the
352 // scripts to those allowed by *this.
353 AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
354 for (int s = 0; s < sub_langs_.size(); ++s) {
355 AddAllScriptsConverted(sub_langs_[s]->unicharset,
356 osd_tess->unicharset, &osd_scripts);
357 }
358 }
359 os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
360 if (pageseg_mode == PSM_OSD_ONLY) {
361 delete finder;
362 return nullptr;
363 }
364 osd_orientation = osr->best_result.orientation_id;
365 double osd_score = osr->orientations[osd_orientation];
366 double osd_margin = min_orientation_margin * 2;
367 for (int i = 0; i < 4; ++i) {
368 if (i != osd_orientation &&
369 osd_score - osr->orientations[i] < osd_margin) {
370 osd_margin = osd_score - osr->orientations[i];
371 }
372 }
373 int best_script_id = osr->best_result.script_id;
374 const char* best_script_str =
375 osd_tess->unicharset.get_script_from_script_id(best_script_id);
376 bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
377 best_script_id == osd_tess->unicharset.hiragana_sid() ||
378 best_script_id == osd_tess->unicharset.katakana_sid() ||
379 strcmp("Japanese", best_script_str) == 0 ||
380 strcmp("Korean", best_script_str) == 0 ||
381 strcmp("Hangul", best_script_str) == 0;
382 if (cjk) {
383 finder->set_cjk_script(true);
384 }
385 if (osd_margin < min_orientation_margin) {
386 // The margin is weak.
387 if (!cjk && !vertical_text && osd_orientation == 2) {
388 // upside down latin text is improbable with such a weak margin.
389 tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
390 "Don't rotate.\n", osd_margin);
391 osd_orientation = 0;
392 } else {
393 tprintf(
394 "OSD: Weak margin (%.2f) for %d blob text block, "
395 "but using orientation anyway: %d\n",
396 osd_margin, osd_blobs.length(), osd_orientation);
397 }
398 }
399 }
400 osd_blobs.shallow_clear();
401 finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
402
403#endif // ndef DISABLED_LEGACY_ENGINE
404 }
405
406 return finder;
407}
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:278
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:45
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
@ PSM_SINGLE_BLOCK_VERT_TEXT
Assume a single uniform block of vertically aligned text.
Definition: publictypes.h:170
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:194
int LabelSpecialText(TO_BLOCK *to_block) override
int script_id
Definition: osdetect.h:44
int orientation_id
Definition: osdetect.h:43
OSBestResult best_result
Definition: osdetect.h:81
float orientations[4]
Definition: osdetect.h:76
BLOCK * block
Definition: blobbox.h:777
float line_size
Definition: blobbox.h:785
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
const ICOORD & topright() const
Definition: rect.h:104
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:62
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:243
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:219

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 431 of file tessedit.cpp.

431 {
432 // Note that we can get away with bitwise copying FontInfo in
433 // all_fonts, as it is a temporary structure and we avoid setting the
434 // delete callback.
435 UnicityTable<FontInfo> all_fonts;
437
438 // Create the universal ID table.
439 CollectFonts(get_fontinfo_table(), &all_fonts);
440 for (int i = 0; i < sub_langs_.size(); ++i) {
441 CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
442 }
443 // Assign ids from the table to each font table.
444 AssignIds(all_fonts, &get_fontinfo_table());
445 for (int i = 0; i < sub_langs_.size(); ++i) {
446 AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
447 }
448 font_table_size_ = all_fonts.size();
449}
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData *  word 
)

Definition at line 177 of file control.cpp.

177 {
178 if (pass_n == 1 || !word->word->done) {
179 if (pass_n == 1) {
180 word->word->SetupForRecognition(unicharset, this, BestPix(),
185 word->row, word->block);
186 } else if (pass_n == 2) {
187 // TODO(rays) Should we do this on pass1 too?
188 word->word->caps_height = 0.0;
189 if (word->word->x_height == 0.0f)
190 word->word->x_height = word->row->x_height();
191 }
192 word->lang_words.truncate(0);
193 for (int s = 0; s <= sub_langs_.size(); ++s) {
194 // The sub_langs_.size() entry is for the master language.
195 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
196 auto* word_res = new WERD_RES;
197 word_res->InitForRetryRecognition(*word->word);
198 word->lang_words.push_back(word_res);
199 // LSTM doesn't get setup for pass2.
200 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
201 word_res->SetupForRecognition(
202 lang_t->unicharset, lang_t, BestPix(),
203 lang_t->tessedit_ocr_engine_mode, nullptr,
204 lang_t->classify_bln_numeric_mode,
205 lang_t->textord_use_cjk_fp_model,
206 lang_t->poly_allow_detailed_fx, word->row, word->block);
207 }
208 }
209 }
210}
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:277

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 246 of file tesseractclass.h.

246 {
247 return source_resolution_;
248 }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES *  word)

Definition at line 138 of file tfacepp.cpp.

138 {
139 // Find the biggest blob gap in the chopped_word.
140 int bestgap = -INT32_MAX;
141 int split_index = 0;
142 for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
143 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
144 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
145 int gap = blob_box.left() - prev_box.right();
146 if (gap > bestgap) {
147 bestgap = gap;
148 split_index = b;
149 }
150 }
151 ASSERT_HOST(split_index > 0);
152
153 WERD_RES *word2 = nullptr;
154 BlamerBundle *orig_bb = nullptr;
155 split_word(word, split_index, &word2, &orig_bb);
156
157 // Recognize the first part of the word.
159 // Recognize the second part of the word.
161
162 join_words(word, word2, orig_bb);
163}
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES *  word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 176 of file tfacepp.cpp.

179 {
180 ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
181
182 // Save a copy of the blamer bundle so we can try to reconstruct it below.
183 BlamerBundle *orig_bb =
184 word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
185
186 auto *word2 = new WERD_RES(*word);
187
188 // blow away the copied chopped_word, as we want to work with
189 // the blobs from the input chopped_word so seam_arrays can be merged.
190 TWERD *chopped = word->chopped_word;
191 auto *chopped2 = new TWERD;
192 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
193 for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
194 chopped2->blobs.push_back(chopped->blobs[i]);
195 }
196 chopped->blobs.truncate(split_pt);
197 word->chopped_word = nullptr;
198 delete word2->chopped_word;
199 word2->chopped_word = nullptr;
200
201 const UNICHARSET &unicharset = *word->uch_set;
202 word->ClearResults();
203 word2->ClearResults();
204 word->chopped_word = chopped;
205 word2->chopped_word = chopped2;
207 word2->SetupBasicsFromChoppedWord(unicharset);
208
209 // Try to adjust the blamer bundle.
210 if (orig_bb != nullptr) {
211 // TODO(rays) Looks like a leak to me.
212 // orig_bb should take, rather than copy.
213 word->blamer_bundle = new BlamerBundle();
214 word2->blamer_bundle = new BlamerBundle();
215 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
216 word2->chopped_word->blobs[0]->bounding_box().left(),
218 word->blamer_bundle, word2->blamer_bundle);
219 }
220
221 *right_piece = word2;
222 *orig_blamer_bundle = orig_bb;
223}
void reserve(int size)
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:177
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:343

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES *  word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 102 of file superscript.cpp.

102 {
103 if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
104 !word->best_choice) {
105 return false;
106 }
107 int num_leading, num_trailing;
108 ScriptPos sp_leading, sp_trailing;
109 float leading_certainty, trailing_certainty;
110 float avg_certainty, unlikely_threshold;
111
112 // Calculate the number of whole suspicious characters at the edges.
114 word, &num_leading, &sp_leading, &leading_certainty,
115 &num_trailing, &sp_trailing, &trailing_certainty,
116 &avg_certainty, &unlikely_threshold);
117
118 const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
119 const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
120
121 int num_blobs = word->best_choice->length();
122
123 // Calculate the remainder (partial characters) at the edges.
124 // This accounts for us having classified the best version of
125 // a word as [speaker?'] when it was instead [speaker.^{21}]
126 // (that is we accidentally thought the 2 was attached to the period).
127 int num_remainder_leading = 0, num_remainder_trailing = 0;
128 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
129 int super_y_bottom =
131 int sub_y_top =
133 int last_word_char = num_blobs - 1 - num_trailing;
134 float last_char_certainty = word->best_choice->certainty(last_word_char);
135 if (word->best_choice->unichar_id(last_word_char) != 0 &&
136 last_char_certainty <= unlikely_threshold) {
137 ScriptPos rpos;
138 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
139 nullptr, nullptr, &rpos, &num_remainder_trailing);
140 if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
141 if (num_remainder_trailing > 0 &&
142 last_char_certainty < trailing_certainty) {
143 trailing_certainty = last_char_certainty;
144 }
145 }
146 bool another_blob_available = (num_remainder_trailing == 0) ||
147 num_leading + num_trailing + 1 < num_blobs;
148 int first_char_certainty = word->best_choice->certainty(num_leading);
149 if (another_blob_available &&
150 word->best_choice->unichar_id(num_leading) != 0 &&
151 first_char_certainty <= unlikely_threshold) {
152 ScriptPos lpos;
153 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
154 &lpos, &num_remainder_leading, nullptr, nullptr);
155 if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
156 if (num_remainder_leading > 0 &&
157 first_char_certainty < leading_certainty) {
158 leading_certainty = first_char_certainty;
159 }
160 }
161 }
162
163 // If nothing to do, bail now.
164 if (num_leading + num_trailing +
165 num_remainder_leading + num_remainder_trailing == 0) {
166 return false;
167 }
168
169 if (superscript_debug >= 1) {
170 tprintf("Candidate for superscript detection: %s (",
172 if (num_leading || num_remainder_leading) {
173 tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
174 leading_pos);
175 }
176 if (num_trailing || num_remainder_trailing) {
177 tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
178 trailing_pos);
179 }
180 tprintf(")\n");
181 }
182 if (superscript_debug >= 3) {
183 word->best_choice->print();
184 }
185 if (superscript_debug >= 2) {
186 tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
187 avg_certainty, unlikely_threshold);
188 if (num_leading)
189 tprintf("Orig. leading (min): %.2f ", leading_certainty);
190 if (num_trailing)
191 tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
192 tprintf("\n");
193 }
194
195 // We've now calculated the number of rebuilt blobs we want to carve off.
196 // However, split_word() works from TBLOBs in chopped_word, so we need to
197 // convert to those.
198 int num_chopped_leading =
199 LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
200 int num_chopped_trailing =
201 TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good = false;
207 num_chopped_leading, leading_certainty, sp_leading,
208 num_chopped_trailing, trailing_certainty, sp_trailing,
209 word, &is_good, &retry_leading, &retry_trailing);
210 if (is_good) {
211 word->ConsumeWordResults(revised);
212 } else if (retry_leading || retry_trailing) {
213 int retry_chopped_leading =
214 LeadingUnicharsToChopped(revised, retry_leading);
215 int retry_chopped_trailing =
216 TrailingUnicharsToChopped(revised, retry_trailing);
217 WERD_RES *revised2 = TrySuperscriptSplits(
218 retry_chopped_leading, leading_certainty, sp_leading,
219 retry_chopped_trailing, trailing_certainty, sp_trailing,
220 revised, &is_good, &retry_leading, &retry_trailing);
221 if (is_good) {
222 word->ConsumeWordResults(revised2);
223 }
224 delete revised2;
225 }
226 delete revised;
227 return is_good;
228}
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:765

◆ terrible_word_crunch()

bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 503 of file docqual.cpp.

504 {
505 float rating_per_ch;
506 int adjusted_len;
507 int crunch_mode = 0;
508
509 if ((word->best_choice->unichar_string().length() == 0) ||
510 (strspn(word->best_choice->unichar_string().string(), " ") ==
512 crunch_mode = 1;
513 else {
514 adjusted_len = word->reject_map.length ();
515 if (adjusted_len > crunch_rating_max)
516 adjusted_len = crunch_rating_max;
517 rating_per_ch = word->best_choice->rating () / adjusted_len;
518
519 if (rating_per_ch > crunch_terrible_rating)
520 crunch_mode = 2;
521 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
522 crunch_mode = 3;
523 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
524 (garbage_level != G_OK))
525 crunch_mode = 4;
526 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
527 (garbage_level != G_OK))
528 crunch_mode = 5;
529 }
530 if (crunch_mode > 0) {
531 if (crunch_debug > 2) {
532 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
533 crunch_mode, word->best_choice->unichar_string().string());
534 }
535 return true;
536 }
537 else
538 return false;
539}
uint32_t unsigned_size() const
Definition: strngs.h:72

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES *  word)

Definition at line 62 of file tessbox.cpp.

62 {
63 return getDict().AcceptableResult(word);
64}
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:102

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE *  word_choice)

Definition at line 72 of file tessbox.cpp.

72 {
73 getDict().add_document_word(*word_choice);
74}
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:644

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES *  word 
)

Definition at line 32 of file tessbox.cpp.

32 {
33 int saved_enable_assoc = 0;
34 int saved_chop_enable = 0;
35
36 if (word->word->flag(W_DONT_CHOP)) {
37 saved_enable_assoc = wordrec_enable_assoc;
38 saved_chop_enable = chop_enable;
39 wordrec_enable_assoc.set_value(0);
40 chop_enable.set_value(0);
41 }
42 if (pass_n == 1)
43 set_pass1();
44 else
45 set_pass2();
46 recog_word(word);
47 if (word->best_choice == nullptr)
48 word->SetupFake(*word->uch_set);
49 if (word->word->flag(W_DONT_CHOP)) {
50 wordrec_enable_assoc.set_value(saved_enable_assoc);
51 chop_enable.set_value(saved_chop_enable);
52 }
53}
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:40
void set_pass1()
Definition: tface.cpp:101
void set_pass2()
Definition: tface.cpp:113
bool wordrec_enable_assoc
Definition: wordrec.h:198

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

Definition at line 1519 of file control.cpp.

1521 {
1522 bool accept_new_x_ht = false;
1523 WERD_RES new_x_ht_word(word->word);
1524 if (word->blamer_bundle != nullptr) {
1525 new_x_ht_word.blamer_bundle = new BlamerBundle();
1526 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1527 }
1528 new_x_ht_word.x_height = new_x_ht;
1529 new_x_ht_word.baseline_shift = baseline_shift;
1530 new_x_ht_word.caps_height = 0.0;
1531 new_x_ht_word.SetupForRecognition(
1532 unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1534 poly_allow_detailed_fx, row, block);
1535 match_word_pass_n(2, &new_x_ht_word, row, block);
1536 if (!new_x_ht_word.tess_failed) {
1537 int new_misfits = CountMisfitTops(&new_x_ht_word);
1538 if (debug_x_ht_level >= 1) {
1539 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1540 original_misfits, word->x_height,
1541 new_misfits, new_x_ht);
1542 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1543 word->best_choice->rating(), word->best_choice->certainty(),
1544 new_x_ht_word.best_choice->rating(),
1545 new_x_ht_word.best_choice->certainty());
1546 }
1547 // The misfits must improve and either the rating or certainty.
1548 accept_new_x_ht = new_misfits < original_misfits &&
1549 (new_x_ht_word.best_choice->certainty() >
1550 word->best_choice->certainty() ||
1551 new_x_ht_word.best_choice->rating() <
1552 word->best_choice->rating());
1553 if (debug_x_ht_level >= 1) {
1554 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1555 }
1556 }
1557 if (accept_new_x_ht) {
1558 word->ConsumeWordResults(&new_x_ht_word);
1559 return true;
1560 }
1561 return false;
1562}
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1462

◆ textord()

const Textord & tesseract::Tesseract::textord ( ) const
inline

Definition at line 268 of file tesseractclass.h.

268 {
269 return textord_;
270 }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES *  page_res)
  • Counts up the labelled words and the blobs within.
  • Deletes all unused or emptied words, counting the unused ones.
  • Resets W_BOL and W_EOL flags correctly.
  • Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 708 of file applybox.cpp.

708 {
709 int ok_blob_count = 0;
710 int bad_blob_count = 0;
711 int ok_word_count = 0;
712 int unlabelled_words = 0;
713 PAGE_RES_IT pr_it(page_res);
714 WERD_RES* word_res;
715 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
716 int ok_in_word = 0;
717 int blob_count = word_res->correct_text.size();
718 auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
719 word_choice->set_permuter(TOP_CHOICE_PERM);
720 for (int c = 0; c < blob_count; ++c) {
721 if (word_res->correct_text[c].length() > 0) {
722 ++ok_in_word;
723 }
724 // Since we only need a fake word_res->best_choice, the actual
725 // unichar_ids do not matter. Which is fortunate, since TidyUp()
726 // can be called while training Tesseract, at the stage where
727 // unicharset is not meaningful yet.
728 word_choice->append_unichar_id_space_allocated(
729 INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
730 }
731 if (ok_in_word > 0) {
732 ok_blob_count += ok_in_word;
733 bad_blob_count += word_res->correct_text.size() - ok_in_word;
734 word_res->LogNewRawChoice(word_choice);
735 word_res->LogNewCookedChoice(1, false, word_choice);
736 } else {
737 ++unlabelled_words;
738 if (applybox_debug > 0) {
739 tprintf("APPLY_BOXES: Unlabelled word at :");
740 word_res->word->bounding_box().print();
741 }
742 pr_it.DeleteCurrentWord();
743 delete word_choice;
744 }
745 }
746 pr_it.restart_page();
747 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
748 // Denormalize back to a BoxWord.
749 word_res->RebuildBestState();
750 word_res->SetupBoxWord();
751 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
752 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
753 }
754 if (applybox_debug > 0) {
755 tprintf(" Found %d good blobs.\n", ok_blob_count);
756 if (bad_blob_count > 0) {
757 tprintf(" Leaving %d unlabelled blobs in %d words.\n",
758 bad_blob_count, ok_word_count);
759 }
760 if (unlabelled_words > 0)
761 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
762 }
763}
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
void RebuildBestState()
Definition: pageres.cpp:808
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT &  page_res_it)

Definition at line 417 of file docqual.cpp.

417 {
418 WERD_RES *word;
419 GARBAGE_LEVEL garbage_level;
420 PAGE_RES_IT copy_it;
421 bool prev_potential_marked = false;
422 bool found_terrible_word = false;
423 bool ok_dict_word;
424
425 page_res_it.restart_page();
426 while (page_res_it.word() != nullptr) {
427 POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
428 if (pb != nullptr && !pb->IsText()) {
429 page_res_it.forward();
430 continue;
431 }
432 word = page_res_it.word();
433
436
438 word->merge_tess_fails();
439
440 if (word->reject_map.accept_count () != 0) {
441 found_terrible_word = false;
442 //Forget earlier potential crunches
443 prev_potential_marked = false;
444 }
445 else {
446 ok_dict_word = safe_dict_word(word);
447 garbage_level = garbage_word(word, ok_dict_word);
448
449 if ((garbage_level != G_NEVER_CRUNCH) &&
450 (terrible_word_crunch (word, garbage_level))) {
451 if (crunch_debug > 0) {
452 tprintf ("T CRUNCHING: \"%s\"\n",
454 }
456 if (prev_potential_marked) {
457 while (copy_it.word () != word) {
458 if (crunch_debug > 0) {
459 tprintf ("P1 CRUNCHING: \"%s\"\n",
460 copy_it.word()->best_choice->unichar_string().string());
461 }
462 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
463 copy_it.forward ();
464 }
465 prev_potential_marked = false;
466 }
467 found_terrible_word = true;
468 }
469 else if ((garbage_level != G_NEVER_CRUNCH) &&
471 garbage_level, ok_dict_word))) {
472 if (found_terrible_word) {
473 if (crunch_debug > 0) {
474 tprintf ("P2 CRUNCHING: \"%s\"\n",
476 }
478 }
479 else if (!prev_potential_marked) {
480 copy_it = page_res_it;
481 prev_potential_marked = true;
482 if (crunch_debug > 1) {
483 tprintf ("P3 CRUNCHING: \"%s\"\n",
485 }
486 }
487 }
488 else {
489 found_terrible_word = false;
490 //Forget earlier potential crunches
491 prev_potential_marked = false;
492 if (crunch_debug > 2) {
493 tprintf ("NO CRUNCH: \"%s\"\n",
495 }
496 }
497 }
498 page_res_it.forward ();
499 }
500}
GARBAGE_LEVEL
Definition: docqual.h:30
@ CR_KEEP_SPACE
Definition: pageres.h:159
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:679
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:659
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:541
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:503
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
void merge_tess_fails()
Definition: pageres.cpp:1067
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
int16_t accept_count()
Definition: rejctmap.cpp:279

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT &  page_res_it)

Definition at line 589 of file docqual.cpp.

589 {
590 WERD_RES *word;
591 PAGE_RES_IT copy_it;
592 bool deleting_from_bol = false;
593 bool marked_delete_point = false;
594 int16_t debug_delete_mode;
595 CRUNCH_MODE delete_mode;
596 int16_t x_debug_delete_mode;
597 CRUNCH_MODE x_delete_mode;
598
599 page_res_it.restart_page();
600 while (page_res_it.word() != nullptr) {
601 word = page_res_it.word();
602
603 delete_mode = word_deletable (word, debug_delete_mode);
604 if (delete_mode != CR_NONE) {
605 if (word->word->flag (W_BOL) || deleting_from_bol) {
606 if (crunch_debug > 0) {
607 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
608 debug_delete_mode,
610 }
611 word->unlv_crunch_mode = delete_mode;
612 deleting_from_bol = true;
613 } else if (word->word->flag(W_EOL)) {
614 if (marked_delete_point) {
615 while (copy_it.word() != word) {
616 x_delete_mode = word_deletable (copy_it.word (),
617 x_debug_delete_mode);
618 if (crunch_debug > 0) {
619 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
620 x_debug_delete_mode,
621 copy_it.word()->best_choice->unichar_string().string());
622 }
623 copy_it.word ()->unlv_crunch_mode = x_delete_mode;
624 copy_it.forward ();
625 }
626 }
627 if (crunch_debug > 0) {
628 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
629 debug_delete_mode,
631 }
632 word->unlv_crunch_mode = delete_mode;
633 deleting_from_bol = false;
634 marked_delete_point = false;
635 }
636 else {
637 if (!marked_delete_point) {
638 copy_it = page_res_it;
639 marked_delete_point = true;
640 }
641 }
642 }
643 else {
644 deleting_from_bol = false;
645 //Forget earlier potential crunches
646 marked_delete_point = false;
647 }
648 /*
649 The following step has been left till now as the tess fails are used to
650 determine if the word is deletable.
651 */
653 word->merge_tess_fails();
654 page_res_it.forward ();
655 }
656}
CRUNCH_MODE
Definition: pageres.h:157
@ CR_NONE
Definition: pageres.h:158
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:897

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)

Definition at line 1485 of file control.cpp.

1485 {
1486 int original_misfits = CountMisfitTops(word);
1487 if (original_misfits == 0)
1488 return false;
1489 float baseline_shift = 0.0f;
1490 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1491 if (baseline_shift != 0.0f) {
1492 // Try the shift on its own first.
1493 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1494 word, block, row))
1495 return false;
1496 original_misfits = CountMisfitTops(word);
1497 if (original_misfits > 0) {
1498 float new_baseline_shift;
1499 // Now recompute the new x_height.
1500 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1501 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1502 // No test of return value here, as we are definitely making a change
1503 // to the word by shifting the baseline.
1504 TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1505 word, block, row);
1506 }
1507 }
1508 return true;
1509 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1510 return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1511 word, block, row);
1512 } else {
1513 return false;
1514 }
1515}
const double kMinRefitXHeightFraction
Definition: control.cpp:51
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1519

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
BLOCK_LIST *  block_list,
DocumentData *  training_data 
)

Definition at line 81 of file linerec.cpp.

84 {
85 int box_count = boxes.size();
86 // Process all the text lines in this page, as defined by the boxes.
87 int end_box = 0;
88 // Don't let \t, which marks newlines in the box file, get into the line
89 // content, as that makes the line unusable in training.
90 while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
91 for (int start_box = end_box; start_box < box_count; start_box = end_box) {
92 // Find the textline of boxes starting at start and their bounding box.
93 TBOX line_box = boxes[start_box];
94 STRING line_str = texts[start_box];
95 for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
96 ++end_box) {
97 line_box += boxes[end_box];
98 line_str += texts[end_box];
99 }
100 // Find the most overlapping block.
101 BLOCK* best_block = nullptr;
102 int best_overlap = 0;
103 BLOCK_IT b_it(block_list);
104 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
105 BLOCK* block = b_it.data();
106 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
107 continue; // Not a text block.
108 TBOX block_box = block->pdblk.bounding_box();
109 block_box.rotate(block->re_rotation());
110 if (block_box.major_overlap(line_box)) {
111 TBOX overlap_box = line_box.intersection(block_box);
112 if (overlap_box.area() > best_overlap) {
113 best_overlap = overlap_box.area();
114 best_block = block;
115 }
116 }
117 }
118 ImageData* imagedata = nullptr;
119 if (best_block == nullptr) {
120 tprintf("No block overlapping textline: %s\n", line_str.string());
121 } else {
122 imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
123 *best_block);
124 }
125 if (imagedata != nullptr)
126 training_data->AddPageToDocument(imagedata);
127 // Don't let \t, which marks newlines in the box file, get into the line
128 // content, as that makes the line unusable in training.
129 while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
130 }
131}
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:136
int32_t area() const
Definition: rect.h:122
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87

◆ TrainLineRecognizer()

bool tesseract::Tesseract::TrainLineRecognizer ( const STRING &  input_imagename,
const STRING &  output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 44 of file linerec.cpp.

46 {
47 STRING lstmf_name = output_basename + ".lstmf";
48 DocumentData images(lstmf_name);
49 if (applybox_page > 0) {
50 // Load existing document for the previous pages.
51 if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
52 tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
53 return false;
54 }
55 }
58 // Get the boxes for this page, if there are any.
59 if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
60 nullptr) ||
61 boxes.empty()) {
62 tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
63 return false;
64 }
65 TrainFromBoxes(boxes, texts, block_list, &images);
66 if (images.PagesSize() == 0) {
67 tprintf("Failed to read pages from %s\n", input_imagename.c_str());
68 return false;
69 }
70 images.Shuffle();
71 if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
72 tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
73 return false;
74 }
75 return true;
76}
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:81

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES *  word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty of the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty of the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 383 of file superscript.cpp.

389 {
390 int num_chopped = word->chopped_word->NumBlobs();
391
392 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
393
394 // Chop apart the word into up to three pieces.
395
396 BlamerBundle *bb0 = nullptr;
397 BlamerBundle *bb1 = nullptr;
398 WERD_RES *prefix = nullptr;
399 WERD_RES *core = nullptr;
400 WERD_RES *suffix = nullptr;
401 if (num_chopped_leading > 0) {
402 prefix = new WERD_RES(*word);
403 split_word(prefix, num_chopped_leading, &core, &bb0);
404 } else {
405 core = new WERD_RES(*word);
406 }
407
408 if (num_chopped_trailing > 0) {
409 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
410 split_word(core, split_pt, &suffix, &bb1);
411 }
412
413 // Recognize the pieces in turn.
414 int saved_cp_multiplier = classify_class_pruner_multiplier;
415 int saved_im_multiplier = classify_integer_matcher_multiplier;
416 if (prefix) {
417 // Turn off Tesseract's y-position penalties for the leading superscript.
420
421 // Adjust our expectations about the baseline for this prefix.
422 if (superscript_debug >= 3) {
423 tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
424 }
425 recog_word_recursive(prefix);
426 if (superscript_debug >= 2) {
427 tprintf(" The leading bits look like %s %s\n",
428 ScriptPosToString(leading_pos),
429 prefix->best_choice->unichar_string().string());
430 }
431
432 // Restore the normal y-position penalties.
433 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
434 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
435 }
436
437 if (superscript_debug >= 3) {
438 tprintf(" recognizing middle %d chopped blobs\n",
439 num_chopped - num_chopped_leading - num_chopped_trailing);
440 }
441
442 if (suffix) {
443 // Turn off Tesseract's y-position penalties for the trailing superscript.
446
447 if (superscript_debug >= 3) {
448 tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
449 }
450 recog_word_recursive(suffix);
451 if (superscript_debug >= 2) {
452 tprintf(" The trailing bits look like %s %s\n",
453 ScriptPosToString(trailing_pos),
454 suffix->best_choice->unichar_string().string());
455 }
456
457 // Restore the normal y-position penalties.
458 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
459 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
460 }
461
462 // Evaluate whether we think the results are believably better
463 // than what we already had.
464 bool good_prefix = !prefix || BelievableSuperscript(
465 superscript_debug >= 1, *prefix,
466 superscript_bettered_certainty * leading_certainty,
467 retry_rebuild_leading, nullptr);
468 bool good_suffix = !suffix || BelievableSuperscript(
469 superscript_debug >= 1, *suffix,
470 superscript_bettered_certainty * trailing_certainty,
471 nullptr, retry_rebuild_trailing);
472
473 *is_good = good_prefix && good_suffix;
474 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
475 // None of it is any good. Quit now.
476 delete core;
477 delete prefix;
478 delete suffix;
479 delete bb1;
480 return nullptr;
481 }
483
484 // Now paste the results together into core.
485 if (suffix) {
486 suffix->SetAllScriptPositions(trailing_pos);
487 join_words(core, suffix, bb1);
488 }
489 if (prefix) {
490 prefix->SetAllScriptPositions(leading_pos);
491 join_words(prefix, core, bb0);
492 core = prefix;
493 prefix = nullptr;
494 }
495
496 if (superscript_debug >= 1) {
497 tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
499 }
500 return core;
501}
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:204
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:865
int classify_class_pruner_multiplier
Definition: classify.h:501
int classify_integer_matcher_multiplier
Definition: classify.h:505

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word,
ROW * row 
)

Definition at line 115 of file docqual.cpp.

115 {
116 if (word->bln_boxes == nullptr ||
117 word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
118 return;
119
120 DocQualCallbacks cb(word);
122 *word->rebuild_word,
124}
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:44
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 160 of file docqual.cpp.

161 {
162 WERD_RES *word;
163 ROW_RES *current_row;
164 BLOCK_RES *current_block;
165 int i;
166
167 page_res_it.restart_page ();
168 while (page_res_it.word () != nullptr) {
169 check_debug_pt (page_res_it.word (), 100);
170 if (bland_unrej) {
171 word = page_res_it.word ();
172 for (i = 0; i < word->reject_map.length (); i++) {
173 if (word->reject_map[i].accept_if_good_quality ())
174 word->reject_map[i].setrej_quality_accept ();
175 }
176 page_res_it.forward ();
177 }
178 else if ((page_res_it.row ()->char_count > 0) &&
179 ((page_res_it.row ()->rej_count /
180 static_cast<float>(page_res_it.row ()->char_count)) <=
182 word = page_res_it.word ();
188 != AC_UNACCEPTABLE)) {
189 unrej_good_chs(word, page_res_it.row ()->row);
190 }
191 page_res_it.forward ();
192 }
193 else {
194 /* Skip to end of dodgy row */
195 current_row = page_res_it.row ();
196 while ((page_res_it.word () != nullptr) &&
197 (page_res_it.row () == current_row))
198 page_res_it.forward ();
199 }
200 check_debug_pt (page_res_it.word (), 110);
201 }
202 page_res_it.restart_page ();
203 page_res_it.page_res->char_count = 0;
204 page_res_it.page_res->rej_count = 0;
205 current_block = nullptr;
206 current_row = nullptr;
207 while (page_res_it.word () != nullptr) {
208 if (current_block != page_res_it.block ()) {
209 current_block = page_res_it.block ();
210 current_block->char_count = 0;
211 current_block->rej_count = 0;
212 }
213 if (current_row != page_res_it.row ()) {
214 current_row = page_res_it.row ();
215 current_row->char_count = 0;
216 current_row->rej_count = 0;
217 current_row->whole_word_rej_count = 0;
218 }
219 page_res_it.rej_stat_word ();
220 page_res_it.forward ();
221 }
222}
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:115
void rej_stat_word()
Definition: pageres.cpp:1667
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:300

◆ word_adaptable()

bool tesseract::Tesseract::word_adaptable ( WERD_RES word,
uint16_t  mode 
)

Definition at line 34 of file adaptions.cpp.

36 {
38 tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
40 word->best_choice->rating(), word->best_choice->certainty());
41 }
42
43 bool status = false;
44 BITS16 flags(mode);
45
46 enum MODES
47 {
48 ADAPTABLE_WERD,
49 ACCEPTABLE_WERD,
50 CHECK_DAWGS,
51 CHECK_SPACES,
52 CHECK_ONE_ELL_CONFLICT,
53 CHECK_AMBIG_WERD
54 };
55
56 /*
57 0: NO adaption
58 */
59 if (mode == 0) {
60 if (tessedit_adaption_debug) tprintf("adaption disabled\n");
61 return false;
62 }
63
64 if (flags.bit (ADAPTABLE_WERD)) {
65 status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
66 if (tessedit_adaption_debug && !status) {
67 tprintf("tess_would_adapt bit is false\n");
68 }
69 }
70
71 if (flags.bit (ACCEPTABLE_WERD)) {
72 status |= word->tess_accepted;
73 if (tessedit_adaption_debug && !status) {
74 tprintf("tess_accepted bit is false\n");
75 }
76 }
77
78 if (!status) { // If not set then
79 return false; // ignore other checks
80 }
81
82 if (flags.bit (CHECK_DAWGS) &&
83 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
84 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
85 (word->best_choice->permuter () != USER_DAWG_PERM) &&
86 (word->best_choice->permuter () != NUMBER_PERM)) {
87 if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
88 return false;
89 }
90
91 if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, false)) {
92 if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
93 return false;
94 }
95
96 if (flags.bit (CHECK_SPACES) &&
97 (strchr(word->best_choice->unichar_string().string(), ' ') != nullptr)) {
98 if (tessedit_adaption_debug) tprintf("word contains spaces\n");
99 return false;
100 }
101
102 if (flags.bit (CHECK_AMBIG_WERD) &&
104 if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
105 return false;
106 }
107
109 tprintf("returning status %d\n", status);
110 }
111 return status;
112}
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:293
bool dangerous_ambig_found() const
Definition: ratngs.h:353
BITS16
Definition: bits16.h:25

◆ word_blank_and_set_display()

bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_it)

Definition at line 696 of file pgedit.cpp.

696 {
697 pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
699 return word_set_display(pr_it);
700}

◆ word_bln_display()

bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 708 of file pgedit.cpp.

708 {
709 WERD_RES* word_res = pr_it->word();
710 if (word_res->chopped_word == nullptr) {
711 // Setup word normalization parameters.
712 word_res->SetupForRecognition(unicharset, this, BestPix(),
717 pr_it->row()->row, pr_it->block()->block);
718 }
719 bln_word_window_handle()->Clear();
720 display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
721 1.0, 0.0f, -1000.0f, 1000.0f);
722 C_BLOB_IT it(word_res->word->cblob_list());
724 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
725 it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
726 bln_word_window_handle());
727 color = WERD::NextColor(color);
728 }
729 bln_word_window_handle()->Update();
730 return true;
731}
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:292

◆ word_blob_quality()

int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES word,
ROW row 
)

Definition at line 60 of file docqual.cpp.

60 {
61 if (word->bln_boxes == nullptr ||
62 word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
63 return 0;
64
65 DocQualCallbacks cb(word);
67 *word->rebuild_word,
69 return cb.match_count;
70}
void CountMatchingBlobs(int index)
Definition: docqual.cpp:34

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES word,
ROW row,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 92 of file docqual.cpp.

95 {
96 if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
97 word->rebuild_word->blobs.empty()) {
98 *match_count = 0;
99 *accepted_match_count = 0;
100 return;
101 }
102
103 DocQualCallbacks cb(word);
105 *word->rebuild_word,
107 *match_count = cb.match_count;
108 *accepted_match_count = cb.accepted_match_count;
109}
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:38

◆ word_contains_non_1_digit()

bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 510 of file reject.cpp.

511 {
512 int16_t i;
513 int16_t offset;
514
515 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516 if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517 (word_lengths[i] != 1 || word[offset] != '1'))
518 return true;
519 }
520 return false;
521}

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES word,
int16_t &  delete_mode 
)

Definition at line 897 of file docqual.cpp.

897 {
898 int word_len = word->reject_map.length ();
899 float rating_per_ch;
900 TBOX box; //BB of word
901
902 if (word->unlv_crunch_mode == CR_NONE) {
903 delete_mode = 0;
904 return CR_NONE;
905 }
906
907 if (word_len == 0) {
908 delete_mode = 1;
909 return CR_DELETE;
910 }
911
912 if (word->rebuild_word != nullptr) {
913 // Cube leaves rebuild_word nullptr.
914 box = word->rebuild_word->bounding_box();
915 if (box.height () < crunch_del_min_ht * kBlnXHeight) {
916 delete_mode = 4;
917 return CR_DELETE;
918 }
919
920 if (noise_outlines(word->rebuild_word)) {
921 delete_mode = 5;
922 return CR_DELETE;
923 }
924 }
925
926 if ((failure_count (word) * 1.5) > word_len) {
927 delete_mode = 2;
928 return CR_LOOSE_SPACE;
929 }
930
931 if (word->best_choice->certainty () < crunch_del_cert) {
932 delete_mode = 7;
933 return CR_LOOSE_SPACE;
934 }
935
936 rating_per_ch = word->best_choice->rating () / word_len;
937
938 if (rating_per_ch > crunch_del_rating) {
939 delete_mode = 8;
940 return CR_LOOSE_SPACE;
941 }
942
944 delete_mode = 9;
945 return CR_LOOSE_SPACE;
946 }
947
948 if (box.bottom () >
950 delete_mode = 10;
951 return CR_LOOSE_SPACE;
952 }
953
954 if (box.height () > crunch_del_max_ht * kBlnXHeight) {
955 delete_mode = 11;
956 return CR_LOOSE_SPACE;
957 }
958
959 if (box.width () < crunch_del_min_width * kBlnXHeight) {
960 delete_mode = 3;
961 return CR_LOOSE_SPACE;
962 }
963
964 delete_mode = 0;
965 return CR_NONE;
966}
@ CR_DELETE
Definition: pageres.h:161
@ CR_LOOSE_SPACE
Definition: pageres.h:160
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:980
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:968

◆ word_display()

bool tesseract::Tesseract::word_display ( PAGE_RES_IT pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 740 of file pgedit.cpp.

740 {
741 WERD_RES* word_res = pr_it->word();
742 WERD* word = word_res->word;
743 TBOX word_bb; // word bounding box
744 int word_height; // ht of word BB
745 bool displayed_something = false;
746 float shift; // from bot left
747
748 if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
749 #ifndef DISABLED_LEGACY_ENGINE
750 BoxWord* box_word = word_res->box_word;
751 WERD_CHOICE* best_choice = word_res->best_choice;
752 int length = box_word->length();
753 if (word_res->fontinfo == nullptr) return false;
754 const FontInfo& font_info = *word_res->fontinfo;
755 for (int i = 0; i < length; ++i) {
757 switch (color_mode) {
758 case CM_SUBSCRIPT:
759 if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
760 color = ScrollView::RED;
761 break;
762 case CM_SUPERSCRIPT:
763 if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
764 color = ScrollView::RED;
765 break;
766 case CM_ITALIC:
767 if (font_info.is_italic())
768 color = ScrollView::RED;
769 break;
770 case CM_BOLD:
771 if (font_info.is_bold())
772 color = ScrollView::RED;
773 break;
774 case CM_FIXEDPITCH:
775 if (font_info.is_fixed_pitch())
776 color = ScrollView::RED;
777 break;
778 case CM_SERIF:
779 if (font_info.is_serif())
780 color = ScrollView::RED;
781 break;
782 case CM_SMALLCAPS:
783 if (word_res->small_caps)
784 color = ScrollView::RED;
785 break;
786 case CM_DROPCAPS:
787 if (best_choice->BlobPosition(i) == SP_DROPCAP)
788 color = ScrollView::RED;
789 break;
790 // TODO(rays) underline is currently completely unsupported.
791 case CM_UNDERLINE:
792 default:
793 break;
794 }
795 image_win->Pen(color);
796 TBOX box = box_word->BlobBox(i);
797 image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
798 }
799 return true;
800 #else
801 return false;
802 #endif // ndef DISABLED_LEGACY_ENGINE
803 }
804 /*
805 Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
806 etc. are to keep the compiler happy.
807 */
808 // display bounding box
809 if (word->display_flag(DF_BOX)) {
810 word->bounding_box().plot(image_win,
811 static_cast<ScrollView::Color>((int32_t)
813 static_cast<ScrollView::Color>((int32_t)
815
816 auto c = static_cast<ScrollView::Color>((int32_t) editor_image_blob_bb_color);
817 image_win->Pen(c);
818 // cblob iterator
819 C_BLOB_IT c_it(word->cblob_list());
820 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
821 c_it.data()->bounding_box().plot(image_win);
822 displayed_something = true;
823 }
824
825 // display edge steps
826 if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
827 word->plot(image_win); // rainbow colors
828 displayed_something = true;
829 }
830
831 // display poly approx
832 if (word->display_flag(DF_POLYGONAL)) {
833 // need to convert
835 tword->plot(image_win);
836 delete tword;
837 displayed_something = true;
838 }
839
840 // Display correct text and blamer information.
841 STRING text;
842 STRING blame;
843 if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
844 text = word->text();
845 }
846 if (word->display_flag(DF_BLAMER) &&
847 !(word_res->blamer_bundle != nullptr &&
849 text = "";
850 const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
851 if (blamer_bundle == nullptr) {
852 text += "NULL";
853 } else {
854 text = blamer_bundle->TruthString();
855 }
856 text += " -> ";
857 STRING best_choice_str;
858 if (word_res->best_choice == nullptr) {
859 best_choice_str = "NULL";
860 } else {
861 word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
862 }
863 text += best_choice_str;
864 IncorrectResultReason reason = (blamer_bundle == nullptr) ?
865 IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
867 blame += " [";
868 blame += BlamerBundle::IncorrectReasonName(reason);
869 blame += "]";
870 }
871 if (text.length() > 0) {
872 word_bb = word->bounding_box();
873 image_win->Pen(ScrollView::RED);
874 word_height = word_bb.height();
875 int text_height = 0.50 * word_height;
876 if (text_height > 20) text_height = 20;
877 image_win->TextAttributes("Arial", text_height, false, false, false);
878 shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
879 image_win->Text(word_bb.left() + shift,
880 word_bb.bottom() + 0.25 * word_height, text.string());
881 if (blame.length() > 0) {
882 image_win->Text(word_bb.left() + shift,
883 word_bb.bottom() + 0.25 * word_height - text_height,
884 blame.string());
885 }
886
887 displayed_something = true;
888 }
889
890 if (!displayed_something) // display BBox anyway
891 word->bounding_box().plot(image_win,
892 static_cast<ScrollView::Color>((int32_t) editor_image_word_bb_color),
893 static_cast<ScrollView::Color>((int32_t)
895 return true;
896}
int editor_image_blob_bb_color
Definition: pgedit.cpp:127
int editor_image_word_bb_color
Definition: pgedit.cpp:125
@ IRR_PAGE_LAYOUT
Definition: blamer.h:72
@ IRR_CORRECT
Definition: blamer.h:53
@ SP_DROPCAP
Definition: ratngs.h:256
STRING TruthString() const
Definition: blamer.h:114
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:776
bool is_fixed_pitch() const
Definition: fontinfo.h:113
bool is_bold() const
Definition: fontinfo.h:112
bool is_serif() const
Definition: fontinfo.h:114
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:312
void plot(ScrollView *fd) const
Definition: rect.h:286
bool display_flag(uint8_t flag) const
Definition: werd.h:120
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:283
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:635
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:652
void Pen(Color color)
Definition: scrollview.cpp:719
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600

◆ word_dumper()

bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT pr_it)

word_dumper()

Dump members to the debug window

Definition at line 904 of file pgedit.cpp.

904 {
905 if (pr_it->block()->block != nullptr) {
906 tprintf("\nBlock data...\n");
907 pr_it->block()->block->print(nullptr, false);
908 }
909 tprintf("\nRow data...\n");
910 pr_it->row()->row->print(nullptr);
911 tprintf("\nWord data...\n");
912 WERD_RES* word_res = pr_it->word();
913 word_res->word->print();
914 if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
916 tprintf("Current blamer debug: %s\n",
917 word_res->blamer_bundle->debug().string());
918 }
919 return true;
920}
const STRING & debug() const
Definition: blamer.h:130
void print(FILE *fp, bool dump)
dump whole table
Definition: ocrblock.cpp:190
void print(FILE *fp)
Definition: ocrrow.cpp:166

◆ word_outline_errs()

int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES word)

Definition at line 72 of file docqual.cpp.

72 {
73 int16_t i = 0;
74 int16_t err_count = 0;
75
76 if (word->rebuild_word != nullptr) {
77 for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
78 TBLOB* blob = word->rebuild_word->blobs[b];
79 err_count += count_outline_errs(word->best_choice->unichar_string()[i],
80 blob->NumOutlines());
81 i++;
82 }
83 }
84 return err_count;
85}
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:126
int NumOutlines() const
Definition: blobs.cpp:454

◆ word_set_display()

bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 928 of file pgedit.cpp.

928 {
929 WERD* word = pr_it->word()->word;
930 word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
931 word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
932 word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
933 word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
935 word_display_mode.bit(DF_BN_POLYGONAL));
936 word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
937 return word_display(pr_it);
938}
void set_display_flag(uint8_t flag, bool value)
Definition: werd.h:121
bool bit(uint8_t bit_num) const
Definition: bits16.h:51

◆ worst_noise_blob()

int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES word_res,
float *  worst_noise_score 
)

Definition at line 707 of file fixspace.cpp.

708 {
709 float noise_score[512];
710 int i;
711 int min_noise_blob; // 1st contender
712 int max_noise_blob; // last contender
713 int non_noise_count;
714 int worst_noise_blob; // Worst blob
715 float small_limit = kBlnXHeight * fixsp_small_outlines_size;
716 float non_noise_limit = kBlnXHeight * 0.8;
717
718 if (word_res->rebuild_word == nullptr)
719 return -1; // Can't handle cube words.
720
721 // Normalised.
722 int blob_count = word_res->box_word->length();
723 ASSERT_HOST(blob_count <= 512);
724 if (blob_count < 5)
725 return -1; // too short to split
726
727 /* Get the noise scores for all blobs */
728
729 #ifndef SECURE_NAMES
730 if (debug_fix_space_level > 5)
731 tprintf("FP fixspace Noise metrics for \"%s\": ",
732 word_res->best_choice->unichar_string().string());
733 #endif
734
735 for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
736 TBLOB* blob = word_res->rebuild_word->blobs[i];
737 if (word_res->reject_map[i].accepted())
738 noise_score[i] = non_noise_limit;
739 else
740 noise_score[i] = blob_noise_score(blob);
741
742 if (debug_fix_space_level > 5)
743 tprintf("%1.1f ", noise_score[i]);
744 }
745 if (debug_fix_space_level > 5)
746 tprintf("\n");
747
748 /* Now find the worst one which is far enough away from the end of the word */
749
750 non_noise_count = 0;
751 for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
752 if (noise_score[i] >= non_noise_limit) {
753 non_noise_count++;
754 }
755 }
756 if (non_noise_count < fixsp_non_noise_limit)
757 return -1;
758
759 min_noise_blob = i;
760
761 non_noise_count = 0;
762 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
763 i--) {
764 if (noise_score[i] >= non_noise_limit) {
765 non_noise_count++;
766 }
767 }
768 if (non_noise_count < fixsp_non_noise_limit)
769 return -1;
770
771 max_noise_blob = i;
772
773 if (min_noise_blob > max_noise_blob)
774 return -1;
775
776 *worst_noise_score = small_limit;
777 worst_noise_blob = -1;
778 for (i = min_noise_blob; i <= max_noise_blob; i++) {
779 if (noise_score[i] < *worst_noise_score) {
781 *worst_noise_score = noise_score[i];
782 }
783 }
784 return worst_noise_blob;
785}

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 98 of file output.cpp.

100 { // override tilde crunch?
101 WERD_RES *word = page_res_it.word();
102 const UNICHARSET &uchset = *word->uch_set;
103 int i;
104 bool need_reject = false;
105 UNICHAR_ID space = uchset.unichar_to_id(" ");
106
107 if ((word->unlv_crunch_mode != CR_NONE ||
108 word->best_choice->length() == 0) &&
110 if ((word->unlv_crunch_mode != CR_DELETE) &&
111 (!stats_.tilde_crunch_written ||
112 ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
113 (word->word->space () > 0) &&
114 !word->word->flag (W_FUZZY_NON) &&
115 !word->word->flag (W_FUZZY_SP)))) {
116 if (!word->word->flag (W_BOL) &&
117 (word->word->space () > 0) &&
118 !word->word->flag (W_FUZZY_NON) &&
119 !word->word->flag (W_FUZZY_SP)) {
120 stats_.last_char_was_tilde = false;
121 }
122 need_reject = true;
123 }
124 if ((need_reject && !stats_.last_char_was_tilde) ||
125 (force_eol && stats_.write_results_empty_block)) {
126 /* Write a reject char - mark as rejected unless zero_rejection mode */
127 stats_.last_char_was_tilde = true;
128 stats_.tilde_crunch_written = true;
129 stats_.last_char_was_newline = false;
130 stats_.write_results_empty_block = false;
131 }
132
133 if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
134 stats_.tilde_crunch_written = false;
135 stats_.last_char_was_newline = true;
136 stats_.last_char_was_tilde = false;
137 }
138
139 if (force_eol)
140 stats_.write_results_empty_block = true;
141 return;
142 }
143
144 /* NORMAL PROCESSING of non tilde crunched words */
145
146 stats_.tilde_crunch_written = false;
147 if (newline_type)
148 stats_.last_char_was_newline = true;
149 else
150 stats_.last_char_was_newline = false;
151 stats_.write_results_empty_block = force_eol; // about to write a real word
152
154 stats_.last_char_was_tilde &&
155 (word->word->space() == 0) &&
157 (word->best_choice->unichar_id(0) == space)) {
158 /* Prevent adjacent tilde across words - we know that adjacent tildes within
159 words have been removed */
160 word->MergeAdjacentBlobs(0);
161 }
162 if (newline_type ||
164 stats_.last_char_was_tilde = false;
165 else {
166 if (word->reject_map.length () > 0) {
167 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
168 stats_.last_char_was_tilde = true;
169 else
170 stats_.last_char_was_tilde = false;
171 }
172 else if (word->word->space () > 0)
173 stats_.last_char_was_tilde = false;
174 /* else it is unchanged as there are no output chars */
175 }
176
177 ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
178
179 set_unlv_suspects(word);
180 check_debug_pt (word, 120);
182 tprintf ("Dict word: \"%s\": %d\n",
184 dict_word(*(word->best_choice)));
185 }
186 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
188 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
189 for (i = 0; i < word->best_choice->length(); ++i) {
190 if (word->reject_map[i].rejected())
191 word->reject_map[i].setrej_minimal_rej_accept();
192 }
193 }
195 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196 for (i = 0; i < word->best_choice->length(); ++i) {
197 if ((word->best_choice->unichar_id(i) != space) &&
198 word->reject_map[i].rejected())
199 word->reject_map[i].setrej_minimal_rej_accept();
200 }
201 }
202 }
203}
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:273
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:974

Member Data Documentation

◆ applybox_debug

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 825 of file tesseractclass.h.

◆ applybox_exposure_pattern

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"

Definition at line 830 of file tesseractclass.h.

◆ applybox_learn_chars_and_char_frags_mode

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the special low exposure mode) as well as unfragmented characters."

Definition at line 834 of file tesseractclass.h.

◆ applybox_learn_ngrams_mode

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only learn the ngrams whose outlines overlap horizontally."

Definition at line 837 of file tesseractclass.h.

◆ applybox_page

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 826 of file tesseractclass.h.

◆ bidi_debug

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 824 of file tesseractclass.h.

◆ bland_unrej

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 930 of file tesseractclass.h.

◆ chs_leading_punct

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 875 of file tesseractclass.h.

◆ chs_trailing_punct1

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 876 of file tesseractclass.h.

◆ chs_trailing_punct2

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 877 of file tesseractclass.h.

◆ conflict_set_I_l_1

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1043 of file tesseractclass.h.

◆ crunch_accept_ok

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 955 of file tesseractclass.h.

◆ crunch_debug

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 964 of file tesseractclass.h.

◆ crunch_del_cert

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 945 of file tesseractclass.h.

◆ crunch_del_high_word

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 949 of file tesseractclass.h.

◆ crunch_del_low_word

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 950 of file tesseractclass.h.

◆ crunch_del_max_ht

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 947 of file tesseractclass.h.

◆ crunch_del_min_ht

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 946 of file tesseractclass.h.

◆ crunch_del_min_width

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 948 of file tesseractclass.h.

◆ crunch_del_rating

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 944 of file tesseractclass.h.

◆ crunch_early_convert_bad_unlv_chs

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 937 of file tesseractclass.h.

◆ crunch_early_merge_tess_fails

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 936 of file tesseractclass.h.

◆ crunch_include_numerals

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 958 of file tesseractclass.h.

◆ crunch_leave_accept_strings

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 957 of file tesseractclass.h.

◆ crunch_leave_lc_strings

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 960 of file tesseractclass.h.

◆ crunch_leave_ok_strings

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 954 of file tesseractclass.h.

◆ crunch_leave_uc_strings

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 962 of file tesseractclass.h.

◆ crunch_long_repetitions

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 963 of file tesseractclass.h.

◆ crunch_poor_garbage_cert

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 940 of file tesseractclass.h.

◆ crunch_poor_garbage_rate

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 941 of file tesseractclass.h.

◆ crunch_pot_indicators

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 953 of file tesseractclass.h.

◆ crunch_pot_poor_cert

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 943 of file tesseractclass.h.

◆ crunch_pot_poor_rate

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 942 of file tesseractclass.h.

◆ crunch_rating_max

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 952 of file tesseractclass.h.

◆ crunch_small_outlines_size

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 951 of file tesseractclass.h.

◆ crunch_terrible_garbage

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 939 of file tesseractclass.h.

◆ crunch_terrible_rating

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 938 of file tesseractclass.h.

◆ debug_fix_space_level

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 969 of file tesseractclass.h.

◆ debug_noise_removal

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 859 of file tesseractclass.h.

◆ debug_x_ht_level

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 874 of file tesseractclass.h.

◆ enable_noise_removal

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"

Definition at line 858 of file tesseractclass.h.

◆ file_type

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1050 of file tesseractclass.h.

◆ fixsp_done_mode

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 968 of file tesseractclass.h.

◆ fixsp_non_noise_limit

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 965 of file tesseractclass.h.

◆ fixsp_small_outlines_size

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 966 of file tesseractclass.h.

◆ hocr_char_boxes

bool tesseract::Tesseract::hocr_char_boxes = false

"Add coordinates for each character to hocr output"

Definition at line 935 of file tesseractclass.h.

◆ hocr_font_info

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 933 of file tesseractclass.h.

◆ interactive_display_mode

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1049 of file tesseractclass.h.

◆ jpg_quality

int tesseract::Tesseract::jpg_quality = 85

"Set JPEG quality level"

Definition at line 1011 of file tesseractclass.h.

◆ lstm_choice_mode

int tesseract::Tesseract::lstm_choice_mode = 0

"Allows to include alternative symbols choices in the hOCR " "output. " "Valid input values are 0, 1, 2 and 3. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " "With 2 the alternative symbol choices are accumulated per " "character. "

Definition at line 1087 of file tesseractclass.h.

◆ lstm_use_matrix

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 897 of file tesseractclass.h.

◆ min_characters_to_try

int tesseract::Tesseract::min_characters_to_try = 50

"Specify minimum characters to try during OSD"

Definition at line 1014 of file tesseractclass.h.

◆ min_orientation_margin

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1059 of file tesseractclass.h.

◆ min_sane_x_ht_pixels

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq to this"

Definition at line 1044 of file tesseractclass.h.

◆ multilang_debug_level

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 892 of file tesseractclass.h.

◆ noise_cert_basechar

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 862 of file tesseractclass.h.

◆ noise_cert_disjoint

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 865 of file tesseractclass.h.

◆ noise_cert_factor

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 871 of file tesseractclass.h.

◆ noise_cert_punc

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 868 of file tesseractclass.h.

◆ noise_maxperblob

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 872 of file tesseractclass.h.

◆ noise_maxperword

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 873 of file tesseractclass.h.

◆ numeric_punctuation

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 970 of file tesseractclass.h.

◆ ocr_devanagari_split_strategy

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 819 of file tesseractclass.h.

◆ ok_repeated_ch_non_alphanum_wds

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1042 of file tesseractclass.h.

◆ outlines_2

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 899 of file tesseractclass.h.

◆ outlines_odd

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 898 of file tesseractclass.h.

◆ page_separator

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1080 of file tesseractclass.h.

◆ pageseg_apply_music_mask

bool tesseract::Tesseract::pageseg_apply_music_mask = true

"Detect music staff and remove intersecting components"

Definition at line 1089 of file tesseractclass.h.

◆ pageseg_devanagari_split_strategy

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 815 of file tesseractclass.h.

◆ paragraph_debug_level

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 893 of file tesseractclass.h.

◆ paragraph_text_based

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 896 of file tesseractclass.h.

◆ poly_allow_detailed_fx

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1063 of file tesseractclass.h.

◆ preserve_interword_spaces

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1078 of file tesseractclass.h.

◆ quality_blob_pc

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 879 of file tesseractclass.h.

◆ quality_char_pc

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 882 of file tesseractclass.h.

◆ quality_min_initial_alphas_reqd

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 883 of file tesseractclass.h.

◆ quality_outline_pc

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 881 of file tesseractclass.h.

◆ quality_rej_pc

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 878 of file tesseractclass.h.

◆ quality_rowrej_pc

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 931 of file tesseractclass.h.

◆ rej_1Il_trust_permuter_type

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1034 of file tesseractclass.h.

◆ rej_1Il_use_dict_word

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1033 of file tesseractclass.h.

◆ rej_alphas_in_number_perm

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1039 of file tesseractclass.h.

◆ rej_trust_doc_dawg

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 1032 of file tesseractclass.h.

◆ rej_use_good_perm

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1037 of file tesseractclass.h.

◆ rej_use_sensible_wd

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1038 of file tesseractclass.h.

◆ rej_use_tess_accepted

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1035 of file tesseractclass.h.

◆ rej_use_tess_blanks

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1036 of file tesseractclass.h.

◆ rej_whole_of_mostly_reject_word_fract

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1040 of file tesseractclass.h.

◆ subscript_max_y_top

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 991 of file tesseractclass.h.

◆ superscript_bettered_certainty

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 983 of file tesseractclass.h.

◆ superscript_debug

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 974 of file tesseractclass.h.

◆ superscript_min_y_bottom

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 995 of file tesseractclass.h.

◆ superscript_scaledown_ratio

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 987 of file tesseractclass.h.

◆ superscript_worse_certainty

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 978 of file tesseractclass.h.

◆ suspect_accept_rating

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1020 of file tesseractclass.h.

◆ suspect_constrain_1Il

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1018 of file tesseractclass.h.

◆ suspect_level

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1016 of file tesseractclass.h.

◆ suspect_rating_per_ch

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1019 of file tesseractclass.h.

◆ suspect_short_words

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1017 of file tesseractclass.h.

◆ tessedit_adaption_debug

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 823 of file tesseractclass.h.

◆ tessedit_ambigs_training

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 811 of file tesseractclass.h.

◆ tessedit_bigram_debug

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 855 of file tesseractclass.h.

◆ tessedit_char_blacklist

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 806 of file tesseractclass.h.

◆ tessedit_char_unblacklist

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 809 of file tesseractclass.h.

◆ tessedit_char_whitelist

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 807 of file tesseractclass.h.

◆ tessedit_create_alto

bool tesseract::Tesseract::tessedit_create_alto = false

"Write .xml ALTO output file"

Definition at line 1002 of file tesseractclass.h.

◆ tessedit_create_boxfile

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1045 of file tesseractclass.h.

◆ tessedit_create_hocr

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1001 of file tesseractclass.h.

◆ tessedit_create_lstmbox

bool tesseract::Tesseract::tessedit_create_lstmbox = false

"Write .box file for LSTM training"

Definition at line 1004 of file tesseractclass.h.

◆ tessedit_create_pdf

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1008 of file tesseractclass.h.

◆ tessedit_create_tsv

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1005 of file tesseractclass.h.

◆ tessedit_create_txt

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 1000 of file tesseractclass.h.

◆ tessedit_create_wordstrbox

bool tesseract::Tesseract::tessedit_create_wordstrbox = false

"Write WordStr format .box output file"

Definition at line 1007 of file tesseractclass.h.

◆ tessedit_debug_block_rejection

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 848 of file tesseractclass.h.

◆ tessedit_debug_doc_rejection

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 927 of file tesseractclass.h.

◆ tessedit_debug_fonts

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 847 of file tesseractclass.h.

◆ tessedit_debug_quality_metrics

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 929 of file tesseractclass.h.

◆ tessedit_display_outwords

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 838 of file tesseractclass.h.

◆ tessedit_do_invert

bool tesseract::Tesseract::tessedit_do_invert = true

"Try inverting the image in `LSTMRecognizeWord`"

Definition at line 797 of file tesseractclass.h.

◆ tessedit_dont_blkrej_good_wds

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 917 of file tesseractclass.h.

◆ tessedit_dont_rowrej_good_wds

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 919 of file tesseractclass.h.

◆ tessedit_dump_choices

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 839 of file tesseractclass.h.

◆ tessedit_dump_pageseg_images

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 795 of file tesseractclass.h.

◆ tessedit_enable_bigram_correction

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 850 of file tesseractclass.h.

◆ tessedit_enable_dict_correction

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 852 of file tesseractclass.h.

◆ tessedit_enable_doc_dict

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 846 of file tesseractclass.h.

◆ tessedit_fix_fuzzy_spaces

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 841 of file tesseractclass.h.

◆ tessedit_fix_hyphens

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 844 of file tesseractclass.h.

◆ tessedit_flip_0O

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1029 of file tesseractclass.h.

◆ tessedit_good_doc_still_rowrej_wd

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 925 of file tesseractclass.h.

◆ tessedit_good_quality_unrej

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 901 of file tesseractclass.h.

◆ tessedit_image_border

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1041 of file tesseractclass.h.

◆ tessedit_init_config_only

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1066 of file tesseractclass.h.

◆ tessedit_load_sublangs

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1053 of file tesseractclass.h.

◆ tessedit_lower_flip_hyphen

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1030 of file tesseractclass.h.

◆ tessedit_make_boxes_from_boxes

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 791 of file tesseractclass.h.

◆ tessedit_minimal_rej_pass1

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 887 of file tesseractclass.h.

◆ tessedit_minimal_rejection

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1021 of file tesseractclass.h.

◆ tessedit_ocr_engine_mode

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."

Definition at line 804 of file tesseractclass.h.

◆ tessedit_override_permuter

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1051 of file tesseractclass.h.

◆ tessedit_page_number

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1047 of file tesseractclass.h.

◆ tessedit_pageseg_mode

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"

Definition at line 801 of file tesseractclass.h.

◆ tessedit_parallelize

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1076 of file tesseractclass.h.

◆ tessedit_prefer_joined_punct

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 967 of file tesseractclass.h.

◆ tessedit_preserve_blk_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 913 of file tesseractclass.h.

◆ tessedit_preserve_min_wd_len

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 921 of file tesseractclass.h.

◆ tessedit_preserve_row_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 915 of file tesseractclass.h.

◆ tessedit_reject_bad_qual_wds

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 926 of file tesseractclass.h.

◆ tessedit_reject_block_percent

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 906 of file tesseractclass.h.

◆ tessedit_reject_doc_percent

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 904 of file tesseractclass.h.

◆ tessedit_reject_mode

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1027 of file tesseractclass.h.

◆ tessedit_reject_row_percent

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 908 of file tesseractclass.h.

◆ tessedit_rejection_debug

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Rejection debug"

Definition at line 1028 of file tesseractclass.h.

◆ tessedit_resegment_from_boxes

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 785 of file tesseractclass.h.

◆ tessedit_resegment_from_line_boxes

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 787 of file tesseractclass.h.

◆ tessedit_row_rej_good_docs

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 923 of file tesseractclass.h.

◆ tessedit_tess_adaption_mode

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 885 of file tesseractclass.h.

◆ tessedit_test_adaption

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 888 of file tesseractclass.h.

◆ tessedit_timing_debug

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 840 of file tesseractclass.h.

◆ tessedit_train_from_boxes

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 789 of file tesseractclass.h.

◆ tessedit_train_line_recognizer

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 793 of file tesseractclass.h.

◆ tessedit_unrej_any_wd

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 843 of file tesseractclass.h.

◆ tessedit_upper_flip_hyphen

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1031 of file tesseractclass.h.

◆ tessedit_use_primary_params_model

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1055 of file tesseractclass.h.

◆ tessedit_use_reject_spaces

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 902 of file tesseractclass.h.

◆ tessedit_whole_wd_rej_row_percent

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects " "which prevents whole row rejection"

Definition at line 911 of file tesseractclass.h.

◆ tessedit_word_for_word

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1024 of file tesseractclass.h.

◆ tessedit_write_block_separators

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 997 of file tesseractclass.h.

◆ tessedit_write_images

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1048 of file tesseractclass.h.

◆ tessedit_write_params_to_file

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 821 of file tesseractclass.h.

◆ tessedit_write_rep_codes

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 998 of file tesseractclass.h.

◆ tessedit_write_unlv

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 999 of file tesseractclass.h.

◆ tessedit_zero_kelvin_rejection

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1026 of file tesseractclass.h.

◆ tessedit_zero_rejection

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1022 of file tesseractclass.h.

◆ test_pt

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 889 of file tesseractclass.h.

◆ test_pt_x

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 890 of file tesseractclass.h.

◆ test_pt_y

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 891 of file tesseractclass.h.

◆ textonly_pdf

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1010 of file tesseractclass.h.

◆ textord_equation_detect

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1067 of file tesseractclass.h.

◆ textord_tabfind_aligned_gap_fraction

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1075 of file tesseractclass.h.

◆ textord_tabfind_force_vertical_text

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1070 of file tesseractclass.h.

◆ textord_tabfind_show_vlines

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1060 of file tesseractclass.h.

◆ textord_tabfind_vertical_text

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1068 of file tesseractclass.h.

◆ textord_tabfind_vertical_text_ratio

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page " "mode"

Definition at line 1073 of file tesseractclass.h.

◆ textord_use_cjk_fp_model

bool tesseract::Tesseract::textord_use_cjk_fp_model = false

"Use CJK fixed pitch model"

Definition at line 1061 of file tesseractclass.h.

◆ unlv_tilde_crunching

bool tesseract::Tesseract::unlv_tilde_crunching = false

"Mark v.bad words for tilde crunch"

Definition at line 932 of file tesseractclass.h.

◆ unrecognised_char

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1015 of file tesseractclass.h.

◆ user_defined_dpi

int tesseract::Tesseract::user_defined_dpi = 0

"Specify DPI for input image"

Definition at line 1012 of file tesseractclass.h.

◆ x_ht_acceptance_tolerance

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 972 of file tesseractclass.h.

◆ x_ht_min_change

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 973 of file tesseractclass.h.


The documentation for this class was generated from the following files: