tesseract 4.1.1
Loading...
Searching...
No Matches
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 95 of file osdetect.h.

Constructor & Destructor Documentation

◆ ScriptDetector()

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 453 of file osdetect.cpp.

// Constructor body: stores the three caller-supplied pointers and caches the
// integer ids of the scripts that detect_blob() later compares against.
454 {
// Plain pointer assignments; nothing is copied or allocated here.
455 osr_ = osr;
456 tess_ = tess;
457 allowed_scripts_ = allowed_scripts;
// tess_ is dereferenced unconditionally from here on, so `tess` must be
// non-null. Each add_script() call returns an int that is kept in a member.
// NOTE(review): presumably add_script() returns the existing id when the
// script name is already registered -- confirm against UNICHARSET.
458 katakana_id_ = tess_->unicharset.add_script(katakana_script);
459 hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
460 han_id_ = tess_->unicharset.add_script(han_script);
461 hangul_id_ = tess_->unicharset.add_script(hangul_script);
// Japanese and Korean are treated by detect_blob() as pseudo-scripts that
// receive votes derived from Katakana/Hiragana/Hangul/Han matches.
462 japanese_id_ = tess_->unicharset.add_script(japanese_script_);
463 korean_id_ = tess_->unicharset.add_script(korean_script_);
464 latin_id_ = tess_->unicharset.add_script(latin_script);
// Fraktur gets its own id so Latin votes can be redirected to it.
465 fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
466}
UNICHARSET unicharset
Definition: ccutil.h:73
int add_script(const char *script)

Member Function Documentation

◆ detect_blob()

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 471 of file osdetect.cpp.

// Tallies script votes for one blob. `scores` is addressed as scores + i for
// i in [0, 4), i.e. one BLOB_CHOICE_LIST per orientation. For each
// orientation the choices are scanned in list order; when exactly one script
// ends up counted (script_count == 1), that script's slot in
// osr_->scripts_na[i] is incremented, with extra adjustments for Fraktur and
// the Japanese/Korean pseudo-scripts.
// NOTE(review): `id` indexes done[] (size kMaxNumberOfScripts) and `prev_id`
// indexes scripts_na[i][...] with no range check in this function -- assumes
// script_id() always lies in [0, kMaxNumberOfScripts); TODO confirm.
471 {
472 for (int i = 0; i < 4; ++i) {
// Per-orientation record of which script ids have already been examined.
473 bool done[kMaxNumberOfScripts] = { false };
474
475 BLOB_CHOICE_IT choice_it;
476 choice_it.set_to_list(scores + i);
477
// State captured from the first accepted choice. prev_score < 0 is used as
// the "nothing accepted yet" sentinel (assumes -certainty() is never
// negative -- TODO confirm).
478 float prev_score = -1;
479 int script_count = 0;
480 int prev_id = -1;
481 int prev_fontinfo_id = -1;
482 const char* prev_unichar = "";
483 const char* unichar = "";
484
485 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
486 choice_it.forward()) {
487 BLOB_CHOICE* choice = choice_it.data();
488 int id = choice->script_id();
// The allow-list filter is only active when a non-empty list was supplied.
489 if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
490 // Check that the choice is in an allowed script.
491 int s = 0;
492 for (s = 0; s < allowed_scripts_->size(); ++s) {
493 if ((*allowed_scripts_)[s] == id) break;
494 }
495 if (s == allowed_scripts_->size()) continue; // Not found in list.
496 }
497 // Script already processed before.
498 if (done[id]) continue;
499 done[id] = true;
500
501 unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
502 // Save data from the first match
503 if (prev_score < 0) {
504 prev_score = -choice->certainty();
505 script_count = 1;
506 prev_id = id;
507 prev_unichar = unichar;
508 prev_fontinfo_id = choice->fontinfo_id();
// A later choice from a not-yet-seen script counts as a competitor when its
// negated certainty is below the first match's plus kNonAmbiguousMargin.
509 } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
510 ++script_count;
511 }
512
// Stop scanning when the first match's unichar is one byte long and the
// current unichar starts with an ASCII digit. On the first accepted choice
// prev_unichar == unichar, so this also fires for a lone one-byte digit.
513 if (strlen(prev_unichar) == 1)
514 if (unichar[0] >= '0' && unichar[0] <= '9')
515 break;
516
517 // if script_count is >= 2, character is ambiguous, skip other matches
518 // since they are useless.
519 if (script_count >= 2)
520 break;
521 }
522 // Character is non ambiguous
523 if (script_count == 1) {
524 // Update the score of the winning script
525 osr_->scripts_na[i][prev_id] += 1.0;
526
527 // Workaround for Fraktur
// When the winner is Latin and the first match's font reports is_fraktur(),
// the vote just added to Latin is moved to the Fraktur slot instead.
528 if (prev_id == latin_id_) {
// Only look up the font entry when a font id was recorded (>= 0).
529 if (prev_fontinfo_id >= 0) {
530 const tesseract::FontInfo &fi =
531 tess_->get_fontinfo_table().get(prev_fontinfo_id);
532 //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
533 // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
534 // fi.is_serif(), fi.is_fraktur(),
535 // prev_unichar);
536 if (fi.is_fraktur()) {
537 osr_->scripts_na[i][prev_id] -= 1.0;
538 osr_->scripts_na[i][fraktur_id_] += 1.0;
539 }
540 }
541 }
542
543 // Update Japanese / Korean pseudo-scripts
// Katakana and Hiragana each add a full vote to Japanese; Hangul adds a full
// vote to Korean. These are in addition to the vote for the script itself.
544 if (prev_id == katakana_id_)
545 osr_->scripts_na[i][japanese_id_] += 1.0;
546 if (prev_id == hiragana_id_)
547 osr_->scripts_na[i][japanese_id_] += 1.0;
548 if (prev_id == hangul_id_)
549 osr_->scripts_na[i][korean_id_] += 1.0;
// Han contributes a weighted amount to both pseudo-scripts, using the
// kHanRatioInKorean and kHanRatioInJapanese constants.
550 if (prev_id == han_id_) {
551 osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
552 osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
553 }
554 }
555 } // iterate over each orientation
556}
const float kNonAmbiguousMargin
Definition: osdetect.cpp:48
const float kHanRatioInJapanese
Definition: osdetect.cpp:46
const float kHanRatioInKorean
Definition: osdetect.cpp:45
const int kMaxNumberOfScripts
Definition: osdetect.h:38
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:78
bool is_fraktur() const
Definition: fontinfo.h:115
float certainty() const
Definition: ratngs.h:83
int script_id() const
Definition: ratngs.h:114
int16_t fontinfo_id() const
Definition: ratngs.h:86
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386

◆ must_stop()

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 558 of file osdetect.cpp.

// Refreshes the best-script result for `orientation`, then returns true when
// the resulting script confidence is greater than 1.
558 {
// Must run first: the comparison below reads best_result after this update.
559 osr_->update_best_script(orientation);
560 return osr_->best_result.sconfidence > 1;
561}
float sconfidence
Definition: osdetect.h:45
OSBestResult best_result
Definition: osdetect.h:81
void update_best_script(int orientation_id)
Definition: osdetect.cpp:89

The documentation for this class was generated from the following files: