tesseract 4.1.1
Loading...
Searching...
No Matches
ratngs.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: ratngs.cpp (Formerly ratings.c)
3 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4 * Author: Ray Smith
5 * Created: Thu Apr 23 13:23:29 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20
21#ifdef HAVE_CONFIG_H
22#include "config_auto.h"
23#endif
24
25#include "ratngs.h"
26
27#include <algorithm>
28#include <string>
29#include "blobs.h"
30#include "callcpp.h"
31#include "genericvector.h"
32#include "matrix.h"
33#include "normalis.h" // kBlnBaselineOffset.
34#include "unicharset.h"
35
37
40
41const float WERD_CHOICE::kBadRating = 100000.0;
42// Min offset in baseline-normalized coords to make a character a subscript.
43const int kMinSubscriptOffset = 20;
44// Min offset in baseline-normalized coords to make a character a superscript.
45const int kMinSuperscriptOffset = 20;
46// Max y of bottom of a drop-cap blob.
47const int kMaxDropCapBottom = -128;
48// Max fraction of x-height to use as denominator in measuring x-height overlap.
49const double kMaxOverlapDenominator = 0.125;
50// Min fraction of x-height range that should be in agreement for matching
51// x-heights.
52const double kMinXHeightMatch = 0.5;
53// Max tolerance on baseline position as a fraction of x-height for matching
54// baselines.
55const double kMaxBaselineDrift = 0.0625;
56
// Printable labels for the permuter codes. Each label is kept as its own
// named array so that the lookup table below can simply list them in code
// order.
static const char kPermuterTypeNoPerm[] = "None";
static const char kPermuterTypePuncPerm[] = "Punctuation";
static const char kPermuterTypeTopPerm[] = "Top Choice";
static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
static const char kPermuterTypeNgramPerm[] = "Ngram";
static const char kPermuterTypeNumberPerm[] = "Number";
static const char kPermuterTypeUserPatPerm[] = "User Pattern";
static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static const char kPermuterTypeCompoundPerm[] = "Compound";

// Maps a permuter code (the array index) to its label. The table is indexed
// directly by permuter_name(), so the order of the entries matters.
// NOTE(review): the order presumably mirrors the permuter enum in ratngs.h
// (NO_PERM ... COMPOUND_PERM) -- confirm before adding or reordering entries.
static const char* const kPermuterTypeNames[] = {
    kPermuterTypeNoPerm,        //  0
    kPermuterTypePuncPerm,      //  1
    kPermuterTypeTopPerm,       //  2
    kPermuterTypeLowerPerm,     //  3
    kPermuterTypeUpperPerm,     //  4
    kPermuterTypeNgramPerm,     //  5
    kPermuterTypeNumberPerm,    //  6
    kPermuterTypeUserPatPerm,   //  7
    kPermuterTypeSysDawgPerm,   //  8
    kPermuterTypeDocDawgPerm,   //  9
    kPermuterTypeUserDawgPerm,  // 10
    kPermuterTypeFreqDawgPerm,  // 11
    kPermuterTypeCompoundPerm   // 12
};
86
92BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
93 float src_rating, // rating
94 float src_cert, // certainty
95 int src_script_id, // script
96 float min_xheight, // min xheight allowed
97 float max_xheight, // max xheight by this char
98 float yshift, // yshift out of position
99 BlobChoiceClassifier c) { // adapted match or other
100 unichar_id_ = src_unichar_id;
101 rating_ = src_rating;
102 certainty_ = src_cert;
103 fontinfo_id_ = -1;
104 fontinfo_id2_ = -1;
105 script_id_ = src_script_id;
106 min_xheight_ = min_xheight;
107 max_xheight_ = max_xheight;
108 yshift_ = yshift;
109 classifier_ = c;
110}
111
118 unichar_id_ = other.unichar_id();
119 rating_ = other.rating();
120 certainty_ = other.certainty();
121 fontinfo_id_ = other.fontinfo_id();
122 fontinfo_id2_ = other.fontinfo_id2();
123 script_id_ = other.script_id();
124 matrix_cell_ = other.matrix_cell_;
125 min_xheight_ = other.min_xheight_;
126 max_xheight_ = other.max_xheight_;
127 yshift_ = other.yshift();
128 classifier_ = other.classifier_;
129#ifndef DISABLED_LEGACY_ENGINE
130 fonts_ = other.fonts_;
131#endif // ndef DISABLED_LEGACY_ENGINE
132}
133
134// Copy assignment operator.
135BLOB_CHOICE& BLOB_CHOICE::operator=(const BLOB_CHOICE& other) {
137 unichar_id_ = other.unichar_id();
138 rating_ = other.rating();
139 certainty_ = other.certainty();
140 fontinfo_id_ = other.fontinfo_id();
141 fontinfo_id2_ = other.fontinfo_id2();
142 script_id_ = other.script_id();
143 matrix_cell_ = other.matrix_cell_;
144 min_xheight_ = other.min_xheight_;
145 max_xheight_ = other.max_xheight_;
146 yshift_ = other.yshift();
147 classifier_ = other.classifier_;
148#ifndef DISABLED_LEGACY_ENGINE
149 fonts_ = other.fonts_;
150#endif // ndef DISABLED_LEGACY_ENGINE
151 return *this;
152}
153
154// Returns true if *this and other agree on the baseline and x-height
155// to within some tolerance based on a given estimate of the x-height.
156bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
157 bool debug) const {
158 double baseline_diff = fabs(yshift() - other.yshift());
159 if (baseline_diff > kMaxBaselineDrift * x_height) {
160 if (debug) {
161 tprintf("Baseline diff %g for %d v %d\n",
162 baseline_diff, unichar_id_, other.unichar_id_);
163 }
164 return false;
165 }
166 double this_range = max_xheight() - min_xheight();
167 double other_range = other.max_xheight() - other.min_xheight();
168 double denominator = ClipToRange(std::min(this_range, other_range),
169 1.0, kMaxOverlapDenominator * x_height);
170 double overlap = std::min(max_xheight(), other.max_xheight()) -
171 std::max(min_xheight(), other.min_xheight());
172 overlap /= denominator;
173 if (debug) {
174 tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
175 unichar_id_, other.unichar_id_, baseline_diff,
176 this_range, other_range, denominator, overlap);
177 }
178
179 return overlap >= kMinXHeightMatch;
180}
181
182// Helper to find the BLOB_CHOICE in the bc_list that matches the given
183// unichar_id, or nullptr if there is no match.
185 BLOB_CHOICE_LIST* bc_list) {
186 // Find the corresponding best BLOB_CHOICE.
187 BLOB_CHOICE_IT choice_it(bc_list);
188 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
189 choice_it.forward()) {
190 BLOB_CHOICE* choice = choice_it.data();
191 if (choice->unichar_id() == char_id) {
192 return choice;
193 }
194 }
195 return nullptr;
196}
197
198const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
199 return kPermuterTypeNames[permuter];
200}
201
202namespace tesseract {
203
204const char *ScriptPosToString(enum ScriptPos script_pos) {
205 switch (script_pos) {
206 case SP_NORMAL: return "NORM";
207 case SP_SUBSCRIPT: return "SUB";
208 case SP_SUPERSCRIPT: return "SUPER";
209 case SP_DROPCAP: return "DROPC";
210 }
211 return "SP_UNKNOWN";
212}
213
214} // namespace tesseract.
215
222WERD_CHOICE::WERD_CHOICE(const char *src_string,
223 const UNICHARSET &unicharset)
224 : unicharset_(&unicharset){
226 GenericVector<char> lengths;
227 std::string cleaned = unicharset.CleanupString(src_string);
228 if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
229 nullptr)) {
230 lengths.push_back('\0');
231 STRING src_lengths = &lengths[0];
232 this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
233 } else { // There must have been an invalid unichar in the string.
234 this->init(8);
235 this->make_bad();
236 }
237}
238
249void WERD_CHOICE::init(const char *src_string,
250 const char *src_lengths,
251 float src_rating,
252 float src_certainty,
253 uint8_t src_permuter) {
254 int src_string_len = strlen(src_string);
255 if (src_string_len == 0) {
256 this->init(8);
257 } else {
258 this->init(src_lengths ? strlen(src_lengths): src_string_len);
259 length_ = reserved_;
260 int offset = 0;
261 for (int i = 0; i < length_; ++i) {
262 int unichar_length = src_lengths ? src_lengths[i] : 1;
263 unichar_ids_[i] =
264 unicharset_->unichar_to_id(src_string+offset, unichar_length);
265 state_[i] = 1;
266 certainties_[i] = src_certainty;
267 offset += unichar_length;
268 }
269 }
270 adjust_factor_ = 1.0f;
271 rating_ = src_rating;
272 certainty_ = src_certainty;
273 permuter_ = src_permuter;
274 dangerous_ambig_found_ = false;
275}
276
281 delete[] unichar_ids_;
282 delete[] script_pos_;
283 delete[] state_;
284 delete[] certainties_;
285}
286
287const char *WERD_CHOICE::permuter_name() const {
288 return kPermuterTypeNames[permuter_];
289}
290
291// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
292// taken from the appropriate cell in the ratings MATRIX.
293// Borrowed pointer, so do not delete.
294BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
295 MATRIX_COORD coord = MatrixCoord(index);
296 BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
297 if (result == nullptr) {
298 result = new BLOB_CHOICE_LIST;
299 ratings->put(coord.col, coord.row, result);
300 }
301 return result;
302}
303
304// Returns the MATRIX_COORD corresponding to the location in the ratings
305// MATRIX for the given index into the word.
307 int col = 0;
308 for (int i = 0; i < index; ++i)
309 col += state_[i];
310 int row = col + state_[index] - 1;
311 return MATRIX_COORD(col, row);
312}
313
314// Sets the entries for the given index from the BLOB_CHOICE, assuming
315// unit fragment lengths, but setting the state for this index to blob_count.
316void WERD_CHOICE::set_blob_choice(int index, int blob_count,
317 const BLOB_CHOICE* blob_choice) {
318 unichar_ids_[index] = blob_choice->unichar_id();
319 script_pos_[index] = tesseract::SP_NORMAL;
320 state_[index] = blob_count;
321 certainties_[index] = blob_choice->certainty();
322}
323
324
331 for (int i = 0; i < length_; ++i) {
332 if (unichar_ids_[i] == unichar_id) {
333 return true;
334 }
335 }
336 return false;
337}
338
346void WERD_CHOICE::remove_unichar_ids(int start, int num) {
347 ASSERT_HOST(start >= 0 && start + num <= length_);
348 // Accumulate the states to account for the merged blobs.
349 for (int i = 0; i < num; ++i) {
350 if (start > 0)
351 state_[start - 1] += state_[start + i];
352 else if (start + num < length_)
353 state_[start + num] += state_[start + i];
354 }
355 for (int i = start; i + num < length_; ++i) {
356 unichar_ids_[i] = unichar_ids_[i + num];
357 script_pos_[i] = script_pos_[i + num];
358 state_[i] = state_[i + num];
359 certainties_[i] = certainties_[i + num];
360 }
361 length_ -= num;
362}
363
370 for (int i = 0; i < length_ / 2; ++i) {
371 UNICHAR_ID tmp_id = unichar_ids_[i];
372 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
373 unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
374 }
375 if (length_ % 2 != 0) {
376 unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
377 }
378}
379
387void WERD_CHOICE::punct_stripped(int *start, int *end) const {
388 *start = 0;
389 *end = length() - 1;
390 while (*start < length() &&
391 unicharset()->get_ispunctuation(unichar_id(*start))) {
392 (*start)++;
393 }
394 while (*end > -1 &&
395 unicharset()->get_ispunctuation(unichar_id(*end))) {
396 (*end)--;
397 }
398 (*end)++;
399}
400
401void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
402 int end = length();
403 while (end > 0 &&
404 unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
406 end--;
407 }
408 int start = 0;
409 while (start < end &&
410 unicharset_->get_isdigit(unichar_ids_[start]) &&
412 start++;
413 }
414 *pstart = start;
415 *pend = end;
416}
417
418WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
419 ASSERT_HOST(start >= 0 && start <= length_);
420 ASSERT_HOST(end >= 0 && end <= length_);
421 if (end < start) { end = start; }
422 WERD_CHOICE retval(unicharset_, end - start);
423 for (int i = start; i < end; i++) {
425 unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
426 }
427 return retval;
428}
429
436 int i;
437 for (i = 0; i < length_; ++i) {
438 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
439 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
441 return true;
442 }
443 }
444 return false;
445}
446
454 STRING *word_lengths_str) const {
455 *word_str = "";
456 if (word_lengths_str != nullptr) *word_lengths_str = "";
457 for (int i = 0; i < length_; ++i) {
458 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
459 *word_str += ch;
460 if (word_lengths_str != nullptr) {
461 *word_lengths_str += strlen(ch);
462 }
463 }
464}
465
473 UNICHAR_ID unichar_id, int blob_count,
474 float rating, float certainty) {
475 if (length_ == reserved_) {
476 this->double_the_size();
477 }
478 this->append_unichar_id_space_allocated(unichar_id, blob_count,
480}
481
490 ASSERT_HOST(unicharset_ == second.unicharset_);
491 while (reserved_ < length_ + second.length()) {
492 this->double_the_size();
493 }
494 const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
495 for (int i = 0; i < second.length(); ++i) {
496 unichar_ids_[length_ + i] = other_unichar_ids[i];
497 state_[length_ + i] = second.state_[i];
498 certainties_[length_ + i] = second.certainties_[i];
499 script_pos_[length_ + i] = second.BlobPosition(i);
500 }
501 length_ += second.length();
502 if (second.adjust_factor_ > adjust_factor_)
503 adjust_factor_ = second.adjust_factor_;
504 rating_ += second.rating(); // add ratings
505 if (second.certainty() < certainty_) // take min
506 certainty_ = second.certainty();
507 if (second.dangerous_ambig_found_)
508 dangerous_ambig_found_ = true;
509 if (permuter_ == NO_PERM) {
510 permuter_ = second.permuter();
511 } else if (second.permuter() != NO_PERM &&
512 second.permuter() != permuter_) {
513 permuter_ = COMPOUND_PERM;
514 }
515 return *this;
516}
517
518
526 while (reserved_ < source.length()) {
527 this->double_the_size();
528 }
529
530 unicharset_ = source.unicharset_;
531 const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
532 for (int i = 0; i < source.length(); ++i) {
533 unichar_ids_[i] = other_unichar_ids[i];
534 state_[i] = source.state_[i];
535 certainties_[i] = source.certainties_[i];
536 script_pos_[i] = source.BlobPosition(i);
537 }
538 length_ = source.length();
539 adjust_factor_ = source.adjust_factor_;
540 rating_ = source.rating();
541 certainty_ = source.certainty();
542 min_x_height_ = source.min_x_height();
543 max_x_height_ = source.max_x_height();
544 permuter_ = source.permuter();
545 dangerous_ambig_found_ = source.dangerous_ambig_found_;
546 return *this;
547}
548
549// Sets up the script_pos_ member using the blobs_list to get the bln
550// bounding boxes, *this to get the unichars, and this->unicharset
551// to get the target positions. If small_caps is true, sub/super are not
552// considered, but dropcaps are.
553// NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
554void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word, int debug) {
555 // Initialize to normal.
556 for (int i = 0; i < length_; ++i)
557 script_pos_[i] = tesseract::SP_NORMAL;
558 if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
559 return;
560 }
561
562 int position_counts[4] = { 0, 0, 0, 0 };
563
564 int chunk_index = 0;
565 for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
566 TBLOB* tblob = word->blobs[chunk_index];
567 int uni_id = unichar_id(blob_index);
568 TBOX blob_box = tblob->bounding_box();
569 if (state_ != nullptr) {
570 for (int i = 1; i < state_[blob_index]; ++i) {
571 ++chunk_index;
572 tblob = word->blobs[chunk_index];
573 blob_box += tblob->bounding_box();
574 }
575 }
576 script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
577 uni_id);
578 if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
579 script_pos_[blob_index] = tesseract::SP_NORMAL;
580 }
581 position_counts[script_pos_[blob_index]]++;
582 }
583 // If almost everything looks like a superscript or subscript,
584 // we most likely just got the baseline wrong.
585 if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
586 position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
587 if (debug >= 2) {
588 tprintf("Most characters of %s are subscript or superscript.\n"
589 "That seems wrong, so I'll assume we got the baseline wrong\n",
590 unichar_string().string());
591 }
592 for (int i = 0; i < length_; i++) {
593 ScriptPos sp = script_pos_[i];
595 position_counts[sp]--;
596 position_counts[tesseract::SP_NORMAL]++;
597 script_pos_[i] = tesseract::SP_NORMAL;
598 }
599 }
600 }
601
602 if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
603 debug >= 2) {
604 tprintf("SetScriptPosition on %s\n", unichar_string().string());
605 int chunk_index = 0;
606 for (int blob_index = 0; blob_index < length_; ++blob_index) {
607 if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
608 TBLOB* tblob = word->blobs[chunk_index];
609 ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
610 unichar_id(blob_index));
611 }
612 chunk_index += state_ != nullptr ? state_[blob_index] : 1;
613 }
614 }
615}
616// Sets the script_pos_ member from some source positions with a given length.
618 int length) {
619 ASSERT_HOST(length == length_);
620 if (positions != script_pos_) {
621 delete [] script_pos_;
622 script_pos_ = new ScriptPos[length];
623 memcpy(script_pos_, positions, sizeof(positions[0]) * length);
624 }
625}
626// Sets all the script_pos_ positions to the given position.
628 for (int i = 0; i < length_; ++i)
629 script_pos_[i] = position;
630}
631
632/* static */
633ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
634 const UNICHARSET& unicharset,
635 const TBOX& blob_box,
636 UNICHAR_ID unichar_id) {
637 ScriptPos retval = tesseract::SP_NORMAL;
638 int top = blob_box.top();
639 int bottom = blob_box.bottom();
640 int min_bottom, max_bottom, min_top, max_top;
642 &min_bottom, &max_bottom,
643 &min_top, &max_top);
644
645 int sub_thresh_top = min_top - kMinSubscriptOffset;
646 int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
647 int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
648 if (bottom <= kMaxDropCapBottom) {
649 retval = tesseract::SP_DROPCAP;
650 } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
652 } else if (bottom > sup_thresh_bot) {
654 }
655
656 if (print_debug) {
657 const char *pos = ScriptPosToString(retval);
658 tprintf("%s Character %s[bot:%d top: %d] "
659 "bot_range[%d,%d] top_range[%d, %d] "
660 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
662 bottom, top,
663 min_bottom, max_bottom, min_top, max_top,
664 sub_thresh_bot, sub_thresh_top,
665 sup_thresh_bot);
666 }
667 return retval;
668}
669
670// Returns the script-id (eg Han) of the dominant script in the word.
672 int max_script = unicharset_->get_script_table_size();
673 int *sid = new int[max_script];
674 int x;
675 for (x = 0; x < max_script; x++) sid[x] = 0;
676 for (x = 0; x < length_; ++x) {
677 int script_id = unicharset_->get_script(unichar_id(x));
678 sid[script_id]++;
679 }
680 if (unicharset_->han_sid() != unicharset_->null_sid()) {
681 // Add the Hiragana & Katakana counts to Han and zero them out.
682 if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
683 sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
684 sid[unicharset_->hiragana_sid()] = 0;
685 }
686 if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
687 sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
688 sid[unicharset_->katakana_sid()] = 0;
689 }
690 }
691 // Note that high script ID overrides lower one on a tie, thus biasing
692 // towards non-Common script (if sorted that way in unicharset file).
693 int max_sid = 0;
694 for (x = 1; x < max_script; x++)
695 if (sid[x] >= sid[max_sid]) max_sid = x;
696 if (sid[max_sid] < length_ / 2)
697 max_sid = unicharset_->null_sid();
698 delete[] sid;
699 return max_sid;
700}
701
702// Fixes the state_ for a chop at the given blob_posiiton.
703void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
704 int total_chunks = 0;
705 for (int i = 0; i < length_; ++i) {
706 total_chunks += state_[i];
707 if (total_chunks > blob_position) {
708 ++state_[i];
709 return;
710 }
711 }
712}
713
714// Returns the sum of all the state elements, being the total number of blobs.
716 int total_chunks = 0;
717 for (int i = 0; i < length_; ++i) {
718 total_chunks += state_[i];
719 }
720 return total_chunks;
721}
722
728void WERD_CHOICE::print(const char *msg) const {
729 tprintf("%s : ", msg);
730 for (int i = 0; i < length_; ++i) {
731 tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
732 }
733 tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
734 rating_, certainty_, adjust_factor_, permuter_,
735 min_x_height_, max_x_height_, dangerous_ambig_found_);
736 tprintf("pos");
737 for (int i = 0; i < length_; ++i) {
738 tprintf("\t%s", ScriptPosToString(script_pos_[i]));
739 }
740 tprintf("\nstr");
741 for (int i = 0; i < length_; ++i) {
742 tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
743 }
744 tprintf("\nstate:");
745 for (int i = 0; i < length_; ++i) {
746 tprintf("\t%d ", state_[i]);
747 }
748 tprintf("\nC");
749 for (int i = 0; i < length_; ++i) {
750 tprintf("\t%.3f", certainties_[i]);
751 }
752 tprintf("\n");
753}
754
755// Prints the segmentation state with an introductory message.
756void WERD_CHOICE::print_state(const char *msg) const {
757 tprintf("%s", msg);
758 for (int i = 0; i < length_; ++i)
759 tprintf(" %d", state_[i]);
760 tprintf("\n");
761}
762
763// Displays the segmentation state of *this (if not the same as the last
764// one displayed) and waits for a click in the window.
766#ifndef GRAPHICS_DISABLED
767 // Number of different colors to draw with.
768 const int kNumColors = 6;
769 static ScrollView *segm_window = nullptr;
770 // Check the state against the static prev_drawn_state.
771 static GenericVector<int> prev_drawn_state;
772 bool already_done = prev_drawn_state.size() == length_;
773 if (!already_done) prev_drawn_state.init_to_size(length_, 0);
774 for (int i = 0; i < length_; ++i) {
775 if (prev_drawn_state[i] != state_[i]) {
776 already_done = false;
777 }
778 prev_drawn_state[i] = state_[i];
779 }
780 if (already_done || word->blobs.empty()) return;
781
782 // Create the window if needed.
783 if (segm_window == nullptr) {
784 segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
785 2000.0, 256.0, true);
786 } else {
787 segm_window->Clear();
788 }
789
790 TBOX bbox;
791 int blob_index = 0;
792 for (int c = 0; c < length_; ++c) {
793 auto color =
794 static_cast<ScrollView::Color>(c % kNumColors + 3);
795 for (int i = 0; i < state_[c]; ++i, ++blob_index) {
796 TBLOB* blob = word->blobs[blob_index];
797 bbox += blob->bounding_box();
798 blob->plot(segm_window, color, color);
799 }
800 }
801 segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
802 bbox.right(), bbox.bottom());
803 segm_window->Update();
804 window_wait(segm_window);
805#endif
806}
807
808
810 const WERD_CHOICE &word2) {
811 const UNICHARSET *uchset = word1.unicharset();
812 if (word2.unicharset() != uchset) return false;
813 int w1start, w1end;
814 word1.punct_stripped(&w1start, &w1end);
815 int w2start, w2end;
816 word2.punct_stripped(&w2start, &w2end);
817 if (w1end - w1start != w2end - w2start) return false;
818 for (int i = 0; i < w1end - w1start; i++) {
819 if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
820 uchset->to_lower(word2.unichar_id(w2start + i))) {
821 return false;
822 }
823 }
824 return true;
825}
826
837void print_ratings_list(const char *msg,
838 BLOB_CHOICE_LIST *ratings,
839 const UNICHARSET &current_unicharset) {
840 if (ratings->length() == 0) {
841 tprintf("%s:<none>\n", msg);
842 return;
843 }
844 if (*msg != '\0') {
845 tprintf("%s\n", msg);
846 }
847 BLOB_CHOICE_IT c_it;
848 c_it.set_to_list(ratings);
849 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
850 c_it.data()->print(&current_unicharset);
851 if (!c_it.at_last()) tprintf("\n");
852 }
853 tprintf("\n");
854 fflush(stdout);
855}
const int kBlnBaselineOffset
Definition: normalis.h:25
const int kMinSubscriptOffset
Definition: ratngs.cpp:43
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
const int kMinSuperscriptOffset
Definition: ratngs.cpp:45
const double kMaxBaselineDrift
Definition: ratngs.cpp:55
const double kMaxOverlapDenominator
Definition: ratngs.cpp:49
const double kMinXHeightMatch
Definition: ratngs.cpp:52
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:809
const int kMaxDropCapBottom
Definition: ratngs.cpp:47
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
@ COMPOUND_PERM
Definition: ratngs.h:245
@ NO_PERM
Definition: ratngs.h:233
BlobChoiceClassifier
Definition: ratngs.h:43
#define ELISTIZE(CLASSNAME)
Definition: elst.h:931
#define ASSERT_HOST(x)
Definition: errcode.h:88
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:108
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
@ SP_SUBSCRIPT
Definition: ratngs.h:254
@ SP_DROPCAP
Definition: ratngs.h:256
@ SP_NORMAL
Definition: ratngs.h:253
@ SP_SUPERSCRIPT
Definition: ratngs.h:255
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:204
void init_to_size(int size, const T &t)
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
T get(ICOORD pos) const
Definition: matrix.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
Definition: blobs.h:284
TBOX bounding_box() const
Definition: blobs.cpp:468
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510
Definition: blobs.h:418
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
Definition: matrix.h:578
float max_xheight() const
Definition: ratngs.h:123
float certainty() const
Definition: ratngs.h:83
float yshift() const
Definition: ratngs.h:126
int16_t fontinfo_id2() const
Definition: ratngs.h:89
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:156
int script_id() const
Definition: ratngs.h:114
int16_t fontinfo_id() const
Definition: ratngs.h:86
float rating() const
Definition: ratngs.h:80
BLOB_CHOICE()
Definition: ratngs.h:54
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
float min_xheight() const
Definition: ratngs.h:120
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:306
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:418
void init(int reserved)
Definition: ratngs.h:399
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:554
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:703
const STRING & unichar_string() const
Definition: ratngs.h:531
int TotalOfStates() const
Definition: ratngs.cpp:715
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:377
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:633
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
int GetTopScriptID() const
Definition: ratngs.cpp:671
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:525
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:387
void print_state(const char *msg) const
Definition: ratngs.cpp:756
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
uint8_t permuter() const
Definition: ratngs.h:336
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:330
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
const char * permuter_name() const
Definition: ratngs.cpp:287
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:489
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:765
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:627
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:294
static const float kBadRating
Definition: ratngs.h:265
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:435
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:346
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:316
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:369
float min_x_height() const
Definition: ratngs.h:326
float certainty() const
Definition: ratngs.h:320
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:268
int length() const
Definition: ratngs.h:293
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:312
float max_x_height() const
Definition: ratngs.h:329
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:302
~WERD_CHOICE()
Definition: ratngs.cpp:280
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:401
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void print() const
Definition: ratngs.h:570
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
void operator=(const ELIST_LINK &)
Definition: elst.h:94
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
int hiragana_sid() const
Definition: unicharset.h:890
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
int katakana_sid() const
Definition: unicharset.h:891
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
int han_sid() const
Definition: unicharset.h:889
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
int null_sid() const
Definition: unicharset.h:884
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
int get_script_table_size() const
Definition: unicharset.h:849
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
static void Update()
Definition: scrollview.cpp:709
void Clear()
Definition: scrollview.cpp:589
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757