tesseract 4.1.1
Loading...
Searching...
No Matches
applybox.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: applybox.cpp (Formerly applybox.c)
3 * Description: Re segment rows according to box file data
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1993, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include <cctype>
20#include <cerrno>
21#include <cstring>
22#include "allheaders.h"
23#include "boxread.h"
24#include "pageres.h"
25#include "unichar.h"
26#include "unicharset.h"
27#include "tesseractclass.h"
28#include "genericvector.h"
29
31const int kMaxGroupSize = 4;
34const double kMaxXHeightDeviationFraction = 0.125;
35
71namespace tesseract {
72
73#ifndef DISABLED_LEGACY_ENGINE
74static void clear_any_old_text(BLOCK_LIST *block_list) {
75 BLOCK_IT block_it(block_list);
76 for (block_it.mark_cycle_pt();
77 !block_it.cycled_list(); block_it.forward()) {
78 ROW_IT row_it(block_it.data()->row_list());
79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80 WERD_IT word_it(row_it.data()->word_list());
81 for (word_it.mark_cycle_pt();
82 !word_it.cycled_list(); word_it.forward()) {
83 word_it.data()->set_text("");
84 }
85 }
86 }
87}
88
89// Applies the box file based on the image name fname, and resegments
90// the words in the block_list (page), with:
91// blob-mode: one blob per line in the box file, words as input.
92// word/line-mode: one blob per space-delimited unit after the #, and one word
93// per line in the box file. (See comment above for box file format.)
94// If find_segmentation is true, (word/line mode) then the classifier is used
95// to re-segment words/lines to match the space-delimited truth string for
96// each box. In this case, the input box may be for a word or even a whole
97// text line, and the output words will contain multiple blobs corresponding
98// to the space-delimited input string.
99// With find_segmentation false, no classifier is needed, but the chopper
100// can still be used to correctly segment touching characters with the help
101// of the input boxes.
102// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
103// from normal classification, ie. with a word, chopped_word, rebuild_word,
104// seam_array, denorm, box_word, and best_state, but NO best_choice or
105// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
106// Instead, the correct_text member of WERD_RES is set, and this may be later
107// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
108// is not required before calling ApplyBoxTraining.
110 bool find_segmentation,
111 BLOCK_LIST *block_list) {
113 GenericVector<STRING> texts, full_texts;
114 if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
115 nullptr)) {
116 return nullptr; // Can't do it.
117 }
118
119 const int box_count = boxes.size();
120 int box_failures = 0;
121
122 // In word mode, we use the boxes to make a word for each box, but
123 // in blob mode we use the existing words and maximally chop them first.
124 PAGE_RES* page_res = find_segmentation ?
125 nullptr : SetupApplyBoxes(boxes, block_list);
126 clear_any_old_text(block_list);
127
128 for (int i = 0; i < box_count; i++) {
129 bool foundit = false;
130 if (page_res != nullptr) {
131 foundit = ResegmentCharBox(page_res,
132 (i == 0) ? nullptr : &boxes[i - 1],
133 boxes[i],
134 (i == box_count - 1) ? nullptr : &boxes[i + 1],
135 full_texts[i].string());
136 } else {
137 foundit = ResegmentWordBox(block_list, boxes[i],
138 (i == box_count - 1) ? nullptr : &boxes[i + 1],
139 texts[i].string());
140 }
141 if (!foundit) {
142 box_failures++;
143 ReportFailedBox(i, boxes[i], texts[i].string(),
144 "FAILURE! Couldn't find a matching blob");
145 }
146 }
147
148 if (page_res == nullptr) {
149 // In word/line mode, we now maximally chop all the words and resegment
150 // them with the classifier.
151 page_res = SetupApplyBoxes(boxes, block_list);
153 }
154 if (applybox_debug > 0) {
155 tprintf("APPLY_BOXES:\n");
156 tprintf(" Boxes read from boxfile: %6d\n", box_count);
157 if (box_failures > 0)
158 tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
159 }
160 TidyUp(page_res);
161 return page_res;
162}
163#endif // ndef DISABLED_LEGACY_ENGINE
164
165// Helper computes median xheight in the image.
166static double MedianXHeight(BLOCK_LIST *block_list) {
167 BLOCK_IT block_it(block_list);
168 STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
169 for (block_it.mark_cycle_pt();
170 !block_it.cycled_list(); block_it.forward()) {
171 ROW_IT row_it(block_it.data()->row_list());
172 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
173 xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
174 }
175 }
176 return xheights.median();
177}
178
181void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
182 const double median_xheight = MedianXHeight(block_list);
183 const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
184 // Strip all fuzzy space markers to simplify the PAGE_RES.
185 BLOCK_IT b_it(block_list);
186 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187 BLOCK* block = b_it.data();
188 ROW_IT r_it(block->row_list());
189 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190 ROW* row = r_it.data();
191 const double diff = fabs(row->x_height() - median_xheight);
192 if (diff > max_deviation) {
193 if (applybox_debug) {
194 tprintf("row xheight=%g, but median xheight = %g\n",
195 row->x_height(), median_xheight);
196 }
197 row->set_x_height(static_cast<float>(median_xheight));
198 }
199 }
200 }
201}
202
203#ifndef DISABLED_LEGACY_ENGINE
204
208 BLOCK_LIST *block_list) {
209 PreenXHeights(block_list);
210 // Strip all fuzzy space markers to simplify the PAGE_RES.
211 BLOCK_IT b_it(block_list);
212 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213 BLOCK* block = b_it.data();
214 ROW_IT r_it(block->row_list());
215 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216 ROW* row = r_it.data();
217 WERD_IT w_it(row->word_list());
218 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219 WERD* word = w_it.data();
220 if (word->cblob_list()->empty()) {
221 delete w_it.extract();
222 } else {
223 word->set_flag(W_FUZZY_SP, false);
224 word->set_flag(W_FUZZY_NON, false);
225 }
226 }
227 }
228 }
229 auto* page_res = new PAGE_RES(false, block_list, nullptr);
230 PAGE_RES_IT pr_it(page_res);
231 WERD_RES* word_res;
232 while ((word_res = pr_it.word()) != nullptr) {
233 MaximallyChopWord(boxes, pr_it.block()->block,
234 pr_it.row()->row, word_res);
235 pr_it.forward();
236 }
237 return page_res;
238}
239
244 BLOCK* block, ROW* row,
245 WERD_RES* word_res) {
246 if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
251 row, block)) {
252 word_res->CloneChoppedToRebuild();
253 return;
254 }
255 if (chop_debug) {
256 tprintf("Maximally chopping word at:");
257 word_res->word->bounding_box().print();
258 }
259 GenericVector<BLOB_CHOICE*> blob_choices;
260 ASSERT_HOST(!word_res->chopped_word->blobs.empty());
261 auto rating = static_cast<float>(INT8_MAX);
262 for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
263 // The rating and certainty are not quite arbitrary. Since
264 // select_blob_to_chop uses the worst certainty to choose, they all have
265 // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
266 // in here, and then divide by e each time they are chopped, which
267 // should guarantee a set of unequal values for the whole tree of blobs
268 // produced, however much chopping is required. The chops are thus only
269 // limited by the ability of the chopper to find suitable chop points,
270 // and not by the value of the certainties.
271 auto* choice =
272 new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
273 blob_choices.push_back(choice);
274 rating -= 0.125f;
275 }
276 const double e = exp(1.0); // The base of natural logs.
277 int blob_number;
278 int right_chop_index = 0;
280 // We only chop if the language is not fixed pitch like CJK.
281 SEAM* seam = nullptr;
282 while ((seam = chop_one_blob(boxes, blob_choices, word_res,
283 &blob_number)) != nullptr) {
284 word_res->InsertSeam(blob_number, seam);
285 BLOB_CHOICE* left_choice = blob_choices[blob_number];
286 rating = left_choice->rating() / e;
287 left_choice->set_rating(rating);
288 left_choice->set_certainty(-rating);
289 // combine confidence w/ serial #
290 auto* right_choice = new BLOB_CHOICE(++right_chop_index,
291 rating - 0.125f, -rating, -1,
292 0.0f, 0.0f, 0.0f, BCC_FAKE);
293 blob_choices.insert(right_choice, blob_number + 1);
294 }
295 }
296 word_res->CloneChoppedToRebuild();
297 word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
298}
299
311static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
312 const int overlap_area = box1.intersection(box2).area();
313 const int a = box1.area();
314 const int b = box2.area();
315 ASSERT_HOST(a != 0 && b != 0);
316 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
317}
318
329bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
330 const TBOX& box, const TBOX* next_box,
331 const char* correct_text) {
332 if (applybox_debug > 1) {
333 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
334 }
335 PAGE_RES_IT page_res_it(page_res);
336 WERD_RES* word_res;
337 for (word_res = page_res_it.word(); word_res != nullptr;
338 word_res = page_res_it.forward()) {
339 if (!word_res->box_word->bounding_box().major_overlap(box))
340 continue;
341 if (applybox_debug > 1) {
342 tprintf("Checking word box:");
343 word_res->box_word->bounding_box().print();
344 }
345 int word_len = word_res->box_word->length();
346 for (int i = 0; i < word_len; ++i) {
347 TBOX char_box = TBOX();
348 int blob_count = 0;
349 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
350 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
351 if (!blob_box.major_overlap(box))
352 break;
353 if (word_res->correct_text[i + blob_count].length() > 0)
354 break; // Blob is claimed already.
355 if (next_box != nullptr) {
356 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
357 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
358 if (applybox_debug > 2) {
359 tprintf("Checking blob:");
360 blob_box.print();
361 tprintf("Current miss metric = %g, next = %g\n",
362 current_box_miss_metric, next_box_miss_metric);
363 }
364 if (current_box_miss_metric > next_box_miss_metric)
365 break; // Blob is a better match for next box.
366 }
367 char_box += blob_box;
368 }
369 if (blob_count > 0) {
370 if (applybox_debug > 1) {
371 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
372 }
373 if (!char_box.almost_equal(box, 3) &&
374 ((next_box != nullptr && box.x_gap(*next_box) < -3)||
375 (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
376 return false;
377 }
378 // We refine just the box_word, best_state and correct_text here.
379 // The rebuild_word is made in TidyUp.
380 // blob_count blobs are put together to match the box. Merge the
381 // box_word boxes, save the blob_count in the state and the text.
382 word_res->box_word->MergeBoxes(i, i + blob_count);
383 word_res->best_state[i] = blob_count;
384 word_res->correct_text[i] = correct_text;
385 if (applybox_debug > 2) {
386 tprintf("%d Blobs match: blob box:", blob_count);
387 word_res->box_word->BlobBox(i).print();
388 tprintf("Matches box:");
389 box.print();
390 if (next_box != nullptr) {
391 tprintf("With next box:");
392 next_box->print();
393 }
394 }
395 // Eliminated best_state and correct_text entries for the consumed
396 // blobs.
397 for (int j = 1; j < blob_count; ++j) {
398 word_res->best_state.remove(i + 1);
399 word_res->correct_text.remove(i + 1);
400 }
401 // Assume that no box spans multiple source words, so we are done with
402 // this box.
403 if (applybox_debug > 1) {
404 tprintf("Best state = ");
405 for (int j = 0; j < word_res->best_state.size(); ++j) {
406 tprintf("%d ", word_res->best_state[j]);
407 }
408 tprintf("\n");
409 tprintf("Correct text = [[ ");
410 for (int j = 0; j < word_res->correct_text.size(); ++j) {
411 tprintf("%s ", word_res->correct_text[j].string());
412 }
413 tprintf("]]\n");
414 }
415 return true;
416 }
417 }
418 }
419 if (applybox_debug > 0) {
420 tprintf("FAIL!\n");
421 }
422 return false; // Failure.
423}
424
431bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
432 const TBOX& box, const TBOX* next_box,
433 const char* correct_text) {
434 if (applybox_debug > 1) {
435 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
436 }
437 WERD* new_word = nullptr;
438 BLOCK_IT b_it(block_list);
439 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
440 BLOCK* block = b_it.data();
441 if (!box.major_overlap(block->pdblk.bounding_box()))
442 continue;
443 ROW_IT r_it(block->row_list());
444 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
445 ROW* row = r_it.data();
446 if (!box.major_overlap(row->bounding_box()))
447 continue;
448 WERD_IT w_it(row->word_list());
449 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
450 WERD* word = w_it.data();
451 if (applybox_debug > 2) {
452 tprintf("Checking word:");
453 word->bounding_box().print();
454 }
455 if (word->text() != nullptr && word->text()[0] != '\0')
456 continue; // Ignore words that are already done.
457 if (!box.major_overlap(word->bounding_box()))
458 continue;
459 C_BLOB_IT blob_it(word->cblob_list());
460 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
461 blob_it.forward()) {
462 C_BLOB* blob = blob_it.data();
463 TBOX blob_box = blob->bounding_box();
464 if (!blob_box.major_overlap(box))
465 continue;
466 if (next_box != nullptr) {
467 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
468 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
469 if (applybox_debug > 2) {
470 tprintf("Checking blob:");
471 blob_box.print();
472 tprintf("Current miss metric = %g, next = %g\n",
473 current_box_miss_metric, next_box_miss_metric);
474 }
475 if (current_box_miss_metric > next_box_miss_metric)
476 continue; // Blob is a better match for next box.
477 }
478 if (applybox_debug > 2) {
479 tprintf("Blob match: blob:");
480 blob_box.print();
481 tprintf("Matches box:");
482 box.print();
483 if (next_box != nullptr) {
484 tprintf("With next box:");
485 next_box->print();
486 }
487 }
488 if (new_word == nullptr) {
489 // Make a new word with a single blob.
490 new_word = word->shallow_copy();
491 new_word->set_text(correct_text);
492 w_it.add_to_end(new_word);
493 }
494 C_BLOB_IT new_blob_it(new_word->cblob_list());
495 new_blob_it.add_to_end(blob_it.extract());
496 }
497 }
498 }
499 }
500 if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
501 return new_word != nullptr;
502}
503
507 PAGE_RES_IT pr_it(page_res);
508 WERD_RES* word_res;
509 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
510 const WERD* word = word_res->word;
511 if (word->text() == nullptr || word->text()[0] == '\0')
512 continue; // Ignore words that have no text.
513 // Convert the correct text to a vector of UNICHAR_ID
514 GenericVector<UNICHAR_ID> target_text;
515 if (!ConvertStringToUnichars(word->text(), &target_text)) {
516 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
517 word->text());
518 pr_it.DeleteCurrentWord();
519 continue;
520 }
521 if (!FindSegmentation(target_text, word_res)) {
522 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
523 word->text());
524 pr_it.DeleteCurrentWord();
525 continue;
526 }
527 }
528}
529
530#endif // ndef DISABLED_LEGACY_ENGINE
531
535 GenericVector<UNICHAR_ID>* class_ids) {
536 for (int step = 0; *utf8 != '\0'; utf8 += step) {
537 const char* next_space = strchr(utf8, ' ');
538 if (next_space == nullptr)
539 next_space = utf8 + strlen(utf8);
540 step = next_space - utf8;
541 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
542 if (class_id == INVALID_UNICHAR_ID) {
543 return false;
544 }
545 while (utf8[step] == ' ')
546 ++step;
547 class_ids->push_back(class_id);
548 }
549 return true;
550}
551
552#ifndef DISABLED_LEGACY_ENGINE
553
554
562 WERD_RES* word_res) {
563 // Classify all required combinations of blobs and save results in choices.
564 const int word_length = word_res->box_word->length();
565 auto* choices =
566 new GenericVector<BLOB_CHOICE_LIST*>[word_length];
567 for (int i = 0; i < word_length; ++i) {
568 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
569 BLOB_CHOICE_LIST* match_result = classify_piece(
570 word_res->seam_array, i, i + j - 1, "Applybox",
571 word_res->chopped_word, word_res->blamer_bundle);
572 if (applybox_debug > 2) {
573 tprintf("%d+%d:", i, j);
574 print_ratings_list("Segment:", match_result, unicharset);
575 }
576 choices[i].push_back(match_result);
577 }
578 }
579 // Search the segmentation graph for the target text. Must be an exact
580 // match. Using wildcards makes it difficult to find the correct
581 // segmentation even when it is there.
582 word_res->best_state.clear();
583 GenericVector<int> search_segmentation;
584 float best_rating = 0.0f;
585 SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
586 &search_segmentation, &best_rating, &word_res->best_state);
587 for (int i = 0; i < word_length; ++i)
588 choices[i].delete_data_pointers();
589 delete [] choices;
590 if (word_res->best_state.empty()) {
591 // Build the original segmentation and if it is the same length as the
592 // truth, assume it will do.
593 int blob_count = 1;
594 for (int s = 0; s < word_res->seam_array.size(); ++s) {
595 SEAM* seam = word_res->seam_array[s];
596 if (!seam->HasAnySplits()) {
597 word_res->best_state.push_back(blob_count);
598 blob_count = 1;
599 } else {
600 ++blob_count;
601 }
602 }
603 word_res->best_state.push_back(blob_count);
604 if (word_res->best_state.size() != target_text.size()) {
605 word_res->best_state.clear(); // No good. Original segmentation bad size.
606 return false;
607 }
608 }
609 word_res->correct_text.clear();
610 for (int i = 0; i < target_text.size(); ++i) {
611 word_res->correct_text.push_back(
612 STRING(unicharset.id_to_unichar(target_text[i])));
613 }
614 return true;
615}
616
632 int choices_pos, int choices_length,
633 const GenericVector<UNICHAR_ID>& target_text,
634 int text_index,
635 float rating, GenericVector<int>* segmentation,
636 float* best_rating,
637 GenericVector<int>* best_segmentation) {
639 for (int length = 1; length <= choices[choices_pos].size(); ++length) {
640 // Rating of matching choice or worst choice if no match.
641 float choice_rating = 0.0f;
642 // Find the corresponding best BLOB_CHOICE.
643 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
644 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
645 choice_it.forward()) {
646 const BLOB_CHOICE* choice = choice_it.data();
647 choice_rating = choice->rating();
648 UNICHAR_ID class_id = choice->unichar_id();
649 if (class_id == target_text[text_index]) {
650 break;
651 }
652 // Search ambigs table.
653 if (class_id < table.size() && table[class_id] != nullptr) {
654 AmbigSpec_IT spec_it(table[class_id]);
655 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
656 spec_it.forward()) {
657 const AmbigSpec *ambig_spec = spec_it.data();
658 // We'll only do 1-1.
659 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
660 ambig_spec->correct_ngram_id == target_text[text_index])
661 break;
662 }
663 if (!spec_it.cycled_list())
664 break; // Found an ambig.
665 }
666 }
667 if (choice_it.cycled_list())
668 continue; // No match.
669 segmentation->push_back(length);
670 if (choices_pos + length == choices_length &&
671 text_index + 1 == target_text.size()) {
672 // This is a complete match. If the rating is good record a new best.
673 if (applybox_debug > 2) {
674 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
675 rating + choice_rating, *best_rating, segmentation->size(),
676 best_segmentation->size());
677 }
678 if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
679 *best_segmentation = *segmentation;
680 *best_rating = rating + choice_rating;
681 }
682 } else if (choices_pos + length < choices_length &&
683 text_index + 1 < target_text.size()) {
684 if (applybox_debug > 3) {
685 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
686 target_text[text_index],
687 unicharset.id_to_unichar(target_text[text_index]),
688 choice_it.data()->unichar_id() == target_text[text_index]
689 ? "Match" : "Ambig",
690 choices_pos, length);
691 }
692 SearchForText(choices, choices_pos + length, choices_length, target_text,
693 text_index + 1, rating + choice_rating, segmentation,
694 best_rating, best_segmentation);
695 if (applybox_debug > 3) {
696 tprintf("End recursion for %d=%s\n", target_text[text_index],
697 unicharset.id_to_unichar(target_text[text_index]));
698 }
699 }
700 segmentation->truncate(segmentation->size() - 1);
701 }
702}
703
709 int ok_blob_count = 0;
710 int bad_blob_count = 0;
711 int ok_word_count = 0;
712 int unlabelled_words = 0;
713 PAGE_RES_IT pr_it(page_res);
714 WERD_RES* word_res;
715 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
716 int ok_in_word = 0;
717 int blob_count = word_res->correct_text.size();
718 auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
719 word_choice->set_permuter(TOP_CHOICE_PERM);
720 for (int c = 0; c < blob_count; ++c) {
721 if (word_res->correct_text[c].length() > 0) {
722 ++ok_in_word;
723 }
724 // Since we only need a fake word_res->best_choice, the actual
725 // unichar_ids do not matter. Which is fortunate, since TidyUp()
726 // can be called while training Tesseract, at the stage where
727 // unicharset is not meaningful yet.
728 word_choice->append_unichar_id_space_allocated(
729 INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
730 }
731 if (ok_in_word > 0) {
732 ok_blob_count += ok_in_word;
733 bad_blob_count += word_res->correct_text.size() - ok_in_word;
734 word_res->LogNewRawChoice(word_choice);
735 word_res->LogNewCookedChoice(1, false, word_choice);
736 } else {
737 ++unlabelled_words;
738 if (applybox_debug > 0) {
739 tprintf("APPLY_BOXES: Unlabelled word at :");
740 word_res->word->bounding_box().print();
741 }
742 pr_it.DeleteCurrentWord();
743 delete word_choice;
744 }
745 }
746 pr_it.restart_page();
747 for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
748 // Denormalize back to a BoxWord.
749 word_res->RebuildBestState();
750 word_res->SetupBoxWord();
751 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
752 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
753 }
754 if (applybox_debug > 0) {
755 tprintf(" Found %d good blobs.\n", ok_blob_count);
756 if (bad_blob_count > 0) {
757 tprintf(" Leaving %d unlabelled blobs in %d words.\n",
758 bad_blob_count, ok_word_count);
759 }
760 if (unlabelled_words > 0)
761 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
762 }
763}
764
765#endif // ndef DISABLED_LEGACY_ENGINE
766
768void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
769 const char *box_ch, const char *err_msg) {
770 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
771 boxfile_lineno + 1, box_ch,
772 box.left(), box.bottom(), box.right(), box.top(), err_msg);
773}
774
777 PAGE_RES_IT pr_it(page_res);
778 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
779 word_res = pr_it.forward()) {
780 auto* choice = new WERD_CHOICE(word_res->uch_set,
781 word_res->correct_text.size());
782 for (int i = 0; i < word_res->correct_text.size(); ++i) {
783 // The part before the first space is the real ground truth, and the
784 // rest is the bounding box location and page number.
786 word_res->correct_text[i].split(' ', &tokens);
787 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
788 choice->append_unichar_id_space_allocated(char_id,
789 word_res->best_state[i],
790 0.0f, 0.0f);
791 }
792 word_res->ClearWordChoices();
793 word_res->LogNewRawChoice(choice);
794 word_res->LogNewCookedChoice(1, false, choice);
795 }
796}
797
798#ifndef DISABLED_LEGACY_ENGINE
799
800
803void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
804 PAGE_RES_IT pr_it(page_res);
805 int word_count = 0;
806 for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
807 word_res = pr_it.forward()) {
808 LearnWord(fontname.string(), word_res);
809 ++word_count;
810 }
811 tprintf("Generated training data for %d words\n", word_count);
812}
813
814#endif // ndef DISABLED_LEGACY_ENGINE
815
816} // namespace tesseract
const int kMaxGroupSize
Definition: applybox.cpp:31
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:34
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:53
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
@ BCC_FAKE
Definition: ratngs.h:48
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_EOL
end of line
Definition: werd.h:33
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_BOL
start of line
Definition: werd.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:88
int IntCastRounded(double x)
Definition: helpers.h:175
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void remove(int index)
void insert(const T &t, int index)
int length() const
Definition: genericvector.h:86
void truncate(int size)
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:534
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:431
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:708
Pix * BestPix() const
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:329
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:631
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:506
Dict & getDict() override
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:803
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:207
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:181
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:561
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:776
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:243
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:768
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:109
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int length() const
Definition: boxword.h:83
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
const TBOX & bounding_box() const
Definition: boxword.h:80
Definition: ocrblock.h:31
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:116
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
Definition: ocrrow.h:37
WERD_LIST * word_list()
Definition: ocrrow.h:55
void set_x_height(float new_xheight)
Definition: ocrrow.h:67
TBOX bounding_box() const
Definition: ocrrow.h:88
float x_height() const
Definition: ocrrow.h:64
BLOCK * block
Definition: pageres.h:116
ROW * row
Definition: pageres.h:140
const UNICHARSET * uch_set
Definition: pageres.h:203
void CloneChoppedToRebuild()
Definition: pageres.cpp:835
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:877
BlamerBundle * blamer_bundle
Definition: pageres.h:252
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
tesseract::BoxWord * box_word
Definition: pageres.h:272
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
void SetupBoxWord()
Definition: pageres.cpp:849
GenericVector< int > best_state
Definition: pageres.h:285
TWERD * chopped_word
Definition: pageres.h:212
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
GenericVector< STRING > correct_text
Definition: pageres.h:289
void RebuildBestState()
Definition: pageres.cpp:808
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620
WERD * word
Definition: pageres.h:186
WERD_RES * word() const
Definition: pageres.h:754
ROW_RES * row() const
Definition: pageres.h:757
ROW_RES * prev_row() const
Definition: pageres.h:748
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
WERD_RES * forward()
Definition: pageres.h:734
ROW_RES * next_row() const
Definition: pageres.h:766
void DeleteCurrentWord()
Definition: pageres.cpp:1440
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
void set_rating(float newrat)
Definition: ratngs.h:144
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void set_certainty(float newrat)
Definition: ratngs.h:147
Definition: rect.h:34
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
int32_t area() const
Definition: rect.h:122
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
int x_gap(const TBOX &box) const
Definition: rect.h:225
int16_t right() const
Definition: rect.h:79
Definition: seam.h:38
bool HasAnySplits() const
Definition: seam.h:61
Definition: statistc.h:31
TBOX bounding_box() const
Definition: stepblob.cpp:253
Definition: werd.h:56
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
WERD * shallow_copy()
Definition: werd.cpp:334
void set_text(const char *new_text)
Definition: werd.h:115
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
const char * text() const
Definition: werd.h:114
TBOX bounding_box() const
Definition: werd.cpp:148
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:126
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:124
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
UNICHARSET unicharset
Definition: ccutil.h:73
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
bool classify_bln_numeric_mode
Definition: classify.h:508
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:371
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:225
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50