tesseract 4.1.1
Loading...
Searching...
No Matches
recogtraining.cpp
Go to the documentation of this file.
1
2// File: recogtraining.cpp
3// Description: Functions for ambiguity and parameter training.
4// Author: Daria Antonova
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include "tesseractclass.h"
20
21#include "boxread.h"
22#include "control.h"
23#include "host.h" // for NearlyEqual
24#include "ratngs.h"
25#ifndef DISABLED_LEGACY_ENGINE
26#include "reject.h"
27#endif
28#include "stopper.h"
29
30namespace tesseract {
31
32const int16_t kMaxBoxEdgeDiff = 2;
33
34// Sets flags necessary for recognition in the training mode.
35// Opens and returns the pointer to the output file.
38 tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39 tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40 // Explore all segmentations.
42 }
43
44 STRING output_fname = fname;
45 const char* lastdot = strrchr(output_fname.string(), '.');
46 if (lastdot != nullptr)
47 output_fname[lastdot - output_fname.string()] = '\0';
48 output_fname += ".txt";
49 FILE* output_file = fopen(output_fname.string(), "a+");
50 if (output_file == nullptr) {
51 tprintf("Error: Could not open file %s\n", output_fname.string());
52 ASSERT_HOST(output_file);
53 }
54 return output_file;
55}
56
57// Copies the bounding box from page_res_it->word() to the given TBOX.
58static bool read_t(PAGE_RES_IT* page_res_it, TBOX* tbox) {
59 while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
60 page_res_it->forward();
61
62 if (page_res_it->word() != nullptr) {
63 *tbox = page_res_it->word()->word->bounding_box();
64
65 // If tbox->left() is negative, the training image has vertical text and
66 // all the coordinates of bounding boxes of page_res are rotated by 90
67 // degrees in a counterclockwise direction. We need to rotate the TBOX back
68 // in order to compare with the TBOXes of box files.
69 if (tbox->left() < 0) {
70 tbox->rotate(FCOORD(0.0, -1.0));
71 }
72
73 return true;
74 } else {
75 return false;
76 }
77}
78
79// This function takes a tif/box pair of files and runs recognition on the
80// image, while making sure that the word bounds that tesseract identified
81// roughly match those specified by the input box file. For each word (ngram
82// in a single bounding box from the input box file) it outputs the OCRed
83// result, the correct label, rating and certainty.
85 PAGE_RES* page_res,
86 volatile ETEXT_DESC* monitor,
87 FILE* output_file) {
88 STRING box_fname = fname;
89 const char* lastdot = strrchr(box_fname.string(), '.');
90 if (lastdot != nullptr)
91 box_fname[lastdot - box_fname.string()] = '\0';
92 box_fname += ".box";
93 // ReadNextBox() will close box_file
94 FILE* box_file = fopen(box_fname.string(), "r");
95 if (box_file == nullptr) {
96 tprintf("Error: Could not open file %s\n", box_fname.string());
97 ASSERT_HOST(box_file);
98 }
99
100 PAGE_RES_IT page_res_it;
101 page_res_it.page_res = page_res;
102 page_res_it.restart_page();
103 STRING label;
104
105 // Process all the words on this page.
106 TBOX tbox; // tesseract-identified box
107 TBOX bbox; // box from the box file
108 bool keep_going;
109 int line_number = 0;
110 int examined_words = 0;
111 do {
112 keep_going = read_t(&page_res_it, &tbox);
113 keep_going &=
114 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
115 // Align bottom left points of the TBOXes.
116 while (keep_going &&
117 !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
118 if (bbox.bottom() < tbox.bottom()) {
119 page_res_it.forward();
120 keep_going = read_t(&page_res_it, &tbox);
121 } else {
122 keep_going =
123 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
124 }
125 }
126 while (keep_going &&
127 !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
128 if (bbox.left() > tbox.left()) {
129 page_res_it.forward();
130 keep_going = read_t(&page_res_it, &tbox);
131 } else {
132 keep_going =
133 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
134 }
135 }
136 // OCR the word if top right points of the TBOXes are similar.
137 if (keep_going &&
138 NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
139 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
140 ambigs_classify_and_output(label.string(), &page_res_it, output_file);
141 examined_words++;
142 }
143 page_res_it.forward();
144 } while (keep_going);
145
146 // Set up scripts on all of the words that did not get sent to
147 // ambigs_classify_and_output. They all should have, but if all the
148 // werd_res's don't get uch_sets, tesseract will crash when you try
149 // to iterate over them. :-(
150 int total_words = 0;
151 for (page_res_it.restart_page(); page_res_it.block() != nullptr;
152 page_res_it.forward()) {
153 if (page_res_it.word()) {
154 if (page_res_it.word()->uch_set == nullptr)
155 page_res_it.word()->SetupFake(unicharset);
156 total_words++;
157 }
158 }
159 if (examined_words < 0.85 * total_words) {
160 tprintf(
161 "TODO(antonova): clean up recog_training_segmented; "
162 " It examined only a small fraction of the ambigs image.\n");
163 }
164 tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
165 total_words);
166}
167
168// Helper prints the given set of blob choices.
169static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
170 const UNICHARSET& unicharset, const char* label,
171 FILE* output_file) {
172 float rating = 0.0f;
173 float certainty = 0.0f;
174 for (int i = 0; i < length; ++i) {
175 const BLOB_CHOICE* blob_choice = blob_choices[i];
176 fprintf(output_file, "%s",
177 unicharset.id_to_unichar(blob_choice->unichar_id()));
178 rating += blob_choice->rating();
179 if (certainty > blob_choice->certainty())
180 certainty = blob_choice->certainty();
181 }
182 fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
183}
184
185// Helper recursively prints all paths through the ratings matrix, starting
186// at column col.
187static void PrintMatrixPaths(int col, int dim, const MATRIX& ratings,
188 int length, const BLOB_CHOICE** blob_choices,
189 const UNICHARSET& unicharset, const char* label,
190 FILE* output_file) {
191 for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
192 if (ratings.get(col, row) != NOT_CLASSIFIED) {
193 BLOB_CHOICE_IT bc_it(ratings.get(col, row));
194 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
195 blob_choices[length] = bc_it.data();
196 if (row + 1 < dim) {
197 PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
198 unicharset, label, output_file);
199 } else {
200 PrintPath(length + 1, blob_choices, unicharset, label, output_file);
201 }
202 }
203 }
204 }
205}
206
207// Runs pass-1 classification (classify_word_and_language) on the current
208// word, then writes every path through the word's ratings matrix to
209// output_file together with the label. Writes nothing if the label cannot
210// be encoded with the unicharset.
212 PAGE_RES_IT* pr_it,
213 FILE* output_file) {
214 // Classify word.
215 fflush(stdout);
216 WordData word_data(*pr_it);
217 SetupWordPassN(1, &word_data);
218 classify_word_and_language(1, pr_it, &word_data);
219 WERD_RES* werd_res = word_data.word;
220 WERD_CHOICE* best_choice = werd_res->best_choice;
221 ASSERT_HOST(best_choice != nullptr);
222
223 // Compute the number of unichars in the label.
225 if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
226 tprintf("Not outputting illegal unichar %s\n", label);
227 return;
228 }
229
230 // Dump all paths through the ratings matrix (which is normally small).
231 int dim = werd_res->ratings->dimension();
232 const auto** blob_choices = new const BLOB_CHOICE*[dim];
233 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
234 label, output_file);
235 delete[] blob_choices;
236}
237
238} // namespace tesseract
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:127
#define NOT_CLASSIFIED
Definition: matrix.h:44
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
const int16_t kMaxBoxEdgeDiff
T get(ICOORD pos) const
Definition: matrix.h:231
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1319
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
FILE * init_recog_training(const STRING &fname)
Dict & getDict() override
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
int bandwidth() const
Definition: matrix.h:538
int dimension() const
Definition: matrix.h:536
Definition: matrix.h:578
const UNICHARSET * uch_set
Definition: pageres.h:203
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352
WERD_CHOICE * best_choice
Definition: pageres.h:241
MATRIX * ratings
Definition: pageres.h:237
WERD * word
Definition: pageres.h:186
WERD_RES * word() const
Definition: pageres.h:754
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
PAGE_RES * page_res
Definition: pageres.h:677
WERD_RES * forward()
Definition: pageres.h:734
Definition: points.h:189
float certainty() const
Definition: ratngs.h:83
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
Definition: rect.h:34
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
TBOX bounding_box() const
Definition: werd.cpp:148
UNICHARSET unicharset
Definition: ccutil.h:73
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
bool stopper_no_acceptable_choices
Definition: dict.h:641