tesseract 4.1.1
Loading...
Searching...
No Matches
linerec.cpp
Go to the documentation of this file.
1
// File: linerec.cpp
// Description: Top-level line-based recognition module for Tesseract.
// Author: Ray Smith
// Created: Thu May 02 09:47:06 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
18
19#include "tesseractclass.h"
20
21#include "allheaders.h"
22#include "boxread.h"
23#include "imagedata.h"
24#ifndef ANDROID_BUILD
25#include "lstmrecognizer.h"
26#include "recodebeam.h"
27#endif
28#include "pageres.h"
29#include "tprintf.h"
30
31#include <algorithm>
32
33namespace tesseract {
34
// Scale factor to make certainty more comparable to Tesseract.
// SearchWords multiplies the LSTM word certainty by this factor before
// storing it on the best choice.
const float kCertaintyScale = 7.0f;
// Worst acceptable certainty for a dictionary word, expressed on the scaled
// (Tesseract-comparable) certainty range.
const float kWorstDictCertainty = -25.0f;
39
40// Generates training data for training a line recognizer, eg LSTM.
41// Breaks the page into lines, according to the boxes, and writes them to a
42// serialized DocumentData based on output_basename.
43// Return true if successful, false if an error occurred.
44bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
45 const STRING& output_basename,
46 BLOCK_LIST *block_list) {
47 STRING lstmf_name = output_basename + ".lstmf";
48 DocumentData images(lstmf_name);
49 if (applybox_page > 0) {
50 // Load existing document for the previous pages.
51 if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
52 tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
53 return false;
54 }
55 }
58 // Get the boxes for this page, if there are any.
59 if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
60 nullptr) ||
61 boxes.empty()) {
62 tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
63 return false;
64 }
65 TrainFromBoxes(boxes, texts, block_list, &images);
66 if (images.PagesSize() == 0) {
67 tprintf("Failed to read pages from %s\n", input_imagename.c_str());
68 return false;
69 }
70 images.Shuffle();
71 if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
72 tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
73 return false;
74 }
75 return true;
76}
77
78// Generates training data for training a line recognizer, eg LSTM.
79// Breaks the boxes into lines, normalizes them, converts to ImageData and
80// appends them to the given training_data.
82 const GenericVector<STRING>& texts,
83 BLOCK_LIST *block_list,
84 DocumentData* training_data) {
85 int box_count = boxes.size();
86 // Process all the text lines in this page, as defined by the boxes.
87 int end_box = 0;
88 // Don't let \t, which marks newlines in the box file, get into the line
89 // content, as that makes the line unusable in training.
90 while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
91 for (int start_box = end_box; start_box < box_count; start_box = end_box) {
92 // Find the textline of boxes starting at start and their bounding box.
93 TBOX line_box = boxes[start_box];
94 STRING line_str = texts[start_box];
95 for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
96 ++end_box) {
97 line_box += boxes[end_box];
98 line_str += texts[end_box];
99 }
100 // Find the most overlapping block.
101 BLOCK* best_block = nullptr;
102 int best_overlap = 0;
103 BLOCK_IT b_it(block_list);
104 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
105 BLOCK* block = b_it.data();
106 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
107 continue; // Not a text block.
108 TBOX block_box = block->pdblk.bounding_box();
109 block_box.rotate(block->re_rotation());
110 if (block_box.major_overlap(line_box)) {
111 TBOX overlap_box = line_box.intersection(block_box);
112 if (overlap_box.area() > best_overlap) {
113 best_overlap = overlap_box.area();
114 best_block = block;
115 }
116 }
117 }
118 ImageData* imagedata = nullptr;
119 if (best_block == nullptr) {
120 tprintf("No block overlapping textline: %s\n", line_str.string());
121 } else {
122 imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
123 *best_block);
124 }
125 if (imagedata != nullptr)
126 training_data->AddPageToDocument(imagedata);
127 // Don't let \t, which marks newlines in the box file, get into the line
128 // content, as that makes the line unusable in training.
129 while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
130 }
131}
132
133// Returns an Imagedata containing the image of the given box,
134// and ground truth boxes/truth text if available in the input.
135// The image is not normalized in any way.
137 const GenericVector<TBOX>& boxes,
138 const GenericVector<STRING>& texts,
139 int start_box, int end_box,
140 const BLOCK& block) {
141 TBOX revised_box;
142 ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
143 &revised_box);
144 if (image_data == nullptr) return nullptr;
145 image_data->set_page_number(applybox_page);
146 // Copy the boxes and shift them so they are relative to the image.
147 FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
148 ICOORD shift = -revised_box.botleft();
149 GenericVector<TBOX> line_boxes;
150 GenericVector<STRING> line_texts;
151 for (int b = start_box; b < end_box; ++b) {
152 TBOX box = boxes[b];
153 box.rotate(block_rotation);
154 box.move(shift);
155 line_boxes.push_back(box);
156 line_texts.push_back(texts[b]);
157 }
158 GenericVector<int> page_numbers;
159 page_numbers.init_to_size(line_boxes.size(), applybox_page);
160 image_data->AddBoxes(line_boxes, line_texts, page_numbers);
161 return image_data;
162}
163
164// Helper gets the image of a rectangle, using the block.re_rotation() if
165// needed to get to the image, and rotating the result back to horizontal
166// layout. (CJK characters will be on their left sides) The vertical text flag
167// is set in the returned ImageData if the text was originally vertical, which
168// can be used to invoke a different CJK recognition engine. The revised_box
169// is also returned to enable calculation of output bounding boxes.
170ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
171 int padding, TBOX* revised_box) const {
172 TBOX wbox = box;
173 wbox.pad(padding, padding);
174 *revised_box = wbox;
175 // Number of clockwise 90 degree rotations needed to get back to tesseract
176 // coords from the clipped image.
177 int num_rotations = 0;
178 if (block.re_rotation().y() > 0.0f)
179 num_rotations = 1;
180 else if (block.re_rotation().x() < 0.0f)
181 num_rotations = 2;
182 else if (block.re_rotation().y() < 0.0f)
183 num_rotations = 3;
184 // Handle two cases automatically: 1 the box came from the block, 2 the box
185 // came from a box file, and refers to the image, which the block may not.
186 if (block.pdblk.bounding_box().major_overlap(*revised_box))
187 revised_box->rotate(block.re_rotation());
188 // Now revised_box always refers to the image.
189 // BestPix is never colormapped, but may be of any depth.
190 Pix* pix = BestPix();
191 int width = pixGetWidth(pix);
192 int height = pixGetHeight(pix);
193 TBOX image_box(0, 0, width, height);
194 // Clip to image bounds;
195 *revised_box &= image_box;
196 if (revised_box->null_box()) return nullptr;
197 Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
198 revised_box->width(), revised_box->height());
199 Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
200 if (box_pix == nullptr) return nullptr;
201 boxDestroy(&clip_box);
202 if (num_rotations > 0) {
203 Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
204 pixDestroy(&box_pix);
205 box_pix = rot_pix;
206 }
207 // Convert sub-8-bit images to 8 bit.
208 int depth = pixGetDepth(box_pix);
209 if (depth < 8) {
210 Pix* grey;
211 grey = pixConvertTo8(box_pix, false);
212 pixDestroy(&box_pix);
213 box_pix = grey;
214 }
215 bool vertical_text = false;
216 if (num_rotations > 0) {
217 // Rotated the clipped revised box back to internal coordinates.
218 FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
219 revised_box->rotate(rotation);
220 if (num_rotations != 2)
221 vertical_text = true;
222 }
223 return new ImageData(vertical_text, box_pix);
224}
225
226#ifndef ANDROID_BUILD
227// Recognizes a word or group of words, converting to WERD_RES in *words.
228// Analogous to classify_word_pass1, but can handle a group of words as well.
229void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
231 TBOX word_box = word->word->bounding_box();
232 // Get the word image - no frills.
235 // In single word mode, use the whole image without any other row/word
236 // interpretation.
237 word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238 } else {
239 float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240 if (baseline + row->descenders() < word_box.bottom())
241 word_box.set_bottom(baseline + row->descenders());
242 if (baseline + row->x_height() + row->ascenders() > word_box.top())
243 word_box.set_top(baseline + row->x_height() + row->ascenders());
244 }
245 ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
246 if (im_data == nullptr) return;
247
248 bool do_invert = tessedit_do_invert;
249 lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
251 word_box, words, lstm_choice_mode);
252 delete im_data;
253 SearchWords(words);
254}
255
256// Apply segmentation search to the given set of words, within the constraints
257// of the existing ratings matrix. If there is already a best_choice on a word
258// leaves it untouched and just sets the done/accepted etc flags.
260 // Run the segmentation search on the network outputs and make a BoxWord
261 // for each of the output words.
262 // If we drop a word as junk, then there is always a space in front of the
263 // next.
264 const Dict* stopper_dict = lstm_recognizer_->GetDict();
265 if (stopper_dict == nullptr) stopper_dict = &getDict();
266 bool any_nonspace_delimited = false;
267 for (int w = 0; w < words->size(); ++w) {
268 WERD_RES* word = (*words)[w];
269 if (word->best_choice != nullptr &&
271 any_nonspace_delimited = true;
272 break;
273 }
274 }
275 for (int w = 0; w < words->size(); ++w) {
276 WERD_RES* word = (*words)[w];
277 if (word->best_choice == nullptr) {
278 // It is a dud.
279 word->SetupFake(lstm_recognizer_->GetUnicharset());
280 } else {
281 // Set the best state.
282 for (int i = 0; i < word->best_choice->length(); ++i) {
283 int length = word->best_choice->state(i);
284 word->best_state.push_back(length);
285 }
286 word->reject_map.initialise(word->best_choice->length());
287 word->tess_failed = false;
288 word->tess_accepted = true;
289 word->tess_would_adapt = false;
290 word->done = true;
291 word->tesseract = this;
292 float word_certainty = std::min(word->space_certainty,
293 word->best_choice->certainty());
294 word_certainty *= kCertaintyScale;
295 if (getDict().stopper_debug_level >= 1) {
296 tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
297 word->best_choice->certainty(), word->space_certainty,
298 std::min(word->space_certainty, word->best_choice->certainty()) *
300 word_certainty);
301 word->best_choice->print();
302 }
303 word->best_choice->set_certainty(word_certainty);
304
305 word->tess_accepted = stopper_dict->AcceptableResult(word);
306 }
307 }
308}
309#endif // ANDROID_BUILD
310
311} // namespace tesseract.
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
@ baseline
Definition: mfoutline.h:63
const float kWorstDictCertainty
Definition: linerec.cpp:38
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:174
@ PSM_RAW_LINE
hacks that are Tesseract-specific.
Definition: publictypes.h:179
const float kCertaintyScale
Definition: linerec.cpp:36
const int kImagePadding
Definition: imagedata.h:39
void init_to_size(int size, const T &t)
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:229
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:136
Pix * BestPix() const
Dict & getDict() override
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:81
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:170
bool TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:44
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:259
void set_page_number(int num)
Definition: imagedata.h:135
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:315
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:417
size_t PagesSize() const
Definition: imagedata.h:237
bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:398
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:435
Definition: ocrblock.h:31
FCOORD re_rotation() const
Definition: ocrblock.h:134
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
Definition: ocrrow.h:37
float descenders() const
Definition: ocrrow.h:85
float base_line(float xpos) const
Definition: ocrrow.h:59
float ascenders() const
Definition: ocrrow.h:82
float x_height() const
Definition: ocrrow.h:64
bool tess_would_adapt
Definition: pageres.h:304
bool done
Definition: pageres.h:305
tesseract::Tesseract * tesseract
Definition: pageres.h:280
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352
float space_certainty
Definition: pageres.h:321
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool tess_failed
Definition: pageres.h:295
bool tess_accepted
Definition: pageres.h:303
GenericVector< int > best_state
Definition: pageres.h:285
REJMAP reject_map
Definition: pageres.h:294
WERD * word
Definition: pageres.h:186
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
integer coordinate
Definition: points.h:32
Definition: points.h:189
float y() const
Definition: points.h:210
float x() const
Definition: points.h:207
bool IsText() const
Definition: polyblk.h:49
void set_certainty(float new_val)
Definition: ratngs.h:362
int state(int index) const
Definition: ratngs.h:309
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:504
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void print() const
Definition: ratngs.h:570
Definition: rect.h:34
const ICOORD & botleft() const
Definition: rect.h:92
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t top() const
Definition: rect.h:58
void move(const ICOORD vec)
Definition: rect.h:157
void set_bottom(int y)
Definition: rect.h:68
int16_t width() const
Definition: rect.h:115
int32_t area() const
Definition: rect.h:122
int16_t height() const
Definition: rect.h:108
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
void set_top(int y)
Definition: rect.h:61
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void pad(int xpad, int ypad)
Definition: rect.h:131
bool null_box() const
Definition: rect.h:50
int16_t right() const
Definition: rect.h:79
void initialise(int16_t length)
Definition: rejctmap.cpp:273
TBOX bounding_box() const
Definition: werd.cpp:148
Definition: strngs.h:45
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:102
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
const Dict * GetDict() const
const UNICHARSET & GetUnicharset() const