tesseract 4.1.1
Loading...
Searching...
No Matches
output.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: output.cpp (Formerly output.c)
3 * Description: Output pass
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1994, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include <cctype>
20#include <cerrno>
21#include <cstring>
22#include "control.h"
23#include "helpers.h"
24#include "output.h"
25#include "tesseractclass.h"
26#include "tessvars.h"
27#ifndef DISABLED_LEGACY_ENGINE
28#include "docqual.h"
29#include "reject.h"
30#endif
31
32#define CTRL_NEWLINE '\012' //newline
33#define CTRL_HARDLINE '\015' //cr
34
35namespace tesseract {
36void Tesseract::output_pass( //Tess output pass //send to api
37 PAGE_RES_IT &page_res_it,
38 const TBOX *target_word_box) {
39 BLOCK_RES *block_of_last_word;
40 bool force_eol; //During output
41 BLOCK *nextblock; //block of next word
42 WERD *nextword; //next word
43
44 page_res_it.restart_page ();
45 block_of_last_word = nullptr;
46 while (page_res_it.word () != nullptr) {
47 check_debug_pt (page_res_it.word (), 120);
48
49 if (target_word_box) {
50 TBOX current_word_box = page_res_it.word()->word->bounding_box();
51 FCOORD center_pt(
52 (current_word_box.right() + current_word_box.left()) / 2,
53 (current_word_box.bottom() + current_word_box.top()) / 2);
54 if (!target_word_box->contains(center_pt)) {
55 page_res_it.forward();
56 continue;
57 }
58 }
60 block_of_last_word != page_res_it.block ()) {
61 block_of_last_word = page_res_it.block ();
62 }
63
65 (page_res_it.block () != page_res_it.next_block ())) ||
66 (page_res_it.next_word () == nullptr);
67
68 if (page_res_it.next_word () != nullptr)
69 nextword = page_res_it.next_word ()->word;
70 else
71 nextword = nullptr;
72 if (page_res_it.next_block () != nullptr)
73 nextblock = page_res_it.next_block ()->block;
74 else
75 nextblock = nullptr;
76 //regardless of tilde crunching
77 write_results(page_res_it,
78 determine_newline_type(page_res_it.word()->word,
79 page_res_it.block()->block,
80 nextword, nextblock), force_eol);
81 page_res_it.forward();
82 }
83}
84
85
86/*************************************************************************
87 * write_results()
88 *
89 * All recognition and rejection has now been done. Generate the following:
90 * .txt file - giving the final best choices with NO highlighting
91 * .raw file - giving the tesseract top choice output for each word
92 * .map file - showing how the .txt file has been rejected in the .ep file
93 * epchoice list - a list of one element per word, containing the text for the
94 * epaper. Reject strings are inserted.
95 * inset list - a list of bounding boxes of reject insets - indexed by the
96 * reject strings in the epchoice text.
97 *************************************************************************/
99 char newline_type, // type of newline
100 bool force_eol) { // override tilde crunch?
101 WERD_RES *word = page_res_it.word();
102 const UNICHARSET &uchset = *word->uch_set;
103 int i;
104 bool need_reject = false;
105 UNICHAR_ID space = uchset.unichar_to_id(" ");
106
107 if ((word->unlv_crunch_mode != CR_NONE ||
108 word->best_choice->length() == 0) &&
110 if ((word->unlv_crunch_mode != CR_DELETE) &&
111 (!stats_.tilde_crunch_written ||
112 ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
113 (word->word->space () > 0) &&
114 !word->word->flag (W_FUZZY_NON) &&
115 !word->word->flag (W_FUZZY_SP)))) {
116 if (!word->word->flag (W_BOL) &&
117 (word->word->space () > 0) &&
118 !word->word->flag (W_FUZZY_NON) &&
119 !word->word->flag (W_FUZZY_SP)) {
120 stats_.last_char_was_tilde = false;
121 }
122 need_reject = true;
123 }
124 if ((need_reject && !stats_.last_char_was_tilde) ||
125 (force_eol && stats_.write_results_empty_block)) {
126 /* Write a reject char - mark as rejected unless zero_rejection mode */
127 stats_.last_char_was_tilde = true;
128 stats_.tilde_crunch_written = true;
129 stats_.last_char_was_newline = false;
130 stats_.write_results_empty_block = false;
131 }
132
133 if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
134 stats_.tilde_crunch_written = false;
135 stats_.last_char_was_newline = true;
136 stats_.last_char_was_tilde = false;
137 }
138
139 if (force_eol)
140 stats_.write_results_empty_block = true;
141 return;
142 }
143
144 /* NORMAL PROCESSING of non tilde crunched words */
145
146 stats_.tilde_crunch_written = false;
147 if (newline_type)
148 stats_.last_char_was_newline = true;
149 else
150 stats_.last_char_was_newline = false;
151 stats_.write_results_empty_block = force_eol; // about to write a real word
152
154 stats_.last_char_was_tilde &&
155 (word->word->space() == 0) &&
157 (word->best_choice->unichar_id(0) == space)) {
158 /* Prevent adjacent tilde across words - we know that adjacent tildes within
159 words have been removed */
160 word->MergeAdjacentBlobs(0);
161 }
162 if (newline_type ||
164 stats_.last_char_was_tilde = false;
165 else {
166 if (word->reject_map.length () > 0) {
167 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
168 stats_.last_char_was_tilde = true;
169 else
170 stats_.last_char_was_tilde = false;
171 }
172 else if (word->word->space () > 0)
173 stats_.last_char_was_tilde = false;
174 /* else it is unchanged as there are no output chars */
175 }
176
177 ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
178
179 set_unlv_suspects(word);
180 check_debug_pt (word, 120);
182 tprintf ("Dict word: \"%s\": %d\n",
184 dict_word(*(word->best_choice)));
185 }
186 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
188 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
189 for (i = 0; i < word->best_choice->length(); ++i) {
190 if (word->reject_map[i].rejected())
191 word->reject_map[i].setrej_minimal_rej_accept();
192 }
193 }
195 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196 for (i = 0; i < word->best_choice->length(); ++i) {
197 if ((word->best_choice->unichar_id(i) != space) &&
198 word->reject_map[i].rejected())
199 word->reject_map[i].setrej_minimal_rej_accept();
200 }
201 }
202 }
203}
204} // namespace tesseract
205
206/**********************************************************************
207 * determine_newline_type
208 *
209 * Find whether we have a wrapping or hard newline.
210 * Return false if not at end of line.
211 **********************************************************************/
212
213char determine_newline_type( //test line ends
214 WERD *word, //word to do
215 BLOCK *block, //current block
216 WERD *next_word, //next word
217 BLOCK *next_block //block of next word
218 ) {
219 int16_t end_gap; //to right edge
220 int16_t width; //of next word
221 TBOX word_box; //bounding
222 TBOX next_box; //next word
223 TBOX block_box; //block bounding
224
225 if (!word->flag (W_EOL))
226 return false; //not end of line
227 if (next_word == nullptr || next_block == nullptr || block != next_block)
228 return CTRL_NEWLINE;
229 if (next_word->space () > 0)
230 return CTRL_HARDLINE; //it is tabbed
231 word_box = word->bounding_box ();
232 next_box = next_word->bounding_box ();
233 block_box = block->pdblk.bounding_box ();
234 //gap to eol
235 end_gap = block_box.right () - word_box.right ();
236 end_gap -= static_cast<int32_t>(block->space ());
237 width = next_box.right () - next_box.left ();
238 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
239 // block_box.right(),word_box.right(),end_gap,
240 // next_box.right(),next_box.left(),width,
241 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
242 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
243}
244
245/*************************************************************************
246 * get_rep_char()
247 * Return the first accepted character from the repetition string. This is the
248 * character which is repeated - as determined earlier by fix_rep_char()
249 *************************************************************************/
250namespace tesseract {
251UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
252 int i;
253 for (i = 0; ((i < word->reject_map.length()) &&
254 (word->reject_map[i].rejected())); ++i);
255
256 if (i < word->reject_map.length()) {
257 return word->best_choice->unichar_id(i);
258 } else {
259 return word->uch_set->unichar_to_id(unrecognised_char.string());
260 }
261}
262
263/*************************************************************************
264 * SUSPECT LEVELS
265 *
266 * 0 - don't reject ANYTHING
267 * 1,2 - partial rejection
268 * 3 - BEST
269 *
270 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
271 * tessedit_minimal_rejection.
272 *************************************************************************/
274 int len = word_res->reject_map.length();
275 const WERD_CHOICE &word = *(word_res->best_choice);
276 const UNICHARSET &uchset = *word.unicharset();
277 int i;
278 float rating_per_ch;
279
280 if (suspect_level == 0) {
281 for (i = 0; i < len; i++) {
282 if (word_res->reject_map[i].rejected())
283 word_res->reject_map[i].setrej_minimal_rej_accept();
284 }
285 return;
286 }
287
288 if (suspect_level >= 3)
289 return; //Use defaults
290
291 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
292
293 if (safe_dict_word(word_res) &&
295 /* Unreject alphas in dictionary words */
296 for (i = 0; i < len; ++i) {
297 if (word_res->reject_map[i].rejected() &&
298 uchset.get_isalpha(word.unichar_id(i)))
299 word_res->reject_map[i].setrej_minimal_rej_accept();
300 }
301 }
302
303 rating_per_ch = word.rating() / word_res->reject_map.length();
304
305 if (rating_per_ch >= suspect_rating_per_ch)
306 return; // Don't touch bad ratings
307
308 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
309 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
310 for (i = 0; i < len; ++i) {
311 if (word_res->reject_map[i].rejected() &&
312 (!uchset.eq(word.unichar_id(i), " ")))
313 word_res->reject_map[i].setrej_minimal_rej_accept();
314 }
315 }
316
317 for (i = 0; i < len; i++) {
318 if (word_res->reject_map[i].rejected()) {
319 if (word_res->reject_map[i].flag(R_DOC_REJ))
320 word_res->reject_map[i].setrej_minimal_rej_accept();
321 if (word_res->reject_map[i].flag(R_BLOCK_REJ))
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 if (word_res->reject_map[i].flag(R_ROW_REJ))
324 word_res->reject_map[i].setrej_minimal_rej_accept();
325 }
326 }
327
328 if (suspect_level == 2)
329 return;
330
332 (word_res->reject_map.length() <= suspect_short_words)) {
333 for (i = 0; i < len; i++) {
334 if (word_res->reject_map[i].rejected()) {
335 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
336 word_res->reject_map[i].flag(R_POSTNN_1IL)))
337 word_res->reject_map[i].setrej_minimal_rej_accept();
338
340 word_res->reject_map[i].flag(R_MM_REJECT))
341 word_res->reject_map[i].setrej_minimal_rej_accept();
342 }
343 }
344 }
345
346 if (acceptable_word_string(*word_res->uch_set,
347 word.unichar_string().string(),
348 word.unichar_lengths().string()) !=
351 word.unichar_lengths().string())) {
352 if (word_res->reject_map.length() > suspect_short_words) {
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() &&
355 (!word_res->reject_map[i].perm_rejected() ||
356 word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
357 word_res->reject_map[i].flag (R_POSTNN_1IL) ||
358 word_res->reject_map[i].flag (R_MM_REJECT))) {
359 word_res->reject_map[i].setrej_minimal_rej_accept();
360 }
361 }
362 }
363 }
364}
365
367 int count = 0;
368 for (int i = 0; i < word.length(); ++i) {
369 if (word.unicharset()->get_isalpha(word.unichar_id(i)))
370 count++;
371 }
372 return count;
373}
374
375
377 int count = 0;
378 for (int i = 0; i < word.length(); ++i) {
379 if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
380 word.unicharset()->get_isdigit(word.unichar_id(i)))
381 count++;
382 }
383 return count;
384}
385
386
388 const char* lengths) {
389 bool prev_digit = false;
390
391 if (*lengths == 1 && *s == '(')
392 s++;
393
394 if (*lengths == 1 &&
395 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
396 s++;
397
398 for (; *s != '\0'; s += *(lengths++)) {
399 if (unicharset.get_isdigit(s, *lengths))
400 prev_digit = true;
401 else if (prev_digit &&
402 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
403 prev_digit = false;
404 else if (prev_digit && *lengths == 1 &&
405 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
406 return true;
407 else if (prev_digit &&
408 *lengths == 1 && (*s == '%') &&
409 (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
410 (*(s + *lengths + *(lengths + 1)) == '\0'))
411 return true;
412 else
413 return false;
414 }
415 return true;
416}
417} // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
#define CTRL_NEWLINE
Definition: output.cpp:32
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:213
#define CTRL_HARDLINE
Definition: output.cpp:33
@ CR_DELETE
Definition: pageres.h:161
@ CR_NONE
Definition: pageres.h:158
@ CR_KEEP_SPACE
Definition: pageres.h:159
@ R_MM_REJECT
Definition: rejctmap.h:57
@ R_ROW_REJ
Definition: rejctmap.h:79
@ R_BLOCK_REJ
Definition: rejctmap.h:78
@ R_1IL_CONFLICT
Definition: rejctmap.h:54
@ R_POSTNN_1IL
Definition: rejctmap.h:55
@ R_DOC_REJ
Definition: rejctmap.h:77
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:39
@ W_EOL
end of line
Definition: werd.h:33
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:40
@ W_REP_CHAR
repeated character
Definition: werd.h:38
@ W_BOL
start of line
Definition: werd.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
int count(LIST var_list)
Definition: oldlist.cpp:95
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:387
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:376
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:273
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:36
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:366
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:98
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:251
Definition: ocrblock.h:31
int16_t space() const
return spacing
Definition: ocrblock.h:98
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
BLOCK * block
Definition: pageres.h:116
const UNICHARSET * uch_set
Definition: pageres.h:203
WERD_CHOICE * best_choice
Definition: pageres.h:241
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
bool tess_accepted
Definition: pageres.h:303
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:974
REJMAP reject_map
Definition: pageres.h:294
WERD * word
Definition: pageres.h:186
WERD_RES * word() const
Definition: pageres.h:754
WERD_RES * next_word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:701
WERD_RES * forward()
Definition: pageres.h:734
BLOCK_RES * next_block() const
Definition: pageres.h:769
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
Definition: points.h:189
const STRING debug_string() const
Definition: ratngs.h:495
const STRING & unichar_string() const
Definition: ratngs.h:531
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
int length() const
Definition: ratngs.h:293
const STRING & unichar_lengths() const
Definition: ratngs.h:538
float rating() const
Definition: ratngs.h:317
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
bool contains(const FCOORD pt) const
Definition: rect.h:333
int16_t right() const
Definition: rect.h:79
int32_t length() const
Definition: rejctmap.h:223
Definition: werd.h:56
uint8_t space()
Definition: werd.h:99
TBOX bounding_box() const
Definition: werd.cpp:148
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
UNICHARSET unicharset
Definition: ccutil.h:73
const char * string() const
Definition: strngs.cpp:194
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89