tesseract 4.1.1
Loading...
Searching...
No Matches
tfacepp.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: tfacepp.cpp (Formerly tface++.c)
3 * Description: C++ side of the C/C++ Tess/Editor interface.
4 * Author: Ray Smith
5 * Created: Thu Apr 23 15:39:23 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#include <cmath>
21
22#include "blamer.h"
23#include "errcode.h"
24#include "ratngs.h"
25#include "reject.h"
26#include "tesseractclass.h"
27#include "werd.h"
28
// Largest number of chopped blobs that recog_word_recursive passes to
// cc_recog in one piece; a word with more blobs than this is handed to
// split_and_recog_word instead.
29#define MAX_UNDIVIDED_LENGTH 24
30
31
32
33/**********************************************************************
34 * recog_word
35 *
36 * Convert the word to tess form and pass it to the tess segmenter.
37 * Convert the output back to editor form.
38 **********************************************************************/
39namespace tesseract {
// recog_word (body only): validates and post-processes the recognition
// result held in `word` and records the outcome in word->tess_failed.
// NOTE(review): this listing is an extract with gaps. Listing lines 40 (the
// function signature), 42, 47, 48, 53, 62, 64, 73, 78 and 91 are not present,
// so several statements below are visibly incomplete.
//
// Early exit: taken when wordrec_skip_no_truth_words is set and the word has
// no blamer bundle. The second half of the condition is on missing line 42;
// presumably it tests incorrect_result_reason() against IRR_NO_TRUTH, both of
// which appear in this page's cross-reference list - confirm.
41 if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
43 if (classify_debug_level) tprintf("No truth for word - skipping\n");
44 word->tess_failed = true;
45 return;
46 }
// NOTE(review): lines 47-48 are missing. The call that actually performs the
// recognition is presumably among them - confirm against the full file.
49 word->SetupBoxWord();
// best_choice and box_word must have the same length. The mismatch is
// logged first so that the assertion failure below can be diagnosed.
// (Line 53, the argument for the %s conversion, is missing.)
50 if (word->best_choice->length() != word->box_word->length()) {
51 tprintf("recog_word ASSERT FAIL String:\"%s\"; "
52 "Strlen=%d; #Blobs=%d\n",
54 word->best_choice->length(), word->box_word->length());
55 }
56 ASSERT_HOST(word->best_choice->length() == word->box_word->length());
57 // Check that the ratings matrix size matches the sum of all the
58 // segmentation states.
59 if (!word->StatesAllValid()) {
60 tprintf("Not all words have valid states relative to ratings matrix!!");
61 word->DebugWordChoices(true, nullptr);
// NOTE(review): line 62 is missing here.
63 }
// NOTE(review): line 64 is missing. It presumably opens the block that is
// closed by the lone brace on line 83.
65 /* Override the permuter type if a straight dictionary check disagrees. */
66 uint8_t perm_type = word->best_choice->permuter();
// Only consult the dictionary when the current permuter is not already one
// of the three dawg permuters.
67 if ((perm_type != SYSTEM_DAWG_PERM) &&
68 (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
69 uint8_t real_dict_perm_type = dict_word(*word->best_choice);
// The permuter is replaced only if dict_word returned a dawg permuter AND
// the "> 0" test that ends on line 74 holds. The start of that test is on
// missing line 73 (presumably a call to alpha_count - confirm).
70 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
71 (real_dict_perm_type == FREQ_DAWG_PERM) ||
72 (real_dict_perm_type == USER_DAWG_PERM)) &&
74 word->best_choice->unichar_lengths().string()) > 0)) {
75 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
76 }
77 }
// Report a permuter change. The first half of this condition is on missing
// line 78.
79 perm_type != word->best_choice->permuter()) {
80 tprintf("Permuter Type Flipped from %d to %d\n",
81 perm_type, word->best_choice->permuter());
82 }
83 }
84 // Factored out from control.cpp
// best_choice and raw_choice must be either both null or both non-null.
85 ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
// The word counts as failed when there is no best choice, when it is empty,
// or when the leading run of ' ' characters in its string is as long as
// best_choice->length().
86 if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
87 static_cast<int>(strspn(word->best_choice->unichar_string().string(),
88 " ")) == word->best_choice->length()) {
89 word->tess_failed = true;
// Reset the reject map to one entry per box. Line 91 is missing; presumably
// it flags the rejection (rej_word_tess_failure is in the cross-reference
// list) - confirm.
90 word->reject_map.initialise(word->box_word->length());
92 } else {
93 word->tess_failed = false;
94 }
95}
96
97
98/**********************************************************************
99 * recog_word_recursive
100 *
101 * Convert the word to tess form and pass it to the tess segmenter.
102 * Convert the output back to editor form.
103 **********************************************************************/
// recog_word_recursive (body only): recognizes `word` in one piece with
// cc_recog, or delegates to split_and_recog_word when the word has too many
// blobs, then makes best_choice agree in length with the rebuilt word.
// NOTE(review): listing line 104 (the function signature) and line 117 (the
// argument for the %s conversion below) are missing from this extract.
105 int word_length = word->chopped_word->NumBlobs(); // no of blobs
// Words with more than MAX_UNDIVIDED_LENGTH blobs are not recognized here.
// They are handed to split_and_recog_word and this function returns.
106 if (word_length > MAX_UNDIVIDED_LENGTH) {
107 return split_and_recog_word(word);
108 }
109 cc_recog(word);
110 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
111
112 // Do sanity checks and minor fixes on best_choice.
// More characters than output blobs: reset the choice with make_bad() and
// report the string and the word's bounding box.
113 if (word->best_choice->length() > word_length) {
114 word->best_choice->make_bad(); // should never happen
115 tprintf("recog_word: Discarded long string \"%s\""
116 " (%d characters vs %d blobs)\n",
118 word->best_choice->length(), word_length);
119 tprintf("Word is at:");
120 word->word->bounding_box().print();
121 }
// Fewer characters than output blobs: pad best_choice with the unichar id
// of " " until the lengths match. Each pad covers one blob with rating 0.0
// and the choice's current certainty.
122 if (word->best_choice->length() < word_length) {
123 UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
124 while (word->best_choice->length() < word_length) {
125 word->best_choice->append_unichar_id(space_id, 1, 0.0,
126 word->best_choice->certainty());
127 }
128 }
129}
130
131
132/**********************************************************************
133 * split_and_recog_word
134 *
135 * Split the word into 2 smaller pieces at the largest gap.
136 * Recognize the pieces and stick the results back together.
137 **********************************************************************/
// split_and_recog_word (body only): splits `word` at its widest inter-blob
// gap with split_word and re-joins the two pieces with join_words.
// NOTE(review): listing line 138 (the function signature) and lines 158 and
// 160 (the statements that recognize each piece) are missing from this
// extract.
139 // Find the biggest blob gap in the chopped_word.
140 int bestgap = -INT32_MAX;
141 int split_index = 0;
// The gap is the left edge of blob b minus the right edge of blob b-1, so
// it is negative when the boxes overlap horizontally. The strict '>' keeps
// the first (lowest-index) gap among equal maxima.
142 for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
143 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
144 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
145 int gap = blob_box.left() - prev_box.right();
146 if (gap > bestgap) {
147 bestgap = gap;
148 split_index = b;
149 }
150 }
// split_index stays 0 only if the loop body never ran, i.e. the word has
// fewer than two blobs.
151 ASSERT_HOST(split_index > 0);
152
// split_word fills in word2 (the right-hand piece) and orig_bb (the saved
// blamer bundle). Both are passed on to join_words below.
153 WERD_RES *word2 = nullptr;
154 BlamerBundle *orig_bb = nullptr;
155 split_word(word, split_index, &word2, &orig_bb);
156
157 // Recognize the first part of the word.
159 // Recognize the second part of the word.
161
162 join_words(word, word2, orig_bb);
163}
164
165
166/**********************************************************************
167 * split_word
168 *
169 * Split a given WERD_RES in place into two smaller words for recognition.
170 * split_pt is the index of the first blob to go in the second word.
171 * The underlying word is left alone, only the TWERD (and subsequent data)
172 * are split up. orig_blamer_bundle is set to the original blamer bundle,
173 * and will now be owned by the caller. New blamer bundles are forged for the
174 * two pieces.
175 **********************************************************************/
// split_word (parameter tail and body): splits `word` in place at blob index
// split_pt. The first split_pt blobs stay in `word`, the remaining blobs go
// into a new WERD_RES returned through *right_piece, and a copy of the
// original blamer bundle (or nullptr) is returned through
// *orig_blamer_bundle.
// NOTE(review): listing line 176 (the start of the signature) and lines 206
// and 217 are missing from this extract.
177 int split_pt,
178 WERD_RES **right_piece,
179 BlamerBundle **orig_blamer_bundle) const {
// Both pieces must end up with at least one blob.
180 ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
181
182 // Save a copy of the blamer bundle so we can try to reconstruct it below.
183 BlamerBundle *orig_bb =
184 word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
185
186 auto *word2 = new WERD_RES(*word);
187
188 // blow away the copied chopped_word, as we want to work with
189 // the blobs from the input chopped_word so seam_arrays can be merged.
190 TWERD *chopped = word->chopped_word;
191 auto *chopped2 = new TWERD;
// Move the blob pointers from split_pt onwards into chopped2, then shorten
// the original vector so that each pointer is held by one vector only.
192 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
193 for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
194 chopped2->blobs.push_back(chopped->blobs[i]);
195 }
196 chopped->blobs.truncate(split_pt);
// Detach `chopped` from word, and drop word2's copied chopped_word, before
// ClearResults() runs; both pointers are re-attached afterwards.
// NOTE(review): presumably this keeps ClearResults() from freeing the blobs
// being kept - confirm against pageres.cpp.
197 word->chopped_word = nullptr;
198 delete word2->chopped_word;
199 word2->chopped_word = nullptr;
200
// Take the unicharset reference before ClearResults() is called on word.
201 const UNICHARSET &unicharset = *word->uch_set;
202 word->ClearResults();
203 word2->ClearResults();
204 word->chopped_word = chopped;
205 word2->chopped_word = chopped2;
// NOTE(review): line 206 is missing. Presumably it is the matching
// SetupBasicsFromChoppedWord call for `word` - confirm.
207 word2->SetupBasicsFromChoppedWord(unicharset);
208
209 // Try to adjust the blamer bundle.
210 if (orig_bb != nullptr) {
211 // TODO(rays) Looks like a leak to me.
212 // orig_bb should take, rather than copy.
213 word->blamer_bundle = new BlamerBundle();
214 word2->blamer_bundle = new BlamerBundle();
// Split at the right edge of the last left-hand blob and the left edge of
// the first right-hand blob. Line 217, the third argument, is missing; the
// cross-reference list shows SplitBundle taking a bool `debug` there.
215 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
216 word2->chopped_word->blobs[0]->bounding_box().left(),
218 word->blamer_bundle, word2->blamer_bundle);
219 }
220
// Hand both out-values to the caller. orig_bb is nullptr when the word had
// no blamer bundle.
221 *right_piece = word2;
222 *orig_blamer_bundle = orig_bb;
223}
224
225
226/**********************************************************************
227 * join_words
228 *
229 * The opposite of split_word():
230 * join word2 (including any recognized data / seam array / etc)
231 * onto the right of word and then delete word2.
232 * Also, if orig_bb is provided, stitch it back into word.
233 **********************************************************************/
// join_words (parameter tail and body): appends everything held by word2
// (blobs, seams, widths, gaps, ratings, states and choices) onto `word`,
// restores orig_bb as word's blamer bundle when it is non-null, and deletes
// word2.
// NOTE(review): listing line 234 (the start of the signature) and line 313
// (the last JoinBlames argument) are missing from this extract.
235 WERD_RES *word2,
236 BlamerBundle *orig_bb) const {
// Capture the boxes on either side of the join before the blob vectors are
// merged. They are used below to position the connecting seam.
237 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
238 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
239 // Tack the word2 outputs onto the end of the word outputs.
240 word->chopped_word->blobs += word2->chopped_word->blobs;
241 word->rebuild_word->blobs += word2->rebuild_word->blobs;
// Empty word2's vectors now that word holds the blob pointers.
// NOTE(review): presumably so the `delete word2` at the end does not free
// them as well - confirm.
242 word2->chopped_word->blobs.clear();
243 word2->rebuild_word->blobs.clear();
// Seam location: x is halfway between the two boxes, y is the mean of both
// boxes' top and bottom edges.
244 TPOINT split_pt;
245 split_pt.x = (prev_box.right() + blob_box.left()) / 2;
246 split_pt.y = (prev_box.top() + prev_box.bottom() +
247 blob_box.top() + blob_box.bottom()) / 4;
248 // Move the word2 seams onto the end of the word1 seam_array.
249 // Since the seam list is one element short, an empty seam marking the
250 // end of the last blob in the first word is needed first.
251 word->seam_array.push_back(new SEAM(0.0f, split_pt));
252 word->seam_array += word2->seam_array;
253 word2->seam_array.truncate(0);
254 // Fix widths and gaps.
255 word->blob_widths += word2->blob_widths;
256 word->blob_gaps += word2->blob_gaps;
257 // Fix the ratings matrix.
// After attaching, the dimension must equal the sum of the two pieces.
258 int rat1 = word->ratings->dimension();
259 int rat2 = word2->ratings->dimension();
260 word->ratings->AttachOnCorner(word2->ratings);
261 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
262 word->best_state += word2->best_state;
263 // Append the word choices.
264 *word->raw_choice += *word2->raw_choice;
265
266 // How many alt choices from each should we try to get?
267 const int kAltsPerPiece = 2;
268 // When do we start throwing away extra alt choices?
269 const int kTooManyAltChoices = 100;
270
271 // Construct the cartesian product of the best_choices of word(1) and word2.
272 WERD_CHOICE_LIST joined_choices;
273 WERD_CHOICE_IT jc_it(&joined_choices);
274 WERD_CHOICE_IT bc1_it(&word->best_choices);
275 WERD_CHOICE_IT bc2_it(&word2->best_choices);
276 int num_word1_choices = word->best_choices.length();
// The count starts at the number of word1 choices because each of them
// becomes a joined choice in the in-place pass further down.
277 int total_joined_choices = num_word1_choices;
278 // Nota Bene: For the main loop here, we operate only on the 2nd and greater
279 // word2 choices, and put them in the joined_choices list. The 1st word2
280 // choice gets added to the original word1 choices in-place after we have
281 // finished with them.
// The initial forward() skips the first word2 choice. The loop ends when
// the iterator is back at the first element, so it relies on forward()
// wrapping around at the end of the list.
282 int bc2_index = 1;
283 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
// Once kTooManyAltChoices joined choices exist, only word2 choices with an
// index up to kAltsPerPiece are still combined.
284 if (total_joined_choices >= kTooManyAltChoices &&
285 bc2_index > kAltsPerPiece)
286 break;
287 int bc1_index = 0;
288 for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
289 ++bc1_index, bc1_it.forward()) {
// The same cap applies to the word1 choices.
290 if (total_joined_choices >= kTooManyAltChoices &&
291 bc1_index > kAltsPerPiece)
292 break;
// New joined choice: a copy of the word1 choice with the word2 choice
// appended.
293 auto *wc = new WERD_CHOICE(*bc1_it.data());
294 *wc += *bc2_it.data();
295 jc_it.add_after_then_move(wc);
296 ++total_joined_choices;
297 }
298 }
299 // Now that we've filled in as many alternates as we want, paste the best
300 // choice for word2 onto the original word alt_choices.
301 bc1_it.move_to_first();
302 bc2_it.move_to_first();
303 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
304 *bc1_it.data() += *bc2_it.data();
305 }
// Append the extra combinations after the (now extended) word1 choices.
306 bc1_it.move_to_last();
307 bc1_it.add_list_after(&joined_choices);
308
309 // Restore the pointer to original blamer bundle and combine blamer
310 // information recorded in the splits.
311 if (orig_bb != nullptr) {
// Line 313, the final argument, is missing; the cross-reference list shows
// JoinBlames taking a bool `debug` there. The per-piece bundle on word is
// then deleted and replaced by orig_bb.
312 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
314 delete word->blamer_bundle;
315 word->blamer_bundle = orig_bb;
316 }
// Rebuild the box word for the joined result and size the reject map to it.
317 word->SetupBoxWord();
318 word->reject_map.initialise(word->box_word->length());
319 delete word2;
320}
321
322
323} // namespace tesseract
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:29
@ IRR_NO_TRUTH
Definition: blamer.h:93
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
T & back() const
void truncate(int size)
void reserve(int size)
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:138
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:40
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:233
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:120
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:177
Definition: blobs.h:51
int16_t x
Definition: blobs.h:93
int16_t y
Definition: blobs.h:94
TBOX bounding_box() const
Definition: blobs.cpp:468
Definition: blobs.h:418
int NumBlobs() const
Definition: blobs.h:448
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
int length() const
Definition: boxword.h:83
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:553
int dimension() const
Definition: matrix.h:536
const UNICHARSET * uch_set
Definition: pageres.h:203
TWERD * rebuild_word
Definition: pageres.h:266
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
BlamerBundle * blamer_bundle
Definition: pageres.h:252
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
tesseract::BoxWord * box_word
Definition: pageres.h:272
GenericVector< int > blob_widths
Definition: pageres.h:216
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ClearResults()
Definition: pageres.cpp:1104
void SetupBoxWord()
Definition: pageres.cpp:849
bool tess_failed
Definition: pageres.h:295
GenericVector< int > best_state
Definition: pageres.h:285
WERD_CHOICE * raw_choice
Definition: pageres.h:246
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:343
TWERD * chopped_word
Definition: pageres.h:212
REJMAP reject_map
Definition: pageres.h:294
GenericVector< int > blob_gaps
Definition: pageres.h:219
bool StatesAllValid()
Definition: pageres.cpp:458
MATRIX * ratings
Definition: pageres.h:237
WERD * word
Definition: pageres.h:186
const STRING debug_string() const
Definition: ratngs.h:495
const STRING & unichar_string() const
Definition: ratngs.h:531
uint8_t permuter() const
Definition: ratngs.h:336
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
float certainty() const
Definition: ratngs.h:320
int length() const
Definition: ratngs.h:293
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
const STRING & unichar_lengths() const
Definition: ratngs.h:538
Definition: rect.h:34
int16_t top() const
Definition: rect.h:58
void print() const
Definition: rect.h:278
int16_t left() const
Definition: rect.h:72
int16_t bottom() const
Definition: rect.h:65
int16_t right() const
Definition: rect.h:79
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void rej_word_tess_failure()
Definition: rejctmap.cpp:352
Definition: seam.h:38
TBOX bounding_box() const
Definition: werd.cpp:148
UNICHARSET unicharset
Definition: ccutil.h:73
const char * string() const
Definition: strngs.cpp:194
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
bool wordrec_skip_no_truth_words
Definition: wordrec.h:230
bool wordrec_debug_blamer
Definition: wordrec.h:231
void cc_recog(WERD_RES *word)
Definition: tface.cpp:125