tesseract 4.1.1
Loading...
Searching...
No Matches
pieces.cpp
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: pieces.cpp (Formerly pieces.c)
5 * Description:
6 * Author: Mark Seaman, OCR Technology
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20/*----------------------------------------------------------------------
21 I n c l u d e s
22----------------------------------------------------------------------*/
23
24#include "blobs.h"
25#include "helpers.h"
26#include "matrix.h"
27#include "ratngs.h"
28#include "seam.h"
29#include "wordrec.h"
30
31// Include automatically generated configuration file if running autoconf.
32#ifdef HAVE_CONFIG_H
33#include "config_auto.h"
34#endif
35
37
38/*----------------------------------------------------------------------
39 F u n c t i o n s
40----------------------------------------------------------------------*/
41
42/**********************************************************************
43 * classify_piece
44 *
45 * Create a larger piece from a collection of smaller ones. Classify
46 * it and return the results. Take the large piece apart to leave
47 * the collection of small pieces unmodified.
48 **********************************************************************/
49namespace tesseract {
50BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
51 int16_t start,
52 int16_t end,
53 const char* description,
54 TWERD *word,
55 BlamerBundle *blamer_bundle) {
56 if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
57 BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
58 White, blamer_bundle);
59 // Set the matrix_cell_ entries in all the BLOB_CHOICES.
60 BLOB_CHOICE_IT bc_it(choices);
61 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
62 bc_it.data()->set_matrix_cell(start, end);
63 }
64
65 if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
66
67 return (choices);
68}
69
// qsort-style comparator that orders elements by ascending unichar_id().
// Each argument is the address of a pointer to the element being compared.
// The result is the difference of the two ids (negative, zero or positive).
template<class BLOB_CHOICE>
int SortByUnicharID(const void *void1, const void *void2) {
  const auto *lhs_slot = static_cast<const BLOB_CHOICE *const *>(void1);
  const auto *rhs_slot = static_cast<const BLOB_CHOICE *const *>(void2);

  const auto lhs_id = (*lhs_slot)->unichar_id();
  const auto rhs_id = (*rhs_slot)->unichar_id();
  return lhs_id - rhs_id;
}
77
// qsort-style comparator that orders elements by descending rating().
// Each argument is the address of a pointer to the element being compared.
// Returns 1 when the first rating is lower, -1 when it is higher, and 0
// when the two ratings do not compare as different.  Reporting ties as 0
// keeps the comparator consistent: cmp(a, b) and cmp(b, a) never both
// claim that their first argument sorts first.
template<class BLOB_CHOICE>
int SortByRating(const void *void1, const void *void2) {
  const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1);
  const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2);

  const auto rating1 = p1->rating();
  const auto rating2 = p2->rating();
  if (rating1 < rating2)
    return 1;
  if (rating2 < rating1)
    return -1;
  return 0;
}
87
88
89/**********************************************************************
90 * fill_filtered_fragment_list
91 *
92 * Filter the fragment list so that the filtered_choices only contain
93 * fragments that are in the correct position. choices is the list
94 * that we are going to filter. fragment_pos is the position in the
95 * fragment that we are looking for and num_frag_parts is the the
96 * total number of pieces. The result will be appended to
97 * filtered_choices.
98 **********************************************************************/
99void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
100 int fragment_pos,
101 int num_frag_parts,
102 BLOB_CHOICE_LIST *filtered_choices) {
103 BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
104 BLOB_CHOICE_IT choices_it(choices);
105
106 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
107 choices_it.forward()) {
108 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
109 const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
110
111 if (frag != nullptr && frag->get_pos() == fragment_pos &&
112 frag->get_total() == num_frag_parts) {
113 // Recover the unichar_id of the unichar that this fragment is
114 // a part of
115 auto *b = new BLOB_CHOICE(*choices_it.data());
116 int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
117 b->set_unichar_id(original_unichar);
118 filtered_choices_it.add_to_end(b);
119 }
120 }
121
122 filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
123}
124
125
126/**********************************************************************
127 * merge_and_put_fragment_lists
128 *
129 * Merge the fragment lists in choice_lists and append it to the
130 * ratings matrix.
131 **********************************************************************/
132void Wordrec::merge_and_put_fragment_lists(int16_t row, int16_t column,
133 int16_t num_frag_parts,
134 BLOB_CHOICE_LIST *choice_lists,
135 MATRIX *ratings) {
136 auto *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
137
138 for (int i = 0; i < num_frag_parts; i++) {
139 choice_lists_it[i].set_to_list(&choice_lists[i]);
140 choice_lists_it[i].mark_cycle_pt();
141 }
142
143 BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
144 if (merged_choice == nullptr)
145 merged_choice = new BLOB_CHOICE_LIST;
146
147 bool end_of_list = false;
148 BLOB_CHOICE_IT merged_choice_it(merged_choice);
149 while (!end_of_list) {
150 // Find the maximum unichar_id of the current entry the iterators
151 // are pointing at
152 UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
153 for (int i = 0; i < num_frag_parts; i++) {
154 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
155 if (max_unichar_id < unichar_id) {
156 max_unichar_id = unichar_id;
157 }
158 }
159
160 // Move the each iterators until it gets to an entry that has a
161 // value greater than or equal to max_unichar_id
162 for (int i = 0; i < num_frag_parts; i++) {
163 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
164 while (!choice_lists_it[i].cycled_list() &&
165 unichar_id < max_unichar_id) {
166 choice_lists_it[i].forward();
167 unichar_id = choice_lists_it[i].data()->unichar_id();
168 }
169 if (choice_lists_it[i].cycled_list()) {
170 end_of_list = true;
171 break;
172 }
173 }
174
175 if (end_of_list)
176 break;
177
178 // Checks if the fragments are parts of the same character
179 UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
180 bool same_unichar = true;
181 for (int i = 1; i < num_frag_parts; i++) {
182 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
183 if (unichar_id != first_unichar_id) {
184 same_unichar = false;
185 break;
186 }
187 }
188
189 if (same_unichar) {
190 // Add the merged character to the result
191 UNICHAR_ID merged_unichar_id = first_unichar_id;
192 GenericVector<ScoredFont> merged_fonts =
193 choice_lists_it[0].data()->fonts();
194 float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
195 float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
196 float positive_yshift = 0, negative_yshift = 0;
197 int merged_script_id = choice_lists_it[0].data()->script_id();
198 BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
199
200 float merged_rating = 0, merged_certainty = 0;
201 for (int i = 0; i < num_frag_parts; i++) {
202 float rating = choice_lists_it[i].data()->rating();
203 float certainty = choice_lists_it[i].data()->certainty();
204
205 if (i == 0 || certainty < merged_certainty)
206 merged_certainty = certainty;
207 merged_rating += rating;
208
209 choice_lists_it[i].forward();
210 if (choice_lists_it[i].cycled_list())
211 end_of_list = true;
212 IntersectRange(choice_lists_it[i].data()->min_xheight(),
213 choice_lists_it[i].data()->max_xheight(),
214 &merged_min_xheight, &merged_max_xheight);
215 float yshift = choice_lists_it[i].data()->yshift();
216 if (yshift > positive_yshift) positive_yshift = yshift;
217 if (yshift < negative_yshift) negative_yshift = yshift;
218 // Use the min font rating over the parts.
219 // TODO(rays) font lists are unsorted. Need to be faster?
220 const GenericVector<ScoredFont>& frag_fonts =
221 choice_lists_it[i].data()->fonts();
222 for (int f = 0; f < frag_fonts.size(); ++f) {
223 int merged_f = 0;
224 for (merged_f = 0; merged_f < merged_fonts.size() &&
225 merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
226 ++merged_f) {}
227 if (merged_f == merged_fonts.size()) {
228 merged_fonts.push_back(frag_fonts[f]);
229 } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
230 merged_fonts[merged_f].score = frag_fonts[f].score;
231 }
232 }
233 }
234
235 float merged_yshift = positive_yshift != 0
236 ? (negative_yshift != 0 ? 0 : positive_yshift)
237 : negative_yshift;
238 auto* choice = new BLOB_CHOICE(merged_unichar_id,
239 merged_rating,
240 merged_certainty,
241 merged_script_id,
242 merged_min_xheight,
243 merged_max_xheight,
244 merged_yshift,
245 classifier);
246 choice->set_fonts(merged_fonts);
247 merged_choice_it.add_to_end(choice);
248 }
249 }
250
252 print_ratings_list("Merged Fragments", merged_choice,
253 unicharset);
254
255 if (merged_choice->empty())
256 delete merged_choice;
257 else
258 ratings->put(row, column, merged_choice);
259
260 delete [] choice_lists_it;
261}
262
263/**********************************************************************
264 * get_fragment_lists
265 *
266 * Recursively go through the ratings matrix to find lists of fragments
267 * to be merged in the function merge_and_put_fragment_lists.
268 * current_frag is the position of the piece we are looking for.
269 * current_row is the row in the rating matrix we are currently at.
270 * start is the row we started initially, so that we can know where
271 * to append the results to the matrix. num_frag_parts is the total
272 * number of pieces we are looking for and num_blobs is the size of the
273 * ratings matrix.
274 **********************************************************************/
275void Wordrec::get_fragment_lists(int16_t current_frag, int16_t current_row,
276 int16_t start, int16_t num_frag_parts,
277 int16_t num_blobs, MATRIX *ratings,
278 BLOB_CHOICE_LIST *choice_lists) {
279 if (current_frag == num_frag_parts) {
280 merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
281 choice_lists, ratings);
282 return;
283 }
284
285 for (int16_t x = current_row; x < num_blobs; x++) {
286 BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
287 if (choices == nullptr)
288 continue;
289
290 fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
291 &choice_lists[current_frag]);
292 if (!choice_lists[current_frag].empty()) {
293 get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
294 num_blobs, ratings, choice_lists);
295 choice_lists[current_frag].clear();
296 }
297 }
298}
299
300
301/**********************************************************************
302 * merge_fragments
303 *
304 * Try to merge fragments in the ratings matrix and put the result in
305 * the corresponding row and column
306 **********************************************************************/
307void Wordrec::merge_fragments(MATRIX *ratings, int16_t num_blobs) {
308 BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
309 for (int16_t start = 0; start < num_blobs; start++) {
310 for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
311 frag_parts++) {
312 get_fragment_lists(0, start, start, frag_parts, num_blobs,
313 ratings, choice_lists);
314 }
315 }
316
317 // Delete fragments from the rating matrix
318 for (int16_t x = 0; x < num_blobs; x++) {
319 for (int16_t y = x; y < num_blobs; y++) {
320 BLOB_CHOICE_LIST *choices = ratings->get(x, y);
321 if (choices != nullptr) {
322 BLOB_CHOICE_IT choices_it(choices);
323 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
324 choices_it.forward()) {
325 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
326 const CHAR_FRAGMENT *frag =
327 unicharset.get_fragment(choice_unichar_id);
328 if (frag != nullptr)
329 delete choices_it.extract();
330 }
331 }
332 }
333 }
334}
335
336
337} // namespace tesseract
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
BlobChoiceClassifier
Definition: ratngs.h:43
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:145
int UNICHAR_ID
Definition: unichar.h:34
@ White
Definition: callcpp.h:29
int SortByUnicharID(const void *void1, const void *void2)
Definition: pieces.cpp:71
int SortByRating(const void *void1, const void *void2)
Definition: pieces.cpp:79
int push_back(T object)
int size() const
Definition: genericvector.h:72
T get(ICOORD pos) const
Definition: matrix.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
Definition: blobs.h:418
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
Definition: matrix.h:578
float rating() const
Definition: ratngs.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
UNICHARSET unicharset
Definition: ccutil.h:73
static const int kMaxChunks
Definition: unicharset.h:55
int get_total() const
Definition: unicharset.h:72
const char * get_unichar() const
Definition: unicharset.h:70
int get_pos() const
Definition: unicharset.h:71
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:275
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:99
void merge_fragments(MATRIX *ratings, int16_t num_blobs)
Definition: pieces.cpp:307
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:132