tesseract 4.1.1
Loading...
Searching...
No Matches
shapetable.h
Go to the documentation of this file.
1// Copyright 2010 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
4// File: shapetable.h
5// Description: Class to map a classifier shape index to unicharset
6// indices and font indices.
7// Author: Ray Smith
8//
9// (C) Copyright 2010, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
23#define TESSERACT_CLASSIFY_SHAPETABLE_H_
24
25#include "bitvector.h"
26#include "fontinfo.h"
27#include "genericheap.h"
28#include "genericvector.h"
29#include "intmatcher.h"
30
31class STRING;
32class UNICHARSET;
33
34namespace tesseract {
35
36class ShapeTable;
37
38// Simple struct to hold a single classifier unichar selection, a corresponding
39// rating, and a list of appropriate fonts.
42 : unichar_id(0), rating(0.0f), adapted(false), config(0),
43 feature_misses(0) {}
44 UnicharRating(int u, float r)
45 : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
46
47 // Print debug info.
48 void Print() const {
49 tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%d,"
51 fonts.size());
52 }
53
54 // Sort function to sort ratings appropriately by descending rating.
55 static int SortDescendingRating(const void* t1, const void* t2) {
56 const auto* a = static_cast<const UnicharRating*>(t1);
57 const auto* b = static_cast<const UnicharRating*>(t2);
58 if (a->rating > b->rating) {
59 return -1;
60 } else if (a->rating < b->rating) {
61 return 1;
62 } else {
63 return a->unichar_id - b->unichar_id;
64 }
65 }
66 // Helper function to get the index of the first result with the required
67 // unichar_id. If the results are sorted by rating, this will also be the
68 // best result with the required unichar_id.
69 // Returns -1 if the unichar_id is not found
72
73 // Index into some UNICHARSET table indicates the class of the answer.
75 // Rating from classifier with 1.0 perfect and 0.0 impossible.
76 // Call it a probability if you must.
77 float rating;
78 // True if this result is from the adaptive classifier.
79 bool adapted;
80 // Index of best matching font configuration of result.
81 uint8_t config;
82 // Number of features that were total misses - were liked by no classes.
84 // Unsorted collection of fontinfo ids and scores. Note that a raw result
85 // from the IntegerMatch will contain config ids, that require transforming
86 // to fontinfo ids via fontsets and (possibly) shapetable.
88};
89
90// Classifier result from a low-level classification is an index into some
91// ShapeTable and a rating.
94 : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
95 joined(false), broken(false) {}
96 ShapeRating(int s, float r)
97 : shape_id(s), rating(r), raw(1.0f), font(0.0f),
98 joined(false), broken(false) {}
99
100 // Sort function to sort ratings appropriately by descending rating.
101 static int SortDescendingRating(const void* t1, const void* t2) {
102 const auto* a = static_cast<const ShapeRating*>(t1);
103 const auto* b = static_cast<const ShapeRating*>(t2);
104 if (a->rating > b->rating) {
105 return -1;
106 } else if (a->rating < b->rating) {
107 return 1;
108 } else {
109 return a->shape_id - b->shape_id;
110 }
111 }
112 // Helper function to get the index of the first result with the required
113 // unichar_id. If the results are sorted by rating, this will also be the
114 // best result with the required unichar_id.
115 // Returns -1 if the unichar_id is not found
116 static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
117 const ShapeTable& shape_table,
118 UNICHAR_ID unichar_id);
119
120 // Index into some shape table indicates the class of the answer.
122 // Rating from classifier with 1.0 perfect and 0.0 impossible.
123 // Call it a probability if you must.
124 float rating;
125 // Subsidiary rating that a classifier may use internally.
126 float raw;
127 // Subsidiary rating that a classifier may use internally.
128 float font;
129 // Flag indicating that the input may be joined.
130 bool joined;
131 // Flag indicating that the input may be broken (a fragment).
132 bool broken;
133};
134
135// Simple struct to hold an entry for a heap-based priority queue of
136// ShapeRating.
139 ShapeQueueEntry(const ShapeRating& rating, int level0)
140 : result(rating), level(level0) {}
141
142 // Sort by decreasing rating and decreasing level for equal rating.
143 bool operator<(const ShapeQueueEntry& other) const {
144 if (result.rating > other.result.rating) return true;
145 if (result.rating == other.result.rating)
146 return level > other.level;
147 return false;
148 }
149
150 // Output from classifier.
152 // Which level in the tree did this come from?
153 int level;
154};
156
157// Simple struct to hold a set of fonts associated with a single unichar-id.
158// A vector of UnicharAndFonts makes a shape.
161 }
162 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
163 font_ids.push_back(font_id);
164 }
165
166 // Writes to the given file. Returns false in case of error.
167 bool Serialize(FILE* fp) const;
168 // Reads from the given file. Returns false in case of error.
169 bool DeSerialize(TFile* fp);
170
171 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
172 static int SortByUnicharId(const void* v1, const void* v2);
173
175 int32_t unichar_id;
176};
177
178// A Shape is a collection of unichar-ids and a list of fonts associated with
179// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
180// a classifiable unit, and represents a group of characters or parts of
181// characters that have a similar or identical shape. Shapes/ShapeTables may
182// be organized hierarchically from identical shapes at the leaves to vaguely
183// similar shapes near the root.
184class Shape {
185 public:
186 Shape() : destination_index_(-1) {}
187
188 // Writes to the given file. Returns false in case of error.
189 bool Serialize(FILE* fp) const;
190 // Reads from the given file. Returns false in case of error.
191 bool DeSerialize(TFile* fp);
192
193 int destination_index() const {
194 return destination_index_;
195 }
196 void set_destination_index(int index) {
197 destination_index_ = index;
198 }
199 int size() const {
200 return unichars_.size();
201 }
202 // Returns a UnicharAndFonts entry for the given index, which must be
203 // in the range [0, size()).
204 const UnicharAndFonts& operator[](int index) const {
205 return unichars_[index];
206 }
207 // Sets the unichar_id of the given index to the new unichar_id.
208 void SetUnicharId(int index, int unichar_id) {
209 unichars_[index].unichar_id = unichar_id;
210 }
211 // Adds a font_id for the given unichar_id. If the unichar_id is not
212 // in the shape, it is added.
213 void AddToShape(int unichar_id, int font_id);
214 // Adds everything in other to this.
215 void AddShape(const Shape& other);
216 // Returns true if the shape contains the given unichar_id, font_id pair.
217 bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
218 // Returns true if the shape contains the given unichar_id, ignoring font.
219 bool ContainsUnichar(int unichar_id) const;
220 // Returns true if the shape contains the given font, ignoring unichar_id.
221 bool ContainsFont(int font_id) const;
222 // Returns true if the shape contains the given font properties, ignoring
223 // unichar_id.
224 bool ContainsFontProperties(const FontInfoTable& font_table,
225 uint32_t properties) const;
226 // Returns true if the shape contains multiple different font properties,
227 // ignoring unichar_id.
228 bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
229 // Returns true if this shape is equal to other (ignoring order of unichars
230 // and fonts).
231 bool operator==(const Shape& other) const;
232 // Returns true if this is a subset (including equal) of other.
233 bool IsSubsetOf(const Shape& other) const;
234 // Returns true if the lists of unichar ids are the same in this and other,
235 // ignoring fonts.
236 // NOT const, as it will sort the unichars on demand.
237 bool IsEqualUnichars(Shape* other);
238
239 private:
240 // Sorts the unichars_ vector by unichar.
241 void SortUnichars();
242
243 // Flag indicates that the unichars are sorted, allowing faster set
244 // operations with another shape.
245 bool unichars_sorted_ = false;
246 // If this Shape is part of a ShapeTable the destiation_index_ is the index
247 // of some other shape in the ShapeTable with which this shape is merged.
248 int destination_index_ = 0;
249 // Array of unichars, each with a set of fonts. Each unichar has at most
250 // one entry in the vector.
252};
253
254// ShapeTable is a class to encapsulate the triple indirection that is
255// used here.
256// ShapeTable is a vector of shapes.
257// Each shape is a vector of UnicharAndFonts representing the set of unichars
258// that the shape represents.
259// Each UnicharAndFonts also lists the fonts of the unichar_id that were
260// mapped to the shape during training.
262 public:
263 ShapeTable();
264 // The UNICHARSET reference supplied here, or in set_unicharset below must
265 // exist for the entire life of the ShapeTable. It is used only by DebugStr.
266 explicit ShapeTable(const UNICHARSET& unicharset);
267
268 // Writes to the given file. Returns false in case of error.
269 bool Serialize(FILE* fp) const;
270 // Reads from the given file. Returns false in case of error.
271 bool DeSerialize(TFile* fp);
272
273 // Accessors.
274 int NumShapes() const {
275 return shape_table_.size();
276 }
277 const UNICHARSET& unicharset() const {
278 return *unicharset_;
279 }
280 // Returns the number of fonts used in this ShapeTable, computing it if
281 // necessary.
282 int NumFonts() const;
// ShapeTable takes a pointer to the UNICHARSET, so it must persist for the
// entire life of the ShapeTable.
286 unicharset_ = &unicharset;
287 }
288 // Re-indexes the class_ids in the shapetable according to the given map.
289 // Useful in conjunction with set_unicharset.
290 void ReMapClassIds(const GenericVector<int>& unicharset_map);
291 // Returns a string listing the classes/fonts in a shape.
292 STRING DebugStr(int shape_id) const;
293 // Returns a debug string summarizing the table.
294 STRING SummaryStr() const;
295
296 // Adds a new shape starting with the given unichar_id and font_id.
297 // Returns the assigned index.
298 int AddShape(int unichar_id, int font_id);
299 // Adds a copy of the given shape unless it is already present.
300 // Returns the assigned index or index of existing shape if already present.
301 int AddShape(const Shape& other);
302 // Removes the shape given by the shape index. All indices above are changed!
303 void DeleteShape(int shape_id);
304 // Adds a font_id to the given existing shape index for the given
305 // unichar_id. If the unichar_id is not in the shape, it is added.
306 void AddToShape(int shape_id, int unichar_id, int font_id);
307 // Adds the given shape to the existing shape with the given index.
308 void AddShapeToShape(int shape_id, const Shape& other);
309 // Returns the id of the shape that contains the given unichar and font.
310 // If not found, returns -1.
311 // If font_id < 0, the font_id is ignored and the first shape that matches
312 // the unichar_id is returned.
313 int FindShape(int unichar_id, int font_id) const;
314 // Returns the first unichar_id and font_id in the given shape.
315 void GetFirstUnicharAndFont(int shape_id,
316 int* unichar_id, int* font_id) const;
317
318 // Accessors for the Shape with the given shape_id.
319 const Shape& GetShape(int shape_id) const {
320 return *shape_table_[shape_id];
321 }
322 Shape* MutableShape(int shape_id) {
323 return shape_table_[shape_id];
324 }
325
326 // Expands all the classes/fonts in the shape individually to build
327 // a ShapeTable.
328 int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
329
330 // Returns true if the shapes are already merged.
331 bool AlreadyMerged(int shape_id1, int shape_id2) const;
332 // Returns true if any shape contains multiple unichars.
333 bool AnyMultipleUnichars() const;
334 // Returns the maximum number of unichars over all shapes.
335 int MaxNumUnichars() const;
336 // Merges shapes with a common unichar over the [start, end) interval.
337 // Assumes single unichar per shape.
338 void ForceFontMerges(int start, int end);
339 // Returns the number of unichars in the master shape.
340 int MasterUnicharCount(int shape_id) const;
341 // Returns the sum of the font counts in the master shape.
342 int MasterFontCount(int shape_id) const;
343 // Returns the number of unichars that would result from merging the shapes.
344 int MergedUnicharCount(int shape_id1, int shape_id2) const;
345 // Merges two shape_ids, leaving shape_id2 marked as merged.
346 void MergeShapes(int shape_id1, int shape_id2);
347 // Swaps two shape_ids.
348 void SwapShapes(int shape_id1, int shape_id2);
349 // Appends the master shapes from other to this.
350 // Used to create a clean ShapeTable from a merged one, or to create a
351 // copy of a ShapeTable.
352 // If not nullptr, shape_map is set to map other shape_ids to this's shape_ids.
353 void AppendMasterShapes(const ShapeTable& other,
354 GenericVector<int>* shape_map);
355 // Returns the number of master shapes remaining after merging.
356 int NumMasterShapes() const;
357 // Returns the destination of this shape, (if merged), taking into account
358 // the fact that the destination may itself have been merged.
359 // For a non-merged shape, returns the input shape_id.
360 int MasterDestinationIndex(int shape_id) const;
361
// Returns false if the unichars in neither shape are a subset of the other.
363 bool SubsetUnichar(int shape_id1, int shape_id2) const;
// Returns false if the unichars in neither shape are a subset of the other.
365 bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
366 // Returns true if the unichar sets are equal between the shapes.
367 bool EqualUnichars(int shape_id1, int shape_id2) const;
368 bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
369 // Returns true if there is a common unichar between the shapes.
370 bool CommonUnichars(int shape_id1, int shape_id2) const;
371 // Returns true if there is a common font id between the shapes.
372 bool CommonFont(int shape_id1, int shape_id2) const;
373
374 // Adds the unichars of the given shape_id to the vector of results. Any
375 // unichar_id that is already present just has the fonts added to the
376 // font set for that result without adding a new entry in the vector.
377 // NOTE: it is assumed that the results are given to this function in order
378 // of decreasing rating.
379 // The unichar_map vector indicates the index of the results entry containing
380 // each unichar, or -1 if the unichar is not yet included in results.
381 void AddShapeToResults(const ShapeRating& shape_rating,
382 GenericVector<int>* unichar_map,
383 GenericVector<UnicharRating>* results) const;
384
385 private:
386 // Adds the given unichar_id to the results if needed, updating unichar_map
387 // and returning the index of unichar in results.
388 int AddUnicharToResults(int unichar_id, float rating,
389 GenericVector<int>* unichar_map,
390 GenericVector<UnicharRating>* results) const;
391
// Pointer to a provided unicharset used only by the DebugStr member.
393 const UNICHARSET* unicharset_;
394 // Vector of pointers to the Shapes in this ShapeTable.
395 PointerVector<Shape> shape_table_;
396
397 // Cached data calculated on demand.
398 mutable int num_fonts_;
399};
400
401} // namespace tesseract.
402
403#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
int push_back(T object)
Definition: strngs.h:45
GenericVector< ScoredFont > fonts
Definition: shapetable.h:87
static int FirstResultWithUnichar(const GenericVector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:58
UnicharRating(int u, float r)
Definition: shapetable.h:44
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:55
ShapeRating(int s, float r)
Definition: shapetable.h:96
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:101
static int FirstResultWithUnichar(const GenericVector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:40
ShapeQueueEntry(const ShapeRating &rating, int level0)
Definition: shapetable.h:139
bool operator<(const ShapeQueueEntry &other) const
Definition: shapetable.h:143
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:79
UnicharAndFonts(int uni_id, int font_id)
Definition: shapetable.h:162
GenericVector< int32_t > font_ids
Definition: shapetable.h:174
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:74
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:69
const UnicharAndFonts & operator[](int index) const
Definition: shapetable.h:204
int destination_index() const
Definition: shapetable.h:193
bool IsSubsetOf(const Shape &other) const
Definition: shapetable.cpp:202
void set_destination_index(int index)
Definition: shapetable.h:196
bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const
Definition: shapetable.cpp:182
bool ContainsUnicharAndFont(int unichar_id, int font_id) const
Definition: shapetable.cpp:131
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:208
bool ContainsFont(int font_id) const
Definition: shapetable.cpp:157
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:101
int size() const
Definition: shapetable.h:199
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:86
void AddShape(const Shape &other)
Definition: shapetable.cpp:120
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:217
bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const
Definition: shapetable.cpp:169
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:92
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:147
bool operator==(const Shape &other) const
Definition: shapetable.cpp:197
void SwapShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:523
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:444
int NumShapes() const
Definition: shapetable.h:274
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:404
bool CommonUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:629
STRING SummaryStr() const
Definition: shapetable.cpp:313
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:503
bool SubsetUnichar(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:543
int MasterFontCount(int shape_id) const
Definition: shapetable.cpp:492
bool AlreadyMerged(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:439
int NumMasterShapes() const
Definition: shapetable.cpp:670
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
void AddShapeToResults(const ShapeRating &shape_rating, GenericVector< int > *unichar_map, GenericVector< UnicharRating > *results) const
Definition: shapetable.cpp:687
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:513
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:241
int MasterUnicharCount(int shape_id) const
Definition: shapetable.cpp:486
void set_unicharset(const UNICHARSET &unicharset)
Definition: shapetable.h:285
void DeleteShape(int shape_id)
Definition: shapetable.cpp:361
int MaxNumUnichars() const
Definition: shapetable.cpp:455
Shape * MutableShape(int shape_id)
Definition: shapetable.h:322
void AddShapeToShape(int shape_id, const Shape &other)
Definition: shapetable.cpp:376
bool CommonFont(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:641
bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:604
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes)
Definition: shapetable.cpp:413
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:531
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:369
bool EqualUnichars(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:587
bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const
Definition: shapetable.cpp:561
void ReMapClassIds(const GenericVector< int > &unicharset_map)
Definition: shapetable.cpp:271
void ForceFontMerges(int start, int end)
Definition: shapetable.cpp:468
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:386
const UNICHARSET & unicharset() const
Definition: shapetable.h:277
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:656