tesseract 4.1.1
Loading...
Searching...
No Matches
recodebeam.h
Go to the documentation of this file.
1
2// File: recodebeam.h
3// Description: Beam search to decode from the re-encoded CJK as a sequence of
4// smaller numbers in place of a single large code.
5// Author: Ray Smith
6//
7// (C) Copyright 2015, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
21#define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
22
23#include "dawg.h"
24#include "dict.h"
25#include "genericheap.h"
26#include "kdpair.h"
27#include "networkio.h"
28#include "ratngs.h"
29#include "unicharcompress.h"
30#include <deque>
31#include <set>
32#include <tuple>
33#include <vector>
34
35namespace tesseract {
36
37// Enum describing what can follow the current node.
38// Consider the following softmax outputs:
39// Timestep 0 1 2 3 4 5 6 7 8
40// X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
41// Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
42// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
43// Then the correct CTC decoding (in which adjacent equal classes are folded,
44// and then all nulls are dropped) is clearly XYX, but simple decoding (taking
45// the max at each timestep) leads to:
46// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
47// which folds to the correct XYX. The conversion to Tesseract rating and
48// certainty uses the sum of the log probs (log of the product of probabilities)
49// for the Rating and the minimum log prob for the certainty, but that yields a
50// minimum certainty of log(0.55), which is poor for such an obvious case.
51// CTC says that the probability of the result is the SUM of the products of the
52// probabilities over ALL PATHS that decode to the same result, which includes:
53// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
54// That is intractable, so some compromise between simple and ideal is needed.
55// Observing that evenly split timesteps rarely happen next to each other, we
56// allow scores at a transition between classes to be added for decoding thus:
57// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
58// This works because NNX and NXX both decode to X, so in the middle we can use
59// N+X. Note that the classes either side of a sum must stand alone, i.e. use a
60// single score, to force all paths to pass through them and decode to the same
61// result. Also in the special case of a transition from X to Y, with only one
62// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
63// decode to XY.
64// An important condition is that we cannot combine X and Null between two
65// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
66// X and Null have to go in separate paths. Combining scores in this way
67// provides a much better minimum certainty of log(0.95).
68// In the implementation of the beam search, we have to place the possibilities
69// X, X+N and X+Y+N in the beam under appropriate conditions of the previous
70// node, and constrain what can follow, to enforce the rules explained above.
71// We therefore have 3 different types of node determined by what can follow:
73 NC_ANYTHING, // This node used just its own score, so anything can follow.
74 NC_ONLY_DUP, // The current node combined another score with the score for
75 // itself, without a stand-alone duplicate before, so must be
76 // followed by a stand-alone duplicate.
77 NC_NO_DUP, // The current node combined another score with the score for
78 // itself, after a stand-alone, so can only be followed by
79 // something other than a duplicate of the current node.
81};
82
83// Enum describing the top-n status of a code.
85 TN_TOP2, // Winner or 2nd.
86 TN_TOPN, // Runner up in top-n, but not 1st or 2nd.
87 TN_ALSO_RAN, // Not in the top-n.
89};
90
91// Lattice element for Re-encode beam search.
92struct RecodeNode {
94 : code(-1),
95 unichar_id(INVALID_UNICHAR_ID),
97 start_of_dawg(false),
98 start_of_word(false),
99 end_of_word(false),
100 duplicate(false),
101 certainty(0.0f),
102 score(0.0f),
103 prev(nullptr),
104 dawgs(nullptr),
105 code_hash(0) {}
106 RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
107 bool word_start, bool end, bool dup, float cert, float s,
108 const RecodeNode* p, DawgPositionVector* d, uint64_t hash)
109 : code(c),
110 unichar_id(uni_id),
111 permuter(perm),
112 start_of_dawg(dawg_start),
113 start_of_word(word_start),
114 end_of_word(end),
115 duplicate(dup),
116 certainty(cert),
117 score(s),
118 prev(p),
119 dawgs(d),
120 code_hash(hash) {}
121 // NOTE: If we could use C++11, then this would be a move constructor.
122 // Instead we have copy constructor that does a move!! This is because we
123 // don't want to copy the whole DawgPositionVector each time, and true
124 // copying isn't necessary for this struct. It does get moved around a lot
125 // though inside the heap and during heap push, hence the move semantics.
126 RecodeNode(RecodeNode& src) : dawgs(nullptr) {
127 *this = src;
128 ASSERT_HOST(src.dawgs == nullptr);
129 }
131 delete dawgs;
132 memcpy(this, &src, sizeof(src));
133 src.dawgs = nullptr;
134 return *this;
135 }
136 ~RecodeNode() { delete dawgs; }
137 // Prints details of the node.
138 void Print(int null_char, const UNICHARSET& unicharset, int depth) const;
139
140 // The re-encoded code here = index to network output.
141 int code;
142 // The decoded unichar_id is only valid for the final code of a sequence.
144 // The type of permuter active at this point. Intervals between start_of_word
145 // and end_of_word make valid words of type given by permuter where
146 // end_of_word is true. These aren't necessarily delimited by spaces.
148 // True if this is the initial dawg state. May be attached to a space or,
149 // in a non-space-delimited lang, the end of the previous word.
151 // True if this is the first node in a dictionary word.
153 // True if this represents a valid candidate end of word position. Does not
154 // necessarily mark the end of a word, since a word can be extended beyond a
155 // candidate end by a continuation, eg 'the' continues to 'these'.
157 // True if this->code is a duplicate of prev->code. Some training modes
158 // allow the network to output duplicate characters and crush them with CTC,
159 // but that would mess up the dictionary search, so we just smash them
160 // together on the fly using the duplicate flag.
162 // Certainty (log prob) of (just) this position.
164 // Total certainty of the path to this position.
165 float score;
166 // The previous node in this chain. Borrowed pointer.
168 // The currently active dawgs at this position. Owned pointer.
170 // A hash of all codes in the prefix and this->code as well. Used for
171 // duplicate path removal.
172 uint64_t code_hash;
173};
174
177
178// Class that holds the entire beam search for recognition of a text line.
180 public:
181 // Borrows the pointer, which is expected to survive until *this is deleted.
182 RecodeBeamSearch(const UnicharCompress& recoder, int null_char,
183 bool simple_text, Dict* dict);
184
185 // Decodes the set of network outputs, storing the lattice internally.
186 // If charset is not null, it enables detailed debugging of the beam search.
187 void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
188 double worst_dict_cert, const UNICHARSET* charset,
189 int lstm_choice_mode = 0);
190 void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
191 double cert_offset, double worst_dict_cert,
192 const UNICHARSET* charset);
193
194 // Returns the best path as labels/scores/xcoords similar to simple CTC.
196 GenericVector<int>* xcoords) const;
197 // Returns the best path as unichar-ids/certs/ratings/xcoords skipping
198 // duplicates, nulls and intermediate parts.
199 void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET* unicharset,
200 GenericVector<int>* unichar_ids,
202 GenericVector<float>* ratings,
203 GenericVector<int>* xcoords) const;
204
205 // Returns the best path as a set of WERD_RES.
206 void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
207 bool debug, const UNICHARSET* unicharset,
209 int lstm_choice_mode = 0);
210
211 // Generates debug output of the content of the beams after a Decode.
212 void DebugBeams(const UNICHARSET& unicharset) const;
213
214 // Stores the alternative characters of every timestep together with their
215 // probability.
216 std::vector< std::vector<std::pair<const char*, float>>> timesteps;
217
218 // Clipping value for certainty inside Tesseract. Reflects the minimum value
219 // of certainty that will be returned by ExtractBestPathAsUnicharIds.
220 // Supposedly on a uniform scale that can be compared across languages and
221 // engines.
222 static constexpr float kMinCertainty = -20.0f;
223 // Number of different code lengths for which we have a separate beam.
225 // Total number of beams: dawg/nodawg * number of NodeContinuation * number
226 // of different lengths.
227 static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
228 // Returns the relevant factor in the beams_ index.
229 static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
231 return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
232 }
233 static bool IsDawgFromBeamsIndex(int index) {
234 return index / (kNumLengths * NC_COUNT) > 0;
235 }
236 // Computes a beams_ index from the given factors.
237 static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
238 return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
239 }
240
241 private:
242 // Struct for the Re-encode beam search. This struct holds the data for
243 // a single time-step position of the output. Use a PointerVector<RecodeBeam>
244 // to hold all the timesteps and prevent reallocation of the individual heaps.
245 struct RecodeBeam {
246 // Resets to the initial state without deleting all the memory.
247 void Clear() {
248 for (auto & beam : beams_) {
249 beam.clear();
250 }
251 RecodeNode empty;
252 for (auto & best_initial_dawg : best_initial_dawgs_) {
253 best_initial_dawg = empty;
254 }
255 }
256
257 // A separate beam for each combination of code length,
258 // NodeContinuation, and dictionary flag. Separating out all these types
259 // allows the beam to be quite narrow, and yet still have a low chance of
260 // losing the best path.
261 // We have to keep all these beams separate, since the highest scoring paths
262 // come from the paths that are most likely to dead-end at any time, like
263 // dawg paths, NC_ONLY_DUP etc.
264 // Each heap is stored with the WORST result at the top, so we can quickly
265 // get the top-n values.
266 RecodeHeap beams_[kNumBeams];
267 // While the language model is only a single word dictionary, we can use
268 // word starts as a choke point in the beam, and keep only a single dict
269 // start node at each step (for each NodeContinuation type), so we find the
270 // best one here and push it on the heap, if it qualifies, after processing
271 // all of the step.
272 RecodeNode best_initial_dawgs_[NC_COUNT];
273 };
274 using TopPair = KDPairInc<float, int>;
275
276 // Generates debug output of the content of a single beam position.
277 void DebugBeamPos(const UNICHARSET& unicharset, const RecodeHeap& heap) const;
278
279 // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
280 // duplicates, nulls and intermediate parts.
281 static void ExtractPathAsUnicharIds(
282 const GenericVector<const RecodeNode*>& best_nodes,
283 GenericVector<int>* unichar_ids, GenericVector<float>* certs,
284 GenericVector<float>* ratings, GenericVector<int>* xcoords,
285 std::deque<std::tuple<int, int>>* best_choices = nullptr);
286
287 // Sets up a word with the ratings matrix and fake blobs with boxes in the
288 // right places.
289 WERD_RES* InitializeWord(bool leading_space, const TBOX& line_box,
290 int word_start, int word_end, float space_certainty,
291 const UNICHARSET* unicharset,
292 const GenericVector<int>& xcoords,
293 float scale_factor);
294
295 // Fills top_n_flags_ with bools that are true iff the corresponding output
296 // is one of the top_n.
297 void ComputeTopN(const float* outputs, int num_outputs, int top_n);
298
299 // Adds the computation for the current time-step to the beam. Call at each
300 // time-step in sequence from left to right. outputs is the activation vector
301 // for the current timestep.
302 void DecodeStep(const float* outputs, int t, double dict_ratio,
303 double cert_offset, double worst_dict_cert,
304 const UNICHARSET* charset, bool debug = false);
305
306 //Saves the most certain choices for the current time-step
307 void SaveMostCertainChoices(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
308
309 // Adds to the appropriate beams the legal (according to recoder)
310 // continuations of context prev, which is from the given index to beams_,
311 // using the given network outputs to provide scores to the choices. Uses only
312 // those choices for which top_n_flags[code] == top_n_flag.
313 void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
314 TopNState top_n_flag, const UNICHARSET* unicharset,
315 double dict_ratio, double cert_offset,
316 double worst_dict_cert, RecodeBeam* step);
317 // Continues for a new unichar, using dawg or non-dawg as per flag.
318 void ContinueUnichar(int code, int unichar_id, float cert,
319 float worst_dict_cert, float dict_ratio, bool use_dawgs,
320 NodeContinuation cont, const RecodeNode* prev,
321 RecodeBeam* step);
322 // Adds a RecodeNode composed of the args to the correct heap in step if
323 // unichar_id is a valid dictionary continuation of whatever is in prev.
324 void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
325 const RecodeNode* prev, RecodeBeam* step);
326 // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
327 // if better than what is already there.
328 void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
329 bool start, bool end, float cert,
330 NodeContinuation cont, const RecodeNode* prev,
331 RecodeBeam* step);
332 // Adds a RecodeNode composed of the args to the correct heap in step for
333 // partial unichar or duplicate if there is room or if better than the
334 // current worst element if already full.
335 void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
336 float cert, float worst_dict_cert,
337 float dict_ratio, bool use_dawgs,
338 NodeContinuation cont, const RecodeNode* prev,
339 RecodeBeam* step);
340 // Adds a RecodeNode composed of the args to the correct heap in step if there
341 // is room or if better than the current worst element if already full.
342 void PushHeapIfBetter(int max_size, int code, int unichar_id,
343 PermuterType permuter, bool dawg_start, bool word_start,
344 bool end, bool dup, float cert, const RecodeNode* prev,
345 DawgPositionVector* d, RecodeHeap* heap);
346 // Adds a RecodeNode to heap if there is room
347 // or if better than the current worst element if already full.
348 void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
349 // Searches the heap for an entry matching new_node, and updates the entry
350 // with reshuffle if needed. Returns true if there was a match.
351 bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
352 // Computes and returns the code-hash for the given code and prev.
353 uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
354 // Backtracks to extract the best path through the lattice that was built
355 // during Decode. On return the best_nodes vector essentially contains the set
356 // of code, score pairs that make the optimal path with the constraint that
357 // the recoder can decode the code sequence back to a sequence of unichar-ids.
358 void ExtractBestPaths(GenericVector<const RecodeNode*>* best_nodes,
359 GenericVector<const RecodeNode*>* second_nodes) const;
360 // Helper backtracks through the lattice from the given node, storing the
361 // path and reversing it.
362 void ExtractPath(const RecodeNode* node,
364 // Helper prints debug information on the given lattice path.
365 void DebugPath(const UNICHARSET* unicharset,
366 const GenericVector<const RecodeNode*>& path) const;
367 // Helper prints debug information on the given unichar path.
368 void DebugUnicharPath(const UNICHARSET* unicharset,
370 const GenericVector<int>& unichar_ids,
371 const GenericVector<float>& certs,
372 const GenericVector<float>& ratings,
373 const GenericVector<int>& xcoords) const;
374
375 static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];
376
377 // The encoder/decoder that we will be using.
378 const UnicharCompress& recoder_;
379 // The beam for each timestep in the output.
380 PointerVector<RecodeBeam> beam_;
381 // The number of timesteps valid in beam_;
382 int beam_size_;
383 // A flag to indicate which outputs are the top-n choices. Current timestep
384 // only.
385 GenericVector<TopNState> top_n_flags_;
386 // A record of the highest and second scoring codes.
387 int top_code_;
388 int second_code_;
389 // Heap used to compute the top_n_flags_.
390 GenericHeap<TopPair> top_heap_;
391 // Borrowed pointer to the dictionary to use in the search.
392 Dict* dict_;
393 // True if the language is space-delimited, which is true for most languages
394 // except chi*, jpn, tha.
395 bool space_delimited_;
396 // True if the input is simple text, ie adjacent equal chars are not to be
397 // eliminated.
398 bool is_simple_text_;
399 // The encoded (class label) of the null/reject character.
400 int null_char_;
401};
402
403} // namespace tesseract.
404
405#endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
PermuterType
Definition: ratngs.h:232
@ TOP_CHOICE_PERM
Definition: ratngs.h:235
#define ASSERT_HOST(x)
Definition: errcode.h:88
@ TN_ALSO_RAN
Definition: recodebeam.h:87
GenericHeap< RecodePair > RecodeHeap
Definition: recodebeam.h:176
NodeContinuation
Definition: recodebeam.h:72
@ NC_ANYTHING
Definition: recodebeam.h:73
@ NC_ONLY_DUP
Definition: recodebeam.h:74
Definition: rect.h:34
static const int kMaxCodeLen
const RecodeNode * prev
Definition: recodebeam.h:167
RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, bool word_start, bool end, bool dup, float cert, float s, const RecodeNode *p, DawgPositionVector *d, uint64_t hash)
Definition: recodebeam.h:106
void Print(int null_char, const UNICHARSET &unicharset, int depth) const
Definition: recodebeam.cpp:42
RecodeNode(RecodeNode &src)
Definition: recodebeam.h:126
DawgPositionVector * dawgs
Definition: recodebeam.h:169
RecodeNode & operator=(RecodeNode &src)
Definition: recodebeam.h:130
PermuterType permuter
Definition: recodebeam.h:147
void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, GenericVector< int > *unichar_ids, GenericVector< float > *certs, GenericVector< float > *ratings, GenericVector< int > *xcoords) const
Definition: recodebeam.cpp:156
void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode=0)
Definition: recodebeam.cpp:76
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: recodebeam.h:216
static bool IsDawgFromBeamsIndex(int index)
Definition: recodebeam.h:233
void ExtractBestPathAsLabels(GenericVector< int > *labels, GenericVector< int > *xcoords) const
Definition: recodebeam.cpp:133
static int LengthFromBeamsIndex(int index)
Definition: recodebeam.h:229
static const int kNumLengths
Definition: recodebeam.h:224
static NodeContinuation ContinuationFromBeamsIndex(int index)
Definition: recodebeam.h:230
void DebugBeams(const UNICHARSET &unicharset) const
Definition: recodebeam.cpp:303
static const int kNumBeams
Definition: recodebeam.h:227
static constexpr float kMinCertainty
Definition: recodebeam.h:222
static int BeamIndex(bool is_dawg, NodeContinuation cont, int length)
Definition: recodebeam.h:237
void ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, const UNICHARSET *unicharset, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
Definition: recodebeam.cpp:171