tesseract 4.1.1
Loading...
Searching...
No Matches
lm_pain_points.h
Go to the documentation of this file.
1
2// File: lm_pain_points.h
3// Description: Functions that utilize the knowledge about the properties
4// of the paths explored by the segmentation search in order
5// to generate "pain points" - the locations in the ratings
6// matrix which should be classified next.
7// Author: Rika Antonova
8// Created: Mon Jun 20 11:26:43 PST 2012
9//
10// (C) Copyright 2012, Google Inc.
11// Licensed under the Apache License, Version 2.0 (the "License");
12// you may not use this file except in compliance with the License.
13// You may obtain a copy of the License at
14// http://www.apache.org/licenses/LICENSE-2.0
15// Unless required by applicable law or agreed to in writing, software
16// distributed under the License is distributed on an "AS IS" BASIS,
17// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18// See the License for the specific language governing permissions and
19// limitations under the License.
20//
22
23#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
24#define TESSERACT_WORDREC_PAIN_POINTS_H_
25
26#include "genericheap.h" // for GenericHeap
27#include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
28#include "stopper.h" // for DANGERR
29
30class WERD_RES;
31
32namespace tesseract {
33
34class Dict;
35struct ViterbiStateEntry;
36
37// Heap of pain points used for determining where to chop/join.
39
40// Types of pain points (ordered in the decreasing level of importance).
46
48};
49
// Printable names of the pain point types. PainPointDescription() indexes
// this table directly with an LMPainPointsType value.
// NOTE(review): the entry order presumably mirrors the LMPainPointsType
// enumerators — confirm whenever a type is added or reordered.
50static const char * const LMPainPointsTypeName[] = {
51 "LM_PPTYPE_BLAMER",
52 "LM_PPTYPE_AMBIGS",
53 "LM_PPTYPE_PATH",
54 "LM_PPTYPE_SHAPE",
55};
56
58 public:
59
61 // If there is a significant drop in character ngram probability or a
62 // dangerous ambiguity, make the thresholds on what blob combinations
63 // can be classified looser.
64 static const float kLooseMaxCharWhRatio;
65 // Returns a description of the type of a pain point: the entry of
   // LMPainPointsTypeName at index `type`. No range check is performed.
   // NOTE(review): the name table has four entries, so a value such as
   // LM_PPTYPE_NUM (which Deque() is documented to return for an empty queue)
   // would presumably index past its end — confirm callers never pass it.
66 static const char* PainPointDescription(LMPainPointsType type) {
67 return LMPainPointsTypeName[type];
68 }
69
70 LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) :
   // Stores the arguments in the corresponding members and does nothing else:
   //   max -> maximum number of points kept in a heap,
   //   rat -> maximum character width/height ratio,
   //   fp  -> whether fixed pitch should be assumed,
   //   d   -> dictionary pointer (the pointer itself is kept, not a copy),
   //   deb -> debug level for print statements.
71 max_heap_size_(max), max_char_wh_ratio_(rat), fixed_pitch_(fp),
72 dict_(d), debug_level_(deb) {}
74
75 // Returns true if the heap holding pain points of type pp_type is non-empty.
   // pp_type is used directly as an index into pain_points_heaps_ (no range
   // check), so it must be a valid heap index.
76 inline bool HasPainPoints(LMPainPointsType pp_type) const {
77 return !pain_points_heaps_[pp_type].empty();
78 }
79
80 // Dequeues the next pain point from the pain points queue and copies
81 // its contents and priority to *pp and *priority.
82 // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
83 LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84
85 // Clears the pain point heap of every type, not just a single heap.
86 void Clear() {
87 for (auto & pain_points_heap : pain_points_heaps_) pain_points_heap.clear();
88 }
89
90 // For each cell, generate a "pain point" if the cell is not classified
91 // and has a left or right neighbor that was classified.
92 void GenerateInitial(WERD_RES *word_res);
93
94 // Generate pain points from the given path.
95 void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,
96 WERD_RES *word_res);
97
98 // Generate pain points from dangerous ambiguities in best choice.
99 void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,
100 WERD_RES *word_res);
101
102 // Generate a pain point of type LM_PPTYPE_BLAMER at (col, row) for the blamer.
    // Forwards to GeneratePainPoint with special_priority 0.0 and ok_to_extend
    // false, and returns its result.
    // Note: max_char_wh_ratio is a double here but a float parameter of
    // GeneratePainPoint, so the value is narrowed on the call.
103 bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res,
104 int col, int row) {
105 return GeneratePainPoint(col, row, LM_PPTYPE_BLAMER, 0.0, false,
106 max_char_wh_ratio, word_res);
107 }
108
109 // Adds a pain point to classify chunks_record->ratings(col, row).
110 // Returns true if a new pain point was added to an appropriate heap.
111 // Pain point priority is set to special_priority for pain points of
112 // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH; for other pain points,
113 // AssociateStats::gap_sum is used.
114 bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
115 float special_priority, bool ok_to_extend,
116 float max_char_wh_ratio,
117 WERD_RES *word_res);
118
119 // Adjusts the pain point coordinates to cope with expansion of the ratings
120 // matrix due to a split of the blob with the given index.
121 void RemapForSplit(int index);
122
123 private:
124 // Priority queues containing pain points generated by the language model.
125 // The priority is set by the language model components; adjustments like
126 // seam cost and width priority are factored into the priority.
127 PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
128 // Maximum number of points to keep in the heap.
129 int max_heap_size_;
130 // Maximum character width/height ratio.
131 float max_char_wh_ratio_;
132 // Set to true if fixed pitch should be assumed.
133 bool fixed_pitch_;
134 // Cached pointer to dictionary.
135 const Dict *dict_;
136 // Debug level for print statements.
137 int debug_level_;
138};
139
140} // namespace tesseract
141
142#endif // TESSERACT_WORDREC_PAIN_POINTS_H_
bool empty() const
Definition: genericheap.h:68
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
void RemapForSplit(int index)
bool HasPainPoints(LMPainPointsType pp_type) const
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
void GenerateInitial(WERD_RES *word_res)
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
static const float kLooseMaxCharWhRatio
static const float kDefaultPainPointPriorityAdjustment
static const char * PainPointDescription(LMPainPointsType type)
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)