tesseract 4.1.1
Loading...
Searching...
No Matches
dawg.cpp
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: dawg.cpp (Formerly dawg.c)
5 * Description: Use a Directed Acyclic Word Graph
6 * Author: Mark Seaman, OCR Technology
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20/*----------------------------------------------------------------------
21 I n c l u d e s
22----------------------------------------------------------------------*/
23
24#include "dawg.h"
25
26#include "dict.h"
27#include "helpers.h"
28#include "strngs.h"
29#include "tesscallback.h"
30#include "tprintf.h"
31
32#include <memory>
33
34/*----------------------------------------------------------------------
35 F u n c t i o n s f o r D a w g
36----------------------------------------------------------------------*/
37namespace tesseract {
38
39// Destructor.
40// It is defined here, so the compiler can create a single vtable
41// instead of weak vtables in every compilation unit.
42Dawg::~Dawg() = default;
43
45 bool requires_complete) const {
46 if (word.length() == 0) return !requires_complete;
47 NODE_REF node = 0;
48 int end_index = word.length() - 1;
49 for (int i = 0; i < end_index; i++) {
50 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
51 if (edge == NO_EDGE) {
52 return false;
53 }
54 if ((node = next_node(edge)) == 0) {
55 // This only happens if all words following this edge terminate --
56 // there are no larger words. See Trie::add_word_to_dawg()
57 return false;
58 }
59 }
60 // Now check the last character.
61 return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
62 NO_EDGE;
63}
64
65bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
66 return prefix_in_dawg(word, true);
67}
68
69int Dawg::check_for_words(const char *filename,
70 const UNICHARSET &unicharset,
71 bool enable_wildcard) const {
72 if (filename == nullptr) return 0;
73
74 FILE *word_file;
75 char string [CHARS_PER_LINE];
76 int misses = 0;
77 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
78
79 word_file = fopen(filename, "r");
80 if (word_file == nullptr) {
81 tprintf("Error: Could not open file %s\n", filename);
82 ASSERT_HOST(word_file);
83 }
84
85 while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
86 chomp_string(string); // remove newline
87 WERD_CHOICE word(string, unicharset);
88 if (word.length() > 0 &&
89 !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
90 if (!match_words(&word, 0, 0,
91 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
92 tprintf("Missing word: %s\n", string);
93 ++misses;
94 }
95 } else {
96 tprintf("Failed to create a valid word from %s\n", string);
97 }
98 }
99 fclose (word_file);
100 // Make sure the user sees this with fprintf instead of tprintf.
101 if (debug_level_) tprintf("Number of lost words=%d\n", misses);
102 return misses;
103}
104
105void Dawg::iterate_words(const UNICHARSET &unicharset,
107 WERD_CHOICE word(&unicharset);
108 iterate_words_rec(word, 0, cb);
109}
110
111static void CallWithUTF8(TessCallback1<const char *> *cb,
112 const WERD_CHOICE *wc) {
113 STRING s;
114 wc->string_and_lengths(&s, nullptr);
115 cb->Run(s.string());
116}
117
118void Dawg::iterate_words(const UNICHARSET &unicharset,
119 TessCallback1<const char *> *cb) const {
120 std::unique_ptr<TessCallback1<const WERD_CHOICE *>> shim(
121 NewPermanentTessCallback(CallWithUTF8, cb));
122 WERD_CHOICE word(&unicharset);
123 iterate_words_rec(word, 0, shim.get());
124}
125
126void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
127 NODE_REF to_explore,
129 NodeChildVector children;
130 this->unichar_ids_of(to_explore, &children, false);
131 for (int i = 0; i < children.size(); i++) {
132 WERD_CHOICE next_word(word_so_far);
133 next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
134 if (this->end_of_word(children[i].edge_ref)) {
135 cb->Run(&next_word);
136 }
137 NODE_REF next = next_node(children[i].edge_ref);
138 if (next != 0) {
139 iterate_words_rec(next_word, next, cb);
140 }
141 }
142}
143
144bool Dawg::match_words(WERD_CHOICE *word, int32_t index,
145 NODE_REF node, UNICHAR_ID wildcard) const {
146 EDGE_REF edge;
147 int32_t word_end;
148
149 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
150 bool any_matched = false;
151 NodeChildVector vec;
152 this->unichar_ids_of(node, &vec, false);
153 for (int i = 0; i < vec.size(); ++i) {
154 word->set_unichar_id(vec[i].unichar_id, index);
155 if (match_words(word, index, node, wildcard))
156 any_matched = true;
157 }
158 word->set_unichar_id(wildcard, index);
159 return any_matched;
160 } else {
161 word_end = index == word->length() - 1;
162 edge = edge_char_of(node, word->unichar_id(index), word_end);
163 if (edge != NO_EDGE) { // normal edge in DAWG
164 node = next_node(edge);
165 if (word_end) {
166 if (debug_level_ > 1) word->print("match_words() found: ");
167 return true;
168 } else if (node != 0) {
169 return match_words(word, index+1, node, wildcard);
170 }
171 }
172 }
173 return false;
174}
175
176void Dawg::init(int unicharset_size) {
177 ASSERT_HOST(unicharset_size > 0);
178 unicharset_size_ = unicharset_size;
179 // Set bit masks. We will use the value unicharset_size_ as a null char, so
180 // the actual number of unichars is unicharset_size_ + 1.
181 flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
183 letter_mask_ = ~(~0ull << flag_start_bit_);
186}
187
188
189/*----------------------------------------------------------------------
190 F u n c t i o n s f o r S q u i s h e d D a w g
191----------------------------------------------------------------------*/
192
193SquishedDawg::~SquishedDawg() { delete[] edges_; }
194
196 UNICHAR_ID unichar_id,
197 bool word_end) const {
198 EDGE_REF edge = node;
199 if (node == 0) { // binary search
200 EDGE_REF start = 0;
201 EDGE_REF end = num_forward_edges_in_node0 - 1;
202 int compare;
203 while (start <= end) {
204 edge = (start + end) >> 1; // (start + end) / 2
205 compare = given_greater_than_edge_rec(NO_EDGE, word_end,
206 unichar_id, edges_[edge]);
207 if (compare == 0) { // given == vec[k]
208 return edge;
209 } else if (compare == 1) { // given > vec[k]
210 start = edge + 1;
211 } else { // given < vec[k]
212 end = edge - 1;
213 }
214 }
215 } else { // linear search
216 if (edge != NO_EDGE && edge_occupied(edge)) {
217 do {
218 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
219 (!word_end || end_of_word_from_edge_rec(edges_[edge])))
220 return (edge);
221 } while (!last_edge(edge++));
222 }
223 }
224 return (NO_EDGE); // not found
225}
226
227int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
228 EDGE_REF edge = node;
229 int32_t num = 0;
230
231 if (forward_edge (edge)) {
232 do {
233 num++;
234 } while (!last_edge(edge++));
235 }
236
237 return (num);
238}
239
240void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
241 if (node == NO_EDGE) return; // nothing to print
242
243 EDGE_REF edge = node;
244 const char *forward_string = "FORWARD";
245 const char *backward_string = " ";
246
247 const char *last_string = "LAST";
248 const char *not_last_string = " ";
249
250 const char *eow_string = "EOW";
251 const char *not_eow_string = " ";
252
253 const char *direction;
254 const char *is_last;
255 const char *eow;
256
257 UNICHAR_ID unichar_id;
258
259 if (edge_occupied(edge)) {
260 do {
261 direction =
262 forward_edge(edge) ? forward_string : backward_string;
263 is_last = last_edge(edge) ? last_string : not_last_string;
264 eow = end_of_word(edge) ? eow_string : not_eow_string;
265
266 unichar_id = edge_letter(edge);
267 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
268 edge, next_node(edge), unichar_id,
269 direction, is_last, eow);
270
271 if (edge - node > max_num_edges) return;
272 } while (!last_edge(edge++));
273
274 if (edge < num_edges_ &&
275 edge_occupied(edge) && backward_edge(edge)) {
276 do {
277 direction =
278 forward_edge(edge) ? forward_string : backward_string;
279 is_last = last_edge(edge) ? last_string : not_last_string;
280 eow = end_of_word(edge) ? eow_string : not_eow_string;
281
282 unichar_id = edge_letter(edge);
283 tprintf(REFFORMAT " : next = " REFFORMAT
284 ", unichar_id = %d, %s %s %s\n",
285 edge, next_node(edge), unichar_id,
286 direction, is_last, eow);
287
288 if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
289 } while (!last_edge(edge++));
290 }
291 }
292 else {
293 tprintf(REFFORMAT " : no edges in this node\n", node);
294 }
295 tprintf("\n");
296}
297
298void SquishedDawg::print_edge(EDGE_REF edge) const {
299 if (edge == NO_EDGE) {
300 tprintf("NO_EDGE\n");
301 } else {
302 tprintf(REFFORMAT " : next = " REFFORMAT
303 ", unichar_id = '%d', %s %s %s\n", edge,
304 next_node(edge), edge_letter(edge),
305 (forward_edge(edge) ? "FORWARD" : " "),
306 (last_edge(edge) ? "LAST" : " "),
307 (end_of_word(edge) ? "EOW" : ""));
308 }
309}
310
311bool SquishedDawg::read_squished_dawg(TFile *file) {
312 if (debug_level_) tprintf("Reading squished dawg\n");
313
314 // Read the magic number and check that it matches kDawgMagicNumber, as
315 // auto-endian fixing should make sure it is always correct.
316 int16_t magic;
317 if (!file->DeSerialize(&magic)) return false;
318 if (magic != kDawgMagicNumber) {
319 tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
320 return false;
321 }
322
323 int32_t unicharset_size;
324 if (!file->DeSerialize(&unicharset_size)) return false;
325 if (!file->DeSerialize(&num_edges_)) return false;
326 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
327 Dawg::init(unicharset_size);
328
329 edges_ = new EDGE_RECORD[num_edges_];
330 if (!file->DeSerialize(&edges_[0], num_edges_)) return false;
331 if (debug_level_ > 2) {
332 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
333 type_, lang_.string(), perm_, unicharset_size_, num_edges_);
334 for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
335 }
336 return true;
337}
338
339std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
340 int32_t *num_nodes) const {
341 EDGE_REF edge;
342 std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
343 int32_t node_counter;
344 int32_t num_edges;
345
346 for (edge = 0; edge < num_edges_; edge++) // init all slots
347 node_map[edge] = -1;
348
349 node_counter = num_forward_edges(0);
350
351 *num_nodes = 0;
352 for (edge = 0; edge < num_edges_; edge++) { // search all slots
353
354 if (forward_edge(edge)) {
355 (*num_nodes)++; // count nodes links
356 node_map[edge] = (edge ? node_counter : 0);
357 num_edges = num_forward_edges(edge);
358 if (edge != 0) node_counter += num_edges;
359 edge += num_edges;
360 if (edge >= num_edges_) break;
361 if (backward_edge(edge)) while (!last_edge(edge++));
362 edge--;
363 }
364 }
365 return node_map;
366}
367
369 EDGE_REF edge;
370 int32_t num_edges;
371 int32_t node_count = 0;
372 EDGE_REF old_index;
373 EDGE_RECORD temp_record;
374
375 if (debug_level_) tprintf("write_squished_dawg\n");
376
377 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
378
379 // Write the magic number to help detecting a change in endianness.
380 int16_t magic = kDawgMagicNumber;
381 if (!file->Serialize(&magic)) return false;
382 if (!file->Serialize(&unicharset_size_)) return false;
383
384 // Count the number of edges in this Dawg.
385 num_edges = 0;
386 for (edge=0; edge < num_edges_; edge++)
387 if (forward_edge(edge))
388 num_edges++;
389
390 // Write edge count to file.
391 if (!file->Serialize(&num_edges)) return false;
392
393 if (debug_level_) {
394 tprintf("%d nodes in DAWG\n", node_count);
395 tprintf("%d edges in DAWG\n", num_edges);
396 }
397
398 for (edge = 0; edge < num_edges_; edge++) {
399 if (forward_edge(edge)) { // write forward edges
400 do {
401 old_index = next_node_from_edge_rec(edges_[edge]);
402 set_next_node(edge, node_map[old_index]);
403 temp_record = edges_[edge];
404 if (!file->Serialize(&temp_record)) return false;
405 set_next_node(edge, old_index);
406 } while (!last_edge(edge++));
407
408 if (edge >= num_edges_) break;
409 if (backward_edge(edge)) // skip back links
410 while (!last_edge(edge++));
411
412 edge--;
413 }
414 }
415 return true;
416}
417
418} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
void chomp_string(char *str)
Definition: helpers.h:77
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:83
int64_t NODE_REF
Definition: dawg.h:52
#define REFFORMAT
Definition: dawg.h:89
uint64_t EDGE_RECORD
Definition: dawg.h:49
#define NUM_FLAG_BITS
Definition: dawg.h:88
int64_t EDGE_REF
Definition: dawg.h:51
#define CHARS_PER_LINE
Definition: dict.h:38
int size() const
Definition: genericvector.h:72
virtual void Run(A1)=0
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:330
int length() const
Definition: ratngs.h:293
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void print() const
Definition: ratngs.h:570
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:209
int flag_start_bit_
Definition: dawg.h:309
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:44
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
uint64_t flags_mask_
Definition: dawg.h:306
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
virtual bool end_of_word(EDGE_REF edge_ref) const =0
uint64_t next_node_mask_
Definition: dawg.h:305
int next_node_start_bit_
Definition: dawg.h:310
DawgType type_
Definition: dawg.h:298
int unicharset_size_
Definition: dawg.h:308
void init(int unicharset_size)
Definition: dawg.cpp:176
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:144
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:222
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE * > *cb) const
Definition: dawg.cpp:126
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:226
STRING lang_
Definition: dawg.h:297
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:69
uint64_t letter_mask_
Definition: dawg.h:307
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:300
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int debug_level_
Definition: dawg.h:312
virtual ~Dawg()
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE * > *cb) const
Definition: dawg.cpp:105
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:118
bool end_of_word(EDGE_REF edge_ref) const override
Definition: dawg.h:467
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:472
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:195
NODE_REF next_node(EDGE_REF edge) const override
Definition: dawg.h:461
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:368
void print_node(NODE_REF node, int max_num_edges) const override
Definition: dawg.cpp:240
~SquishedDawg() override
Definition: dawg.cpp:193