tesseract 4.1.1
Loading...
Searching...
No Matches
trie.cpp
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: trie.cpp (Formerly trie.c)
5 * Description: Functions to build a trie data structure.
6 * Author: Mark Seaman, OCR Technology
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20/*----------------------------------------------------------------------
21 I n c l u d e s
22----------------------------------------------------------------------*/
23
24#include "trie.h"
25
26#include "callcpp.h"
27#include "dawg.h"
28#include "dict.h"
29#include "genericvector.h"
30#include "helpers.h"
31#include "kdpair.h"
32
33namespace tesseract {
34
35const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
36const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
37const char kForceReverse[] = "RRP_FORCE_REVERSE";
38
39const char * const RTLReversePolicyNames[] = {
43};
44
45const char Trie::kAlphaPatternUnicode[] = "\u2000";
46const char Trie::kDigitPatternUnicode[] = "\u2001";
47const char Trie::kAlphanumPatternUnicode[] = "\u2002";
48const char Trie::kPuncPatternUnicode[] = "\u2003";
49const char Trie::kLowerPatternUnicode[] = "\u2004";
50const char Trie::kUpperPatternUnicode[] = "\u2005";
51
53 return RTLReversePolicyNames[reverse_policy];
54}
55
56// Reset the Trie to empty.
59 nodes_.clear();
61 num_edges_ = 0;
62 new_dawg_node(); // Need to allocate node 0.
63}
64
65bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
66 int direction, bool word_end, UNICHAR_ID unichar_id,
67 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
68 if (debug_level_ == 3) {
69 tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
70 " direction %d word_end %d unichar_id %d, exploring node:\n",
71 node_ref, next_node, direction, word_end, unichar_id);
72 if (node_ref != NO_EDGE) {
73 print_node(node_ref, nodes_[node_ref]->forward_edges.size());
74 }
75 }
76 if (node_ref == NO_EDGE) return false;
77 assert(node_ref < nodes_.size());
78 EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
79 nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
80 int vec_size = vec.size();
81 if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
82 EDGE_INDEX start = 0;
83 EDGE_INDEX end = vec_size - 1;
84 EDGE_INDEX k;
85 int compare;
86 while (start <= end) {
87 k = (start + end) >> 1; // (start + end) / 2
88 compare = given_greater_than_edge_rec(next_node, word_end,
89 unichar_id, vec[k]);
90 if (compare == 0) { // given == vec[k]
91 *edge_ptr = &(vec[k]);
92 *edge_index = k;
93 return true;
94 } else if (compare == 1) { // given > vec[k]
95 start = k + 1;
96 } else { // given < vec[k]
97 end = k - 1;
98 }
99 }
100 } else { // linear search
101 for (int i = 0; i < vec_size; ++i) {
102 EDGE_RECORD &edge_rec = vec[i];
103 if (edge_rec_match(next_node, word_end, unichar_id,
104 next_node_from_edge_rec(edge_rec),
106 unichar_id_from_edge_rec(edge_rec))) {
107 *edge_ptr = &(edge_rec);
108 *edge_index = i;
109 return true;
110 }
111 }
112 }
113 return false; // not found
114}
115
116bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
117 int direction, bool word_end,
118 UNICHAR_ID unichar_id) {
119 EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
120 &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
121 int search_index;
122 if (node1 == 0 && direction == FORWARD_EDGE) {
123 search_index = 0; // find the index to make the add sorted
124 while (search_index < vec->size() &&
125 given_greater_than_edge_rec(node2, word_end, unichar_id,
126 (*vec)[search_index]) == 1) {
127 search_index++;
128 }
129 } else {
130 search_index = vec->size(); // add is unsorted, so index does not matter
131 }
132 EDGE_RECORD edge_rec;
133 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
134 if (node1 == 0 && direction == BACKWARD_EDGE &&
137 (*vec)[edge_index] = edge_rec;
138 } else if (search_index < vec->size()) {
139 vec->insert(edge_rec, search_index);
140 } else {
141 vec->push_back(edge_rec);
142 }
143 if (debug_level_ > 1) {
144 tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
145 print_edge_rec(edge_rec);
146 tprintf("\n");
147 }
148 num_edges_++;
149 return true;
150}
151
153 NODE_REF the_next_node,
154 bool marker_flag,
155 UNICHAR_ID unichar_id) {
156 EDGE_RECORD *back_edge_ptr;
157 EDGE_INDEX back_edge_index;
158 ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
159 unichar_id, &back_edge_ptr, &back_edge_index));
160 if (marker_flag) {
161 *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
162 *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
163 }
164 // Mark both directions as end of word.
165 *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
166 *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
167}
168
170 const GenericVector<bool> *repetitions) {
171 if (word.length() <= 0) return false; // can't add empty words
172 if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
173 // Make sure the word does not contain invalid unchar ids.
174 for (int i = 0; i < word.length(); ++i) {
175 if (word.unichar_id(i) < 0 ||
176 word.unichar_id(i) >= unicharset_size_) return false;
177 }
178
179 EDGE_RECORD *edge_ptr;
180 NODE_REF last_node = 0;
181 NODE_REF the_next_node;
182 bool marker_flag = false;
183 EDGE_INDEX edge_index;
184 int i;
185 int32_t still_finding_chars = true;
186 int32_t word_end = false;
187 bool add_failed = false;
188 bool found;
189
190 if (debug_level_ > 1) word.print("\nAdding word: ");
191
192 UNICHAR_ID unichar_id;
193 for (i = 0; i < word.length() - 1; ++i) {
194 unichar_id = word.unichar_id(i);
195 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
196 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
197 if (still_finding_chars) {
198 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
199 unichar_id, &edge_ptr, &edge_index);
200 if (found && debug_level_ > 1) {
201 tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
202 edge_index, last_node);
203 }
204 if (!found) {
205 still_finding_chars = false;
206 } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
207 // We hit the end of an existing word, but the new word is longer.
208 // In this case we have to disconnect the existing word from the
209 // backwards root node, mark the current position as end-of-word
210 // and add new nodes for the increased length. Disconnecting the
211 // existing word from the backwards root node requires a linear
212 // search, so it is much faster to add the longest words first,
213 // to avoid having to come here.
214 word_end = true;
215 still_finding_chars = false;
216 remove_edge(last_node, 0, word_end, unichar_id);
217 } else {
218 // We have to add a new branch here for the new word.
219 if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
220 last_node = next_node_from_edge_rec(*edge_ptr);
221 }
222 }
223 if (!still_finding_chars) {
224 the_next_node = new_dawg_node();
225 if (debug_level_ > 1)
226 tprintf("adding node " REFFORMAT "\n", the_next_node);
227 if (the_next_node == 0) {
228 add_failed = true;
229 break;
230 }
231 if (!add_new_edge(last_node, the_next_node,
232 marker_flag, word_end, unichar_id)) {
233 add_failed = true;
234 break;
235 }
236 word_end = false;
237 last_node = the_next_node;
238 }
239 }
240 the_next_node = 0;
241 unichar_id = word.unichar_id(i);
242 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
243 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
244 if (still_finding_chars &&
245 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
246 unichar_id, &edge_ptr, &edge_index)) {
247 // An extension of this word already exists in the trie, so we
248 // only have to add the ending flags in both directions.
249 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
250 marker_flag, unichar_id);
251 } else {
252 // Add a link to node 0. All leaves connect to node 0 so the back links can
253 // be used in reduction to a dawg. This root backward node has one edge
254 // entry for every word, (except prefixes of longer words) so it is huge.
255 if (!add_failed &&
256 !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
257 add_failed = true;
258 }
259 if (add_failed) {
260 tprintf("Re-initializing document dictionary...\n");
261 clear();
262 return false;
263 } else {
264 return true;
265 }
266}
267
269 auto *node = new TRIE_NODE_RECORD();
270 nodes_.push_back(node);
271 return nodes_.length() - 1;
272}
273
274// Sort function to sort words by decreasing order of length.
275static int sort_strings_by_dec_length(const void* v1, const void* v2) {
276 const auto *s1 = static_cast<const STRING *>(v1);
277 const auto *s2 = static_cast<const STRING *>(v2);
278 return s2->length() - s1->length();
279}
280
281bool Trie::read_and_add_word_list(const char *filename,
282 const UNICHARSET &unicharset,
283 Trie::RTLReversePolicy reverse_policy) {
284 GenericVector<STRING> word_list;
285 if (!read_word_list(filename, &word_list)) return false;
286 word_list.sort(sort_strings_by_dec_length);
287 return add_word_list(word_list, unicharset, reverse_policy);
288}
289
290bool Trie::read_word_list(const char *filename,
291 GenericVector<STRING>* words) {
292 FILE *word_file;
293 char line_str[CHARS_PER_LINE];
294 int word_count = 0;
295
296 word_file = fopen(filename, "rb");
297 if (word_file == nullptr) return false;
298
299 while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
300 chomp_string(line_str); // remove newline
301 STRING word_str(line_str);
302 ++word_count;
303 if (debug_level_ && word_count % 10000 == 0)
304 tprintf("Read %d words so far\n", word_count);
305 words->push_back(word_str);
306 }
307 if (debug_level_)
308 tprintf("Read %d words total.\n", word_count);
309 fclose(word_file);
310 return true;
311}
312
314 const UNICHARSET &unicharset,
315 Trie::RTLReversePolicy reverse_policy) {
316 for (int i = 0; i < words.size(); ++i) {
317 WERD_CHOICE word(words[i].string(), unicharset);
318 if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
319 continue;
320 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
321 word.has_rtl_unichar_id()) ||
322 reverse_policy == RRP_FORCE_REVERSE) {
324 }
325 if (!word_in_dawg(word)) {
326 add_word_to_dawg(word);
327 if (!word_in_dawg(word)) {
328 tprintf("Error: word '%s' not in DAWG after adding it\n",
329 words[i].string());
330 return false;
331 }
332 }
333 }
334 return true;
335}
336
351 unicharset_size_ = unicharset->size();
352}
353
355 const UNICHARSET &unicharset,
356 GenericVector<UNICHAR_ID> *vec) const {
357 bool is_alpha = unicharset.get_isalpha(unichar_id);
358 if (is_alpha) {
361 if (unicharset.get_islower(unichar_id)) {
363 } else if (unicharset.get_isupper(unichar_id)) {
365 }
366 }
367 if (unicharset.get_isdigit(unichar_id)) {
369 if (!is_alpha) vec->push_back(alphanum_pattern_);
370 }
371 if (unicharset.get_ispunctuation(unichar_id)) {
373 }
374}
375
377 if (ch == 'c') {
378 return alpha_pattern_;
379 } else if (ch == 'd') {
380 return digit_pattern_;
381 } else if (ch == 'n') {
382 return alphanum_pattern_;
383 } else if (ch == 'p') {
384 return punc_pattern_;
385 } else if (ch == 'a') {
386 return lower_pattern_;
387 } else if (ch == 'A') {
388 return upper_pattern_;
389 } else {
390 return INVALID_UNICHAR_ID;
391 }
392}
393
394bool Trie::read_pattern_list(const char *filename,
395 const UNICHARSET &unicharset) {
397 tprintf("please call initialize_patterns() before read_pattern_list()\n");
398 return false;
399 }
400
401 FILE *pattern_file = fopen(filename, "rb");
402 if (pattern_file == nullptr) {
403 tprintf("Error opening pattern file %s\n", filename);
404 return false;
405 }
406
407 int pattern_count = 0;
408 char string[CHARS_PER_LINE];
409 while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
410 chomp_string(string); // remove newline
411 // Parse the pattern and construct a unichar id vector.
412 // Record the number of repetitions of each unichar in the parallel vector.
413 WERD_CHOICE word(&unicharset);
414 GenericVector<bool> repetitions_vec;
415 const char *str_ptr = string;
416 int step = unicharset.step(str_ptr);
417 bool failed = false;
418 while (step > 0) {
419 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
420 if (step == 1 && *str_ptr == '\\') {
421 ++str_ptr;
422 if (*str_ptr == '\\') { // regular '\' unichar that was escaped
423 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
424 } else {
425 if (word.length() < kSaneNumConcreteChars) {
426 tprintf("Please provide at least %d concrete characters at the"
427 " beginning of the pattern\n", kSaneNumConcreteChars);
428 failed = true;
429 break;
430 }
431 // Parse character class from expression.
432 curr_unichar_id = character_class_to_pattern(*str_ptr);
433 }
434 } else {
435 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
436 }
437 if (curr_unichar_id == INVALID_UNICHAR_ID) {
438 failed = true;
439 break; // failed to parse this pattern
440 }
441 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
442 repetitions_vec.push_back(false);
443 str_ptr += step;
444 step = unicharset.step(str_ptr);
445 // Check if there is a repetition pattern specified after this unichar.
446 if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
447 repetitions_vec[repetitions_vec.size()-1] = true;
448 str_ptr += 2;
449 step = unicharset.step(str_ptr);
450 }
451 }
452 if (failed) {
453 tprintf("Invalid user pattern %s\n", string);
454 continue;
455 }
456 // Insert the pattern into the trie.
457 if (debug_level_ > 2) {
458 tprintf("Inserting expanded user pattern %s\n",
459 word.debug_string().string());
460 }
461 if (!this->word_in_dawg(word)) {
462 this->add_word_to_dawg(word, &repetitions_vec);
463 if (!this->word_in_dawg(word)) {
464 tprintf("Error: failed to insert pattern '%s'\n", string);
465 }
466 }
467 ++pattern_count;
468 }
469 if (debug_level_) {
470 tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
471 }
472 fclose(pattern_file);
473 return true;
474}
475
476void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
477 bool word_end, UNICHAR_ID unichar_id) {
478 EDGE_RECORD *edge_ptr = nullptr;
479 EDGE_INDEX edge_index = 0;
480 ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
481 unichar_id, &edge_ptr, &edge_index));
482 if (debug_level_ > 1) {
483 tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
484 print_edge_rec(*edge_ptr);
485 tprintf("\n");
486 }
487 if (direction == FORWARD_EDGE) {
488 nodes_[node1]->forward_edges.remove(edge_index);
489 } else if (node1 == 0) {
490 KillEdge(&nodes_[node1]->backward_edges[edge_index]);
491 root_back_freelist_.push_back(edge_index);
492 } else {
493 nodes_[node1]->backward_edges.remove(edge_index);
494 }
495 --num_edges_;
496}
497
498// Some optimizations employed in add_word_to_dawg and trie_to_dawg:
499// 1 Avoid insertion sorting or bubble sorting the tail root node
500// (back links on node 0, a list of all the leaves.). The node is
501// huge, and sorting it with n^2 time is terrible.
502// 2 Avoid using GenericVector::remove on the tail root node.
503// (a) During add of words to the trie, zero-out the unichars and
504// keep a freelist of spaces to re-use.
505// (b) During reduction, just zero-out the unichars of deleted back
506// links, skipping zero entries while searching.
507// 3 Avoid linear search of the tail root node. This has to be done when
508// a suffix is added to an existing word. Adding words by decreasing
509// length avoids this problem entirely. Words can still be added in
510// any order, but it is faster to add the longest first.
512 root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
513 if (debug_level_ > 2) {
514 print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
515 }
516 auto reduced_nodes = new bool[nodes_.size()];
517 for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = false;
518 this->reduce_node_input(0, reduced_nodes);
519 delete[] reduced_nodes;
520
521 if (debug_level_ > 2) {
522 print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
523 }
524 // Build a translation map from node indices in nodes_ vector to
525 // their target indices in EDGE_ARRAY.
526 auto *node_ref_map = new NODE_REF[nodes_.size() + 1];
527 int i, j;
528 node_ref_map[0] = 0;
529 for (i = 0; i < nodes_.size(); ++i) {
530 node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
531 }
532 int num_forward_edges = node_ref_map[i];
533
534 // Convert nodes_ vector into EDGE_ARRAY translating the next node references
535 // in edges using node_ref_map. Empty nodes and backward edges are dropped.
536 auto edge_array = new EDGE_RECORD[num_forward_edges];
537 EDGE_ARRAY edge_array_ptr = edge_array;
538 for (i = 0; i < nodes_.size(); ++i) {
539 TRIE_NODE_RECORD *node_ptr = nodes_[i];
540 int end = node_ptr->forward_edges.size();
541 for (j = 0; j < end; ++j) {
542 EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
543 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
544 ASSERT_HOST(node_ref < nodes_.size());
545 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
546 link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
547 end_of_word_from_edge_rec(edge_rec), unichar_id);
548 if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
549 ++edge_array_ptr;
550 }
551 }
552 delete[] node_ref_map;
553
554 return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
556}
557
559 const EDGE_RECORD &edge1,
560 const EDGE_RECORD &edge2) {
561 if (debug_level_ > 1) {
562 tprintf("\nCollapsing node %" PRIi64 ":\n", node);
564 tprintf("Candidate edges: ");
565 print_edge_rec(edge1);
566 tprintf(", ");
567 print_edge_rec(edge2);
568 tprintf("\n\n");
569 }
570 NODE_REF next_node1 = next_node_from_edge_rec(edge1);
571 NODE_REF next_node2 = next_node_from_edge_rec(edge2);
572 TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
573 // Translate all edges going to/from next_node2 to go to/from next_node1.
574 EDGE_RECORD *edge_ptr = nullptr;
575 EDGE_INDEX edge_index;
576 int i;
577 // The backward link in node to next_node2 will be zeroed out by the caller.
578 // Copy all the backward links in next_node2 to node next_node1
579 for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
580 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
581 NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
582 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
583 int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
584 bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
585 add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
586 curr_word_end, curr_unichar_id);
587 // Relocate the corresponding forward edge in curr_next_node
588 ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
589 curr_word_end, curr_unichar_id,
590 &edge_ptr, &edge_index));
591 set_next_node_in_edge_rec(edge_ptr, next_node1);
592 }
593 int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
594 next_node2_ptr->backward_edges.size());
595 if (debug_level_ > 1) {
596 tprintf("removed %d edges from node " REFFORMAT "\n",
597 next_node2_num_edges, next_node2);
598 }
599 next_node2_ptr->forward_edges.clear();
600 next_node2_ptr->backward_edges.clear();
601 num_edges_ -= next_node2_num_edges;
602 return true;
603}
604
606 UNICHAR_ID unichar_id,
607 NODE_REF node,
608 EDGE_VECTOR* backward_edges,
609 NODE_MARKER reduced_nodes) {
610 if (debug_level_ > 1)
611 tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
612 // Compare each of the edge pairs with the given unichar_id.
613 bool did_something = false;
614 for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
615 // Find the first edge that can be eliminated.
616 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
617 while (i < backward_edges->size()) {
618 if (!DeadEdge((*backward_edges)[i])) {
619 curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
620 if (curr_unichar_id != unichar_id) return did_something;
621 if (can_be_eliminated((*backward_edges)[i])) break;
622 }
623 ++i;
624 }
625 if (i == backward_edges->size()) break;
626 const EDGE_RECORD &edge_rec = (*backward_edges)[i];
627 // Compare it to the rest of the edges with the given unichar_id.
628 for (int j = i + 1; j < backward_edges->size(); ++j) {
629 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
630 if (DeadEdge(next_edge_rec)) continue;
631 UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
632 if (next_id != unichar_id) break;
633 if (end_of_word_from_edge_rec(next_edge_rec) ==
634 end_of_word_from_edge_rec(edge_rec) &&
635 can_be_eliminated(next_edge_rec) &&
636 eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
637 reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
638 did_something = true;
639 KillEdge(&(*backward_edges)[j]);
640 }
641 }
642 }
643 return did_something;
644}
645
647 int num_edges = edges->size();
648 if (num_edges <= 1) return;
650 sort_vec.reserve(num_edges);
651 for (int i = 0; i < num_edges; ++i) {
653 unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
654 }
655 sort_vec.sort();
656 for (int i = 0; i < num_edges; ++i)
657 (*edges)[i] = sort_vec[i].data;
658}
659
661 NODE_MARKER reduced_nodes) {
662 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
663 sort_edges(&backward_edges);
664 if (debug_level_ > 1) {
665 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
667 }
668
669 EDGE_INDEX edge_index = 0;
670 while (edge_index < backward_edges.size()) {
671 if (DeadEdge(backward_edges[edge_index])) continue;
672 UNICHAR_ID unichar_id =
673 unichar_id_from_edge_rec(backward_edges[edge_index]);
674 while (reduce_lettered_edges(edge_index, unichar_id, node,
675 &backward_edges, reduced_nodes));
676 while (++edge_index < backward_edges.size()) {
677 UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
678 if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
679 }
680 }
681 reduced_nodes[node] = true; // mark as reduced
682
683 if (debug_level_ > 1) {
684 tprintf("Node " REFFORMAT " after reduction:\n", node);
686 }
687
688 for (int i = 0; i < backward_edges.size(); ++i) {
689 if (DeadEdge(backward_edges[i])) continue;
690 NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
691 if (next_node != 0 && !reduced_nodes[next_node]) {
692 reduce_node_input(next_node, reduced_nodes);
693 }
694 }
695}
696
697void Trie::print_node(NODE_REF node, int max_num_edges) const {
698 if (node == NO_EDGE) return; // nothing to print
699 TRIE_NODE_RECORD *node_ptr = nodes_[node];
700 int num_fwd = node_ptr->forward_edges.size();
701 int num_bkw = node_ptr->backward_edges.size();
702 EDGE_VECTOR *vec;
703 for (int dir = 0; dir < 2; ++dir) {
704 if (dir == 0) {
705 vec = &(node_ptr->forward_edges);
706 tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
707 } else {
708 vec = &(node_ptr->backward_edges);
709 tprintf("\t");
710 }
711 int i;
712 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
713 i < max_num_edges; ++i) {
714 if (DeadEdge((*vec)[i])) continue;
715 print_edge_rec((*vec)[i]);
716 tprintf(" ");
717 }
718 if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
719 tprintf("\n");
720 }
721}
722
723} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
void chomp_string(char *str)
Definition: helpers.h:77
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
#define WERD_END_FLAG
Definition: dawg.h:86
#define BACKWARD_EDGE
Definition: dawg.h:82
#define FORWARD_EDGE
Definition: dawg.h:81
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:83
int64_t NODE_REF
Definition: dawg.h:52
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:50
#define MARKER_FLAG
Definition: dawg.h:84
#define REFFORMAT
Definition: dawg.h:89
uint64_t EDGE_RECORD
Definition: dawg.h:49
#define CHARS_PER_LINE
Definition: dict.h:38
int64_t EDGE_INDEX
Definition: trie.h:38
bool * NODE_MARKER
Definition: trie.h:39
const char kDoNotReverse[]
Definition: trie.cpp:35
const char kReverseIfHasRTL[]
Definition: trie.cpp:36
const char kForceReverse[]
Definition: trie.cpp:37
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:39
int push_back(T object)
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
void remove(int index)
void insert(const T &t, int index)
int length() const
Definition: genericvector.h:86
void delete_data_pointers()
void reserve(int size)
const STRING debug_string() const
Definition: ratngs.h:495
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:330
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:435
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:369
int length() const
Definition: ratngs.h:293
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void print() const
Definition: ratngs.h:570
Definition: strngs.h:45
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int step(const char *str) const
Definition: unicharset.cpp:233
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
int size() const
Definition: unicharset.h:341
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:268
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:209
int flag_start_bit_
Definition: dawg.h:309
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:65
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:231
DawgType type_
Definition: dawg.h:298
int unicharset_size_
Definition: dawg.h:308
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:222
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:213
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:237
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:226
STRING lang_
Definition: dawg.h:297
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:300
int debug_level_
Definition: dawg.h:312
EDGE_VECTOR backward_edges
Definition: trie.h:44
EDGE_VECTOR forward_edges
Definition: trie.h:43
UNICHAR_ID alpha_pattern_
Definition: trie.h:421
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:300
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:148
bool initialized_patterns_
Definition: trie.h:427
static const char kLowerPatternUnicode[]
Definition: trie.h:73
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:281
static const char kAlphanumPatternUnicode[]
Definition: trie.h:71
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:375
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:350
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:127
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:337
TRIE_NODES nodes_
Definition: trie.h:413
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:558
void clear()
Definition: trie.cpp:57
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:328
static const char kUpperPatternUnicode[]
Definition: trie.h:74
UNICHAR_ID digit_pattern_
Definition: trie.h:422
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:415
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:511
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:116
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:98
UNICHAR_ID lower_pattern_
Definition: trie.h:425
static const char kDigitPatternUnicode[]
Definition: trie.h:70
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:394
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:660
UNICHAR_ID upper_pattern_
Definition: trie.h:426
UNICHAR_ID punc_pattern_
Definition: trie.h:424
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:313
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:476
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:646
UNICHAR_ID alphanum_pattern_
Definition: trie.h:423
static const char kPuncPatternUnicode[]
Definition: trie.h:72
RTLReversePolicy
Definition: trie.h:58
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:60
@ RRP_FORCE_REVERSE
Definition: trie.h:61
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const override
Definition: trie.cpp:354
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:290
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:376
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:697
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:169
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:52
uint64_t num_edges_
Definition: trie.h:416
static const char kAlphaPatternUnicode[]
Definition: trie.h:69
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:605
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:152
NODE_REF new_dawg_node()
Definition: trie.cpp:268
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:320
static const int kSaneNumConcreteChars
Definition: trie.h:65
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:152