tesseract 4.1.1
Loading...
Searching...
No Matches
trie.h
Go to the documentation of this file.
1/* -*-C-*-
2 ********************************************************************************
3 *
4 * File: trie.h
5 * Description: Functions to build a trie data structure.
6 * Author: Mark Seaman, SW Productivity
7 *
8 * (c) Copyright 1987, Hewlett-Packard Company.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 *********************************************************************************/
20#ifndef TRIE_H
21#define TRIE_H
22
23#include "dawg.h"
24#include "genericvector.h"
25
26class UNICHARSET;
27
28// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
29// max int32, we will need to change GenericVector to use int64 for size
30// and address indices. This does not seem to be needed immediately,
31// since currently the largest number of edges limit used by tesseract
32// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
33// There are also int casts below to satisfy the WIN32 compiler that would
34// need to be changed.
35// It might be cleanest to change the types of most of the Trie/Dawg related
36// typedefs to int and restrict the casts to extracting these values from
37// the 64 bit EDGE_RECORD.
38using EDGE_INDEX = int64_t ; // index of an edge in a given node
39using NODE_MARKER = bool *;
41
45};
47
48namespace tesseract {
49
56class Trie : public Dawg {
57 public:
62 };
63
64 // Minimum number of concrete characters at the beginning of user patterns.
65 static const int kSaneNumConcreteChars = 0;
66 // Various unicode whitespace characters are used to denote unichar patterns,
67 // (character classifier would never produce these whitespace characters as a
68 // valid classification).
69 static const char kAlphaPatternUnicode[];
70 static const char kDigitPatternUnicode[];
71 static const char kAlphanumPatternUnicode[];
72 static const char kPuncPatternUnicode[];
73 static const char kLowerPatternUnicode[];
74 static const char kUpperPatternUnicode[];
75
76 static const char *get_reverse_policy_name(
77 RTLReversePolicy reverse_policy);
78
79 // max_num_edges argument allows limiting the amount of memory this
80 // Trie can consume (if a new word insert would cause the Trie to
81 // contain more edges than max_num_edges, all the edges are cleared
82 // so that new inserts can proceed).
84 int unicharset_size, int debug_level)
85 : Dawg(type, lang, perm, debug_level) {
86 init(unicharset_size);
87 num_edges_ = 0;
88 deref_node_index_mask_ = ~letter_mask_;
89 new_dawg_node(); // need to allocate node 0
91 }
93
94 // Reset the Trie to empty.
95 void clear();
96
99 bool word_end) const override {
100 EDGE_RECORD *edge_ptr;
101 EDGE_INDEX edge_index;
102 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
103 &edge_ptr, &edge_index)) return NO_EDGE;
104 return make_edge_ref(node_ref, edge_index);
105 }
106
112 bool word_end) const override {
113 const EDGE_VECTOR &forward_edges =
114 nodes_[static_cast<int>(node)]->forward_edges;
115 for (int i = 0; i < forward_edges.size(); ++i) {
116 if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
117 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
118 make_edge_ref(node, i)));
119 }
120 }
121 }
122
127 NODE_REF next_node(EDGE_REF edge_ref) const override {
128 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
129 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
130 }
131
136 bool end_of_word(EDGE_REF edge_ref) const override {
137 if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
138 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139 }
140
142 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
143 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
144 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
145 }
146 // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
147 // the edge dead.
148 void KillEdge(EDGE_RECORD* edge_rec) const {
149 *edge_rec &= ~letter_mask_;
150 *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
151 }
152 bool DeadEdge(const EDGE_RECORD& edge_rec) const {
153 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
154 }
155
156 // Prints the contents of the node indicated by the given NODE_REF.
157 // At most max_num_edges will be printed.
158 void print_node(NODE_REF node, int max_num_edges) const override;
159
160 // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
161 // Eliminates redundant edges and returns the pointer to the SquishedDawg.
162 // Note: the caller is responsible for deallocating memory associated
163 // with the returned SquishedDawg pointer.
165
166 // Reads a list of words from the given file and adds into the Trie.
167 // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
168 // policy and information in the unicharset.
169 // Returns false on error.
170 bool read_and_add_word_list(const char *filename,
171 const UNICHARSET &unicharset,
173
174 // Reads a list of words from the given file.
175 // Returns false on error.
176 bool read_word_list(const char *filename,
177 GenericVector<STRING>* words);
178 // Adds a list of words previously read using read_word_list to the trie
179 // using the given unicharset and reverse_policy to convert to unichar-ids.
180 // Returns false on error.
181 bool add_word_list(const GenericVector<STRING> &words,
182 const UNICHARSET &unicharset,
183 Trie::RTLReversePolicy reverse_policy);
184
185 // Inserts the list of patterns from the given file into the Trie.
186 // The pattern list file should contain one pattern per line in UTF-8 format.
187 //
188 // Each pattern can contain any non-whitespace characters, however only the
189 // patterns that contain characters from the unicharset of the corresponding
190 // language will be useful.
191 // The only meta character is '\'. To be used in a pattern as an ordinary
192 // string it should be escaped with '\' (e.g. string "C:\Documents" should
193 // be written in the patterns file as "C:\\Documents").
194 // This function supports a very limited regular expression syntax. One can
195 // express a character, a certain character class and a number of times the
196 // entity should be repeated in the pattern.
197 //
198 // To denote a character class use one of:
199 // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
200 // \d - unichar for which UNICHARSET::get_isdigit() is true
201 // \n - unichar for which UNICHARSET::get_isdigit() and
202 // UNICHARSET::isalpha() are true
203 // \p - unichar for which UNICHARSET::get_ispunct() is true
204 // \a - unichar for which UNICHARSET::get_islower() is true
205 // \A - unichar for which UNICHARSET::get_isupper() is true
206 //
207 // \* could be specified after each character or pattern to indicate that
208 // the character/pattern can be repeated any number of times before the next
209 // character/pattern occurs.
210 //
211 // Examples:
212 // 1-8\d\d-GOOG-411 will be expanded to strings:
213 // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
214 //
215 // http://www.\n\*.com will be expanded to strings like:
216 // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
217 //
218 // Note: In choosing which patterns to include please be aware of the fact
219 // providing very generic patterns will make tesseract run slower.
220 // For example \n\* at the beginning of the pattern will make Tesseract
221 // consider all the combinations of proposed character choices for each
222 // of the segmentations, which will be unacceptably slow.
223 // Because of potential problems with speed that could be difficult to
224 // identify, each user pattern has to have at least kSaneNumConcreteChars
225 // concrete characters from the unicharset at the beginning.
226 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
227
228 // Initializes the values of *_pattern_ unichar ids.
229 // This function should be called before calling read_pattern_list().
230 void initialize_patterns(UNICHARSET *unicharset);
231
232 // Fills in the given unichar id vector with the unichar ids that represent
233 // the patterns of the character classes of the given unichar_id.
234 void unichar_id_to_patterns(UNICHAR_ID unichar_id,
235 const UNICHARSET &unicharset,
236 GenericVector<UNICHAR_ID> *vec) const override;
237
238 // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
239 // a self loop and the given unichar_id matches the unichar_id stored in the
240 // EDGE_RECORD, returns NO_EDGE otherwise.
242 UNICHAR_ID unichar_id,
243 bool word_end) const override {
244 if (edge_ref == NO_EDGE) return NO_EDGE;
245 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
246 return (marker_flag_from_edge_rec(*edge_rec) &&
247 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
248 word_end == end_of_word_from_edge_rec(*edge_rec)) ?
249 edge_ref : NO_EDGE;
250 }
251
252 // Adds a word to the Trie (creates the necessary nodes and edges).
253 //
254 // If repetitions vector is not nullptr, each entry in the vector indicates
255 // whether the unichar id with the corresponding index in the word is allowed
256 // to repeat an unlimited number of times. For each entry that is true, MARKER
257 // flag of the corresponding edge created for this unichar id is set to true).
258 //
259 // Return true if add succeeded, false otherwise (e.g. when a word contained
260 // an invalid unichar id or the trie was getting too large and was cleared).
261 bool add_word_to_dawg(const WERD_CHOICE &word,
262 const GenericVector<bool> *repetitions);
263 bool add_word_to_dawg(const WERD_CHOICE &word) {
264 return add_word_to_dawg(word, nullptr);
265 }
266
267 protected:
268 // The structure of an EDGE_REF for Trie edges is as follows:
269 // [LETTER_START_BIT, flag_start_bit_):
270 // edge index in *_edges in a TRIE_NODE_RECORD
271 // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
272 //
273 // With this arrangement there are enough bits to represent edge indices
274 // (each node can have at most unicharset_size_ forward edges and
275 // the position of flag_start_bit is set to be log2(unicharset_size_)).
276 // It is also possible to accommodate a maximum number of nodes that is at
277 // least as large as that of the SquishedDawg representation (in SquishedDawg
278 // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
279 // the next node index).
280 //
281
282 // Returns the pointer to EDGE_RECORD after decoding the location
283 // of the edge from the information in the given EDGE_REF.
284 // This function assumes that EDGE_REF holds valid node/edge indices.
285 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
286 int edge_index = static_cast<int>(
287 (edge_ref & letter_mask_) >> LETTER_START_BIT);
288 int node_index = static_cast<int>(
290 TRIE_NODE_RECORD *node_rec = nodes_[node_index];
291 return &(node_rec->forward_edges[edge_index]);
292 }
294 inline EDGE_REF make_edge_ref(NODE_REF node_index,
295 EDGE_INDEX edge_index) const {
296 return ((node_index << flag_start_bit_) |
297 (edge_index << LETTER_START_BIT));
298 }
300 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
301 int direction, bool word_end, UNICHAR_ID unichar_id) {
302 EDGE_RECORD flags = 0;
303 if (repeats) flags |= MARKER_FLAG;
304 if (word_end) flags |= WERD_END_FLAG;
305 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
306 *edge = ((nxt << next_node_start_bit_) |
307 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
308 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309 }
311 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
312 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
313 marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315 end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
316 unichar_id_from_edge_rec(edge_rec));
317 }
318 // Returns true if the next node in recorded the given EDGE_RECORD
319 // has exactly one forward edge.
320 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
321 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
322 return (node_ref != NO_EDGE &&
323 nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
324 }
325
326 // Prints the contents of the Trie.
327 // At most max_num_edges will be printed for each node.
328 void print_all(const char* msg, int max_num_edges) {
329 tprintf("\n__________________________\n%s\n", msg);
330 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
331 tprintf("__________________________\n");
332 }
333
334 // Finds the edge with the given direction, word_end and unichar_id
335 // in the node indicated by node_ref. Fills in the pointer to the
336 // EDGE_RECORD and the index of the edge with the the values
337 // corresponding to the edge found. Returns true if an edge was found.
339 int direction, bool word_end, UNICHAR_ID unichar_id,
340 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
341
342 // Adds an single edge linkage between node1 and node2 in the direction
343 // indicated by direction argument.
344 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
345 int direction, bool word_end,
346 UNICHAR_ID unichar_id);
347
348 // Adds forward edge linkage from node1 to node2 and the corresponding
349 // backward edge linkage in the other direction.
350 bool add_new_edge(NODE_REF node1, NODE_REF node2,
351 bool repeats, bool word_end, UNICHAR_ID unichar_id) {
352 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
353 word_end, unichar_id) &&
354 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
355 word_end, unichar_id));
356 }
357
358 // Sets the word ending flags in an already existing edge pair.
359 // Returns true on success.
360 void add_word_ending(EDGE_RECORD *edge,
361 NODE_REF the_next_node,
362 bool repeats,
363 UNICHAR_ID unichar_id);
364
365 // Allocates space for a new node in the Trie.
367
368 // Removes a single edge linkage to between node1 and node2 in the
369 // direction indicated by direction argument.
370 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
371 bool word_end, UNICHAR_ID unichar_id);
372
373 // Removes forward edge linkage from node1 to node2 and the corresponding
374 // backward edge linkage in the other direction.
375 void remove_edge(NODE_REF node1, NODE_REF node2,
376 bool word_end, UNICHAR_ID unichar_id) {
377 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
378 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
379 }
380
381 // Compares edge1 and edge2 in the given node to see if they point to two
382 // next nodes that could be collapsed. If they do, performs the reduction
383 // and returns true.
384 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
385 const EDGE_RECORD &edge2);
386
387 // Assuming that edge_index indicates the first edge in a group of edges
388 // in this node with a particular letter value, looks through these edges
389 // to see if any of them can be collapsed. If so does it. Returns to the
390 // caller when all edges with this letter have been reduced.
391 // Returns true if further reduction is possible with this same letter.
392 bool reduce_lettered_edges(EDGE_INDEX edge_index,
393 UNICHAR_ID unichar_id,
394 NODE_REF node,
395 EDGE_VECTOR* backward_edges,
396 NODE_MARKER reduced_nodes);
397
404 void sort_edges(EDGE_VECTOR *edges);
405
407 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
408
409 // Returns the pattern unichar id for the given character class code.
411
412 // Member variables
413 TRIE_NODES nodes_; // vector of nodes in the Trie
414 // Freelist of edges in the root backwards node that were previously zeroed.
416 uint64_t num_edges_; // sum of all edges (forward and backward)
417 uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
418 uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index
419 // Variables for translating character class codes denoted in user patterns
420 // file to the unichar ids used to represent them in a Trie.
428};
429} // namespace tesseract
430
431#endif
PermuterType
Definition: ratngs.h:232
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int UNICHAR_ID
Definition: unichar.h:34
LIST reverse(LIST list)
Definition: oldlist.cpp:244
#define WERD_END_FLAG
Definition: dawg.h:86
#define BACKWARD_EDGE
Definition: dawg.h:82
#define FORWARD_EDGE
Definition: dawg.h:81
int64_t NODE_REF
Definition: dawg.h:52
#define MARKER_FLAG
Definition: dawg.h:84
#define REFFORMAT
Definition: dawg.h:89
#define LETTER_START_BIT
Definition: dawg.h:87
uint64_t EDGE_RECORD
Definition: dawg.h:49
#define DIRECTION_FLAG
Definition: dawg.h:85
int64_t EDGE_REF
Definition: dawg.h:51
int64_t EDGE_INDEX
Definition: trie.h:38
bool * NODE_MARKER
Definition: trie.h:39
DawgType
Definition: dawg.h:68
int push_back(T object)
int size() const
Definition: genericvector.h:72
void delete_data_pointers()
Definition: strngs.h:45
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:209
int flag_start_bit_
Definition: dawg.h:309
const STRING & lang() const
Definition: dawg.h:125
int next_node_start_bit_
Definition: dawg.h:310
int unicharset_size_
Definition: dawg.h:308
void init(int unicharset_size)
Definition: dawg.cpp:176
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:222
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:213
DawgType type() const
Definition: dawg.h:124
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:226
uint64_t letter_mask_
Definition: dawg.h:307
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:217
EDGE_VECTOR backward_edges
Definition: trie.h:44
EDGE_VECTOR forward_edges
Definition: trie.h:43
UNICHAR_ID alpha_pattern_
Definition: trie.h:421
~Trie() override
Definition: trie.h:92
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:83
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:300
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:148
bool initialized_patterns_
Definition: trie.h:427
static const char kLowerPatternUnicode[]
Definition: trie.h:73
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:281
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Definition: trie.h:111
static const char kAlphanumPatternUnicode[]
Definition: trie.h:71
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:375
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:350
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:127
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:337
TRIE_NODES nodes_
Definition: trie.h:413
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:558
void clear()
Definition: trie.cpp:57
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:328
static const char kUpperPatternUnicode[]
Definition: trie.h:74
UNICHAR_ID digit_pattern_
Definition: trie.h:422
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:415
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:511
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:116
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:98
UNICHAR_ID lower_pattern_
Definition: trie.h:425
static const char kDigitPatternUnicode[]
Definition: trie.h:70
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:394
bool add_word_to_dawg(const WERD_CHOICE &word)
Definition: trie.h:263
uint64_t deref_direction_mask_
Definition: trie.h:417
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:660
UNICHAR_ID upper_pattern_
Definition: trie.h:426
uint64_t deref_node_index_mask_
Definition: trie.h:418
UNICHAR_ID punc_pattern_
Definition: trie.h:424
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:313
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:476
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Definition: trie.h:142
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:646
EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:241
UNICHAR_ID alphanum_pattern_
Definition: trie.h:423
static const char kPuncPatternUnicode[]
Definition: trie.h:72
RTLReversePolicy
Definition: trie.h:58
@ RRP_DO_NO_REVERSE
Definition: trie.h:59
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:60
@ RRP_FORCE_REVERSE
Definition: trie.h:61
bool end_of_word(EDGE_REF edge_ref) const override
Definition: trie.h:136
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const override
Definition: trie.cpp:354
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:290
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:285
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:376
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:697
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:169
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:52
uint64_t num_edges_
Definition: trie.h:416
static const char kAlphaPatternUnicode[]
Definition: trie.h:69
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:605
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:152
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:294
NODE_REF new_dawg_node()
Definition: trie.cpp:268
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:320
static const int kSaneNumConcreteChars
Definition: trie.h:65
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:152