tesseract 4.1.1
Loading...
Searching...
No Matches
unicharmap.cpp
Go to the documentation of this file.
1
2// File: unicharmap.cpp
3// Description: Unicode character/ligature to integer id class.
4// Author: Thomas Kielbus
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include <cassert>
20#include "unichar.h"
21#include "unicharmap.h"
22
24nodes(nullptr) {
25}
26
28 delete[] nodes;
29}
30
31// Search the given unichar representation in the tree, using length characters
32// from it maximum. Each character in the string is interpreted as an index in
33// an array of nodes.
34UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
35 int length) const {
36 UNICHARMAP_NODE* current_nodes = nodes;
37
38 assert(*unichar_repr != '\0');
39 assert(length > 0 && length <= UNICHAR_LEN);
40
41 int index = 0;
42 if (length <= 0 || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
43 do {
44 if (index + 1 >= length || unichar_repr[index + 1] == '\0')
45 return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
46 current_nodes =
47 current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
48 ++index;
49 } while (true);
50}
51
52// Search the given unichar representation in the tree, creating the possibly
53// missing nodes. Once the right place has been found, insert the given id and
54// update the inserted flag to keep track of the insert. Each character in the
55// string is interpreted as an index in an array of nodes.
56void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
57 const char* current_char = unichar_repr;
58 if (*current_char == '\0') return;
59 UNICHARMAP_NODE** current_nodes_pointer = &nodes;
60 do {
61 if (*current_nodes_pointer == nullptr)
62 *current_nodes_pointer = new UNICHARMAP_NODE[256];
63 if (current_char[1] == '\0') {
64 (*current_nodes_pointer)
65 [static_cast<unsigned char>(*current_char)].id = id;
66 return;
67 }
68 current_nodes_pointer =
69 &((*current_nodes_pointer)
70 [static_cast<unsigned char>(*current_char)].children);
71 ++current_char;
72 } while (true);
73}
74
75// Search the given unichar representation in the tree, using length characters
76// from it maximum. Each character in the string is interpreted as an index in
77// an array of nodes. Stop once the tree does not have anymore nodes or once we
78// found the right unichar_repr.
79bool UNICHARMAP::contains(const char* const unichar_repr,
80 int length) const {
81 if (unichar_repr == nullptr || *unichar_repr == '\0') return false;
82 if (length <= 0 || length > UNICHAR_LEN) return false;
83 int index = 0;
84 if (unichar_repr[index] == '\0') return false;
85 UNICHARMAP_NODE* current_nodes = nodes;
86
87 while (current_nodes != nullptr && index + 1 < length &&
88 unichar_repr[index + 1] != '\0') {
89 current_nodes =
90 current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
91 ++index;
92 }
93 return current_nodes != nullptr &&
94 (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
95 current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
96}
97
98// Return the minimum number of characters that must be used from this string
99// to obtain a match in the UNICHARMAP.
100int UNICHARMAP::minmatch(const char* const unichar_repr) const {
101 const char* current_char = unichar_repr;
102 if (*current_char == '\0') return 0;
103 UNICHARMAP_NODE* current_nodes = nodes;
104
105 while (current_nodes != nullptr && *current_char != '\0') {
106 if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
107 return current_char + 1 - unichar_repr;
108 current_nodes =
109 current_nodes[static_cast<unsigned char>(*current_char)].children;
110 ++current_char;
111 }
112 return 0;
113}
114
116 delete[] nodes;
117 nodes = nullptr;
118}
119
120UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
121children(nullptr),
122id(-1) {
123}
124
125// Recursively delete the children
126UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
127 delete[] children;
128}
#define UNICHAR_LEN
Definition: unichar.h:30
int UNICHAR_ID
Definition: unichar.h:34
void clear()
Definition: unicharmap.cpp:115
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56