tesseract 4.1.1
Loading...
Searching...
No Matches
ligature_table.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: ligature_table.cpp
3 * Description: Class for adding and removing optional latin ligatures,
4 * conditional on codepoint support by a specified font
5 * (if specified).
6 * Author: Ranjith Unnikrishnan
7 * Created: Mon Nov 18 2013
8 *
9 * (C) Copyright 2013, Google Inc.
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 **********************************************************************/
21
22#include "ligature_table.h"
23
24#include <utility>
25
26#include "pango_font_info.h"
27#include "tlog.h"
28#include "unichar.h"
29#include "unicharset.h"
30#include "unicode/errorcode.h" // from libicu
31#include "unicode/normlzr.h" // from libicu
32#include "unicode/unistr.h" // from libicu
33#include "unicode/utypes.h" // from libicu
34
35namespace tesseract {
36
37static std::string EncodeAsUTF8(const char32 ch32) {
38 UNICHAR uni_ch(ch32);
39 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
40}
41
// Inclusive bounds of the Unicode code points scanned when building the
// optional latin ligature tables. The custom ligatures encoded in the private
// use area lie outside this range and are not covered by it.
const int kMinLigature = 0xFB00;
// Upper bound stops at 0xFB17 so the wide Hebrew letters are left out.
const int kMaxLigature = 0xFB17;
47
48/* static */
49std::unique_ptr<LigatureTable> LigatureTable::instance_;
50
51/* static */
53 if (instance_ == nullptr) {
54 instance_.reset(new LigatureTable());
55 instance_->Init();
56 }
57 return instance_.get();
58}
59
60LigatureTable::LigatureTable() : min_lig_length_(0), max_lig_length_(0),
61 min_norm_length_(0), max_norm_length_(0) {}
62
64 if (norm_to_lig_table_.empty()) {
65 for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
66 // For each char in the range, convert to utf8, nfkc normalize, and if
67 // the strings are different put the both mappings in the hash_maps.
68 std::string lig8 = EncodeAsUTF8(lig);
69 icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
70 icu::UnicodeString normed8_result;
71 icu::ErrorCode status;
72 icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result,
73 status);
74 std::string normed8;
75 normed8_result.toUTF8String(normed8);
76 // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
77 // here manually so that AddLigatures() will work as desired.
78 if (lig8 == "\uFB05")
79 normed8 = "ſt";
80 int lig_length = lig8.length();
81 int norm_length = normed8.size();
82 if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
83 norm_to_lig_table_[normed8] = lig8;
84 lig_to_norm_table_[lig8] = normed8;
85 if (min_lig_length_ == 0 || lig_length < min_lig_length_)
86 min_lig_length_ = lig_length;
87 if (lig_length > max_lig_length_)
88 max_lig_length_ = lig_length;
89 if (min_norm_length_ == 0 || norm_length < min_norm_length_)
90 min_norm_length_ = norm_length;
91 if (norm_length > max_norm_length_)
92 max_norm_length_ = norm_length;
93 }
94 }
95 // Add custom extra ligatures.
96 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
99 int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
100 if (min_norm_length_ == 0 || norm_length < min_norm_length_)
101 min_norm_length_ = norm_length;
102 if (norm_length > max_norm_length_)
103 max_norm_length_ = norm_length;
104
107 }
108 }
109}
110
111std::string LigatureTable::RemoveLigatures(const std::string& str) const {
112 std::string result;
113 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
114 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
115 char tmp[5];
116 int len;
117 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
118 len = it.get_utf8(tmp);
119 tmp[len] = '\0';
120 LigHash::const_iterator lig_it = lig_to_norm_table_.find(tmp);
121 if (lig_it != lig_to_norm_table_.end()) {
122 result += lig_it->second;
123 } else {
124 result += tmp;
125 }
126 }
127 return result;
128}
129
130std::string LigatureTable::RemoveCustomLigatures(const std::string& str) const {
131 std::string result;
132 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
133 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
134 char tmp[5];
135 int len;
136 int norm_ind;
137 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
138 len = it.get_utf8(tmp);
139 tmp[len] = '\0';
140 norm_ind = -1;
141 for (int i = 0;
142 UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) {
143 if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
144 norm_ind = i;
145 }
146 }
147 if (norm_ind >= 0) {
148 result += UNICHARSET::kCustomLigatures[norm_ind][0];
149 } else {
150 result += tmp;
151 }
152 }
153 return result;
154}
155
156std::string LigatureTable::AddLigatures(const std::string& str,
157 const PangoFontInfo* font) const {
158 std::string result;
159 int len = str.size();
160 int step = 0;
161 int i = 0;
162 for (i = 0; i < len - min_norm_length_ + 1; i += step) {
163 step = 0;
164 for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
165 if (i + liglen <= len) {
166 std::string lig_cand = str.substr(i, liglen);
167 LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
168 if (it != norm_to_lig_table_.end()) {
169 tlog(3, "Considering %s -> %s\n", lig_cand.c_str(),
170 it->second.c_str());
171 if (font) {
172 // Test for renderability.
173 if (!font->CanRenderString(it->second.data(), it->second.length()))
174 continue; // Not renderable
175 }
176 // Found a match so convert it.
177 step = liglen;
178 result += it->second;
179 tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(),
180 it->second.c_str());
181 break;
182 }
183 }
184 }
185 if (step == 0) {
186 result += str[i];
187 step = 1;
188 }
189 }
190 result += str.substr(i, len - i);
191 return result;
192}
193
194} // namespace tesseract
signed int char32
#define tlog(level,...)
Definition: tlog.h:33
signed int char32
Definition: unichar.h:51
const int kMinLigature
const int kMaxLigature
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static std::unique_ptr< LigatureTable > instance_
std::string RemoveLigatures(const std::string &str) const
static LigatureTable * Get()
std::string RemoveCustomLigatures(const std::string &str) const
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const