tesseract 4.1.1
Loading...
Searching...
No Matches
boxchar.cpp
Go to the documentation of this file.
/**********************************************************************
 * File:        boxchar.cpp
 * Description: Simple class to associate a Tesseract classification unit with
 *              its bounding box so that the boxes can be rotated as the image
 *              is rotated for degradation.  Also includes routines to output
 *              the character-tagged boxes to a boxfile.
 * Author:      Ray Smith
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
21
22#include "boxchar.h"
23
24#include <cstddef>
25#include <algorithm>
26#include <vector>
27
28#include "fileio.h"
29#include "genericvector.h"
30#include "normstrngs.h"
31#include "tprintf.h"
32#include "unicharset.h"
33#include "unicode/uchar.h" // from libicu
34
// Dominance ratio between the horizontal and vertical step of two
// consecutive boxes. MostlyVertical only counts a step when the larger of
// |dx| and |dy| exceeds the smaller one multiplied by this value, i.e. when
// the step is clearly along one axis.
constexpr int kMinNewlineRatio = 5;
37
38namespace tesseract {
39
40BoxChar::BoxChar(const char* utf8_str, int len)
41 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
42
43BoxChar::~BoxChar() { boxDestroy(&box_); }
44
45void BoxChar::AddBox(int x, int y, int width, int height) {
46 box_ = boxCreate(x, y, width, height);
47}
48
49// Increments *num_rtl and *num_ltr according to the directionality of
50// characters in the box.
51void BoxChar::GetDirection(int* num_rtl, int* num_ltr) const {
52 // Convert the unichar to UTF32 representation
53 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());
54 if (uni_vector.empty()) {
55 tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str());
56 for (size_t c = 0; c < ch_.size(); ++c) {
57 tprintf(" 0x%x", ch_[c]);
58 }
59 tprintf("\n");
60 return;
61 }
62 for (char32 ch : uni_vector) {
63 UCharDirection dir = u_charDirection(ch);
64 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
65 dir == U_RIGHT_TO_LEFT_ISOLATE) {
66 ++*num_rtl;
67 } else if ((dir == U_ARABIC_NUMBER) ||
68 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
69 ++*num_ltr;
70 }
71 }
72}
73
74// Reverses the order of unicodes within the box. If Pango generates a
75// ligature, these will get reversed on output, so reverse now.
77 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());
78 std::reverse(unicodes.begin(), unicodes.end());
79 ch_ = UNICHAR::UTF32ToUTF8(unicodes);
80}
81
82/* static */
83void BoxChar::TranslateBoxes(int xshift, int yshift,
84 std::vector<BoxChar*>* boxes) {
85 for (size_t i = 0; i < boxes->size(); ++i) {
86 BOX* box = (*boxes)[i]->box_;
87 if (box != nullptr) {
88 box->x += xshift;
89 box->y += yshift;
90 }
91 }
92}
93
94// Prepares for writing the boxes to a file by inserting newlines, spaces,
95// and re-ordering so the boxes are strictly left-to-right.
96/* static */
97void BoxChar::PrepareToWrite(std::vector<BoxChar*>* boxes) {
98 bool rtl_rules = ContainsMostlyRTL(*boxes);
99 bool vertical_rules = MostlyVertical(*boxes);
100 InsertNewlines(rtl_rules, vertical_rules, boxes);
101 InsertSpaces(rtl_rules, vertical_rules, boxes);
102 for (size_t i = 0; i < boxes->size(); ++i) {
103 if ((*boxes)[i]->box_ == nullptr) tprintf("Null box at index %zu\n", i);
104 }
105 if (rtl_rules) {
106 ReorderRTLText(boxes);
107 }
108}
109
110// Inserts newline (tab) characters into the vector at newline positions.
111/* static */
112void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
113 std::vector<BoxChar*>* boxes) {
114 size_t prev_i = SIZE_MAX;
115 int max_shift = 0;
116 for (size_t i = 0; i < boxes->size(); ++i) {
117 Box* box = (*boxes)[i]->box_;
118 if (box == nullptr) {
119 if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {
120 // Erase null boxes at the start of a line and after another null box.
121 do {
122 delete (*boxes)[i];
123 boxes->erase(boxes->begin() + i);
124 if (i == 0) break;
125 } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);
126 }
127 continue;
128 }
129 if (prev_i != SIZE_MAX) {
130 Box* prev_box = (*boxes)[prev_i]->box_;
131 int shift = box->x - prev_box->x;
132 if (vertical_rules) {
133 shift = box->y - prev_box->y;
134 } else if (rtl_rules) {
135 shift = -shift;
136 }
137 if (-shift > max_shift) {
138 // This is a newline. Since nothing cares about the size of the box,
139 // except the out-of-bounds checker, minimize the chance of creating
140 // a box outside the image by making the width and height 1.
141 int width = 1;
142 int height = 1;
143 int x = prev_box->x + prev_box->w;
144 int y = prev_box->y;
145 if (vertical_rules) {
146 x = prev_box->x;
147 y = prev_box->y + prev_box->h;
148 } else if (rtl_rules) {
149 x = prev_box->x - width;
150 if (x < 0) {
151 tprintf("prev x = %d, width=%d\n", prev_box->x, width);
152 x = 0;
153 }
154 }
155 if (prev_i + 1 == i) {
156 // New character needed.
157 BoxChar* new_box = new BoxChar("\t", 1);
158 new_box->AddBox(x, y, width, height);
159 new_box->page_ = (*boxes)[i]->page_;
160 boxes->insert(boxes->begin() + i, new_box);
161 ++i;
162 } else {
163 (*boxes)[i - 1]->AddBox(x, y, width, height);
164 (*boxes)[i - 1]->ch_ = "\t";
165 }
166 max_shift = 0;
167 } else if (shift > max_shift) {
168 max_shift = shift;
169 }
170 }
171 prev_i = i;
172 }
173}
174
175// Converts nullptr boxes to space characters, with appropriate bounding boxes.
176/* static */
177void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
178 std::vector<BoxChar*>* boxes) {
179 // After InsertNewlines, any remaining null boxes are not newlines, and are
180 // singletons, so add a box to each remaining null box.
181 for (size_t i = 1; i + 1 < boxes->size(); ++i) {
182 Box* box = (*boxes)[i]->box_;
183 if (box == nullptr) {
184 Box* prev = (*boxes)[i - 1]->box_;
185 Box* next = (*boxes)[i + 1]->box_;
186 ASSERT_HOST(prev != nullptr && next != nullptr);
187 int top = std::min(prev->y, next->y);
188 int bottom = std::max(prev->y + prev->h, next->y + next->h);
189 int left = prev->x + prev->w;
190 int right = next->x;
191 if (vertical_rules) {
192 top = prev->y + prev->h;
193 bottom = next->y;
194 left = std::min(prev->x, next->x);
195 right = std::max(prev->x + prev->w, next->x + next->w);
196 } else if (rtl_rules) {
197 // With RTL we have to account for BiDi.
198 // Right becomes the min left of all prior boxes back to the first
199 // space or newline.
200 right = prev->x;
201 left = next->x + next->w;
202 for (int j = i - 2;
203 j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
204 --j) {
205 prev = (*boxes)[j]->box_;
206 ASSERT_HOST(prev != nullptr);
207 if (prev->x < right) {
208 right = prev->x;
209 }
210 }
211 // Left becomes the max right of all next boxes forward to the first
212 // space or newline.
213 for (size_t j = i + 2;
214 j < boxes->size() && (*boxes)[j]->box_ != nullptr &&
215 (*boxes)[j]->ch_ != "\t";
216 ++j) {
217 next = (*boxes)[j]->box_;
218 if (next->x + next->w > left) {
219 left = next->x + next->w;
220 }
221 }
222 }
223 // Italic and stylized characters can produce negative spaces, which
224 // Leptonica doesn't like, so clip to a positive size.
225 if (right <= left) right = left + 1;
226 if (bottom <= top) bottom = top + 1;
227 (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
228 (*boxes)[i]->ch_ = " ";
229 }
230 }
231}
232
233// Reorders text in a right-to-left script in left-to-right order.
234/* static */
235void BoxChar::ReorderRTLText(std::vector<BoxChar*>* boxes) {
236 // Ideally we need the inverse of the algorithm used by ResultIterator.
237 // For now, let's try a sort that reverses original positions for RTL
238 // characters, otherwise by x-position. This should be much closer to
239 // correct than just sorting by x-position.
240 size_t num_boxes = boxes->size();
241 for (size_t i = 0; i < num_boxes; ++i) {
242 int num_rtl = 0, num_ltr = 0;
243 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
244 if (num_rtl > num_ltr) {
245 (*boxes)[i]->set_rtl_index(i);
246 (*boxes)[i]->ReverseUnicodesInBox();
247 }
248 }
249 BoxCharPtrSort sorter;
250 size_t end = 0;
251 for (size_t start = 0; start < boxes->size(); start = end + 1) {
252 end = start + 1;
253 while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
254 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
255 }
256}
257
258// Returns true if the vector contains mostly RTL characters.
259/* static */
260bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar*>& boxes) {
261 int num_rtl = 0, num_ltr = 0;
262 for (size_t i = 0; i < boxes.size(); ++i) {
263 boxes[i]->GetDirection(&num_rtl, &num_ltr);
264 }
265 return num_rtl > num_ltr;
266}
267
268// Returns true if the text is mostly laid out vertically.
269/* static */
270bool BoxChar::MostlyVertical(const std::vector<BoxChar*>& boxes) {
271 int64_t total_dx = 0, total_dy = 0;
272 for (size_t i = 1; i < boxes.size(); ++i) {
273 if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
274 boxes[i - 1]->page_ == boxes[i]->page_) {
275 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
276 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
277 if (abs(dx) > abs(dy) * kMinNewlineRatio ||
278 abs(dy) > abs(dx) * kMinNewlineRatio) {
279 total_dx += dx * dx;
280 total_dy += dy * dy;
281 }
282 }
283 }
284 return total_dy > total_dx;
285}
286
287// Returns the total length of all the strings in the boxes.
288/* static */
289int BoxChar::TotalByteLength(const std::vector<BoxChar*>& boxes) {
290 int total_length = 0;
291 for (size_t i = 0; i < boxes.size(); ++i)
292 total_length += boxes[i]->ch_.size();
293 return total_length;
294}
295
296// Rotate the boxes in [start_box, end_box) by the given rotation.
297// The rotation is in radians clockwise about the given center.
298/* static */
299void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter,
300 int start_box, int end_box,
301 std::vector<BoxChar*>* boxes) {
302 Boxa* orig = boxaCreate(0);
303 for (int i = start_box; i < end_box; ++i) {
304 BOX* box = (*boxes)[i]->box_;
305 if (box) boxaAddBox(orig, box, L_CLONE);
306 }
307 Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
308 boxaDestroy(&orig);
309 for (int i = start_box, box_ind = 0; i < end_box; ++i) {
310 if ((*boxes)[i]->box_) {
311 boxDestroy(&((*boxes)[i]->box_));
312 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
313 }
314 }
315 boxaDestroy(&rotated);
316}
317
318const int kMaxLineLength = 1024;
319/* static */
320void BoxChar::WriteTesseractBoxFile(const std::string& filename, int height,
321 const std::vector<BoxChar*>& boxes) {
322 std::string output = GetTesseractBoxStr(height, boxes);
323 File::WriteStringToFileOrDie(output, filename);
324}
325
326/* static */
327std::string BoxChar::GetTesseractBoxStr(int height,
328 const std::vector<BoxChar*>& boxes) {
329 std::string output;
330 char buffer[kMaxLineLength];
331 for (size_t i = 0; i < boxes.size(); ++i) {
332 const Box* box = boxes[i]->box_;
333 if (box == nullptr) {
334 tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
335 return "";
336 }
337 int nbytes =
338 snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
339 boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
340 box->x + box->w, height - box->y, boxes[i]->page_);
341 output.append(buffer, nbytes);
342 }
343 return output;
344}
345
346} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
const int kMinNewlineRatio
Definition: boxchar.cpp:36
signed int char32
Definition: unichar.h:51
const int kMaxLineLength
Definition: boxchar.cpp:318
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
const Box * box() const
Definition: boxchar.h:44
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:270
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:320
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:97
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:177
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:40
void ReverseUnicodesInBox()
Definition: boxchar.cpp:76
void GetDirection(int *num_rtl, int *num_ltr) const
Definition: boxchar.cpp:51
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:235
static int TotalByteLength(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:289
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:327
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:45
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:299
const std::string & ch() const
Definition: boxchar.h:43
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:260
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:112
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:56