tesseract 4.1.1
Loading...
Searching...
No Matches
boxchar.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: boxchar.h
3 * Description: Simple class to associate a Tesseract classification unit with
4 * its bounding box so that the boxes can be rotated as the image
5 * is rotated for degradation. Also includes routines to output
6 * the character-tagged boxes to a boxfile.
7 * Author: Ray Smith
8 * Created: Mon Nov 18 2013
9 *
10 * (C) Copyright 2013, Google Inc.
11 * Licensed under the Apache License, Version 2.0 (the "License");
12 * you may not use this file except in compliance with the License.
13 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 *
21 **********************************************************************/
22
23#ifndef TESSERACT_TRAINING_BOXCHAR_H_
24#define TESSERACT_TRAINING_BOXCHAR_H_
25
26#include <string>
27#include <vector>
28
29#include "allheaders.h" // from Leptonica
30#include "platform.h"
31
32struct Box;
33
34namespace tesseract {
35
36class BoxChar {
37 public:
38 BoxChar(const char* utf8_str, int len);
39
40 ~BoxChar();
41
42 // Accessors.
43 const std::string& ch() const { return ch_; }
44 const Box* box() const { return box_; }
45 const int& page() const { return page_; }
46 void set_rtl_index(int index) { rtl_index_ = index; }
47 const int& rtl_index() const { return rtl_index_; }
48
49 // Set the box_ member.
50 void AddBox(int x, int y, int width, int height);
51
52 void set_page(int page) { page_ = page; }
53
54 std::string* mutable_ch() { return &ch_; }
55 Box* mutable_box() { return box_; }
56
57 // Sort function for sorting by left edge of box. Note that this will not
58 // work properly until after InsertNewlines and InsertSpaces.
59 bool operator<(const BoxChar& other) const {
60 if (box_ == nullptr) return true;
61 if (other.box_ == nullptr) return false;
62 return box_->x < other.box_->x;
63 }
64 // Increments *num_rtl and *num_ltr according to the directionality of
65 // characters in the box.
66 void GetDirection(int* num_rtl, int* num_ltr) const;
67 // Reverses the order of unicodes within the box. If Pango generates a
68 // ligature, these will get reversed on output, so reverse now.
70
71 static void TranslateBoxes(int xshift, int yshift,
72 std::vector<BoxChar*>* boxes);
73
74 // Prepares for writing the boxes to a file by inserting newlines, spaces,
75 // and re-ordering so the boxes are strictly left-to-right.
76 static void PrepareToWrite(std::vector<BoxChar*>* boxes);
77 // Inserts newline (tab) characters into the vector at newline positions.
78 static void InsertNewlines(bool rtl_rules, bool vertical_rules,
79 std::vector<BoxChar*>* boxes);
80 // Converts nullptr boxes to space characters, with appropriate bounding
81 // boxes.
82 static void InsertSpaces(bool rtl_rules, bool vertical_rules,
83 std::vector<BoxChar*>* boxes);
84 // Reorders text in a right-to-left script in left-to-right order.
85 static void ReorderRTLText(std::vector<BoxChar*>* boxes);
86 // Returns true if the vector contains mostly RTL characters.
87 static bool ContainsMostlyRTL(const std::vector<BoxChar*>& boxes);
88 // Returns true if the text is mostly laid out vertically.
89 static bool MostlyVertical(const std::vector<BoxChar*>& boxes);
90
91 // Returns the total length of all the strings in the boxes.
92 static int TotalByteLength(const std::vector<BoxChar*>& boxes);
93
94 // Rotate the vector of boxes between start and end by the given rotation.
95 // The rotation is in radians clockwise about the given center.
96 static void RotateBoxes(float rotation,
97 int xcenter,
98 int ycenter,
99 int start_box,
100 int end_box,
101 std::vector<BoxChar*>* boxes);
102
103 // Create a tesseract box file from the vector of boxes. The image height
104 // is needed to convert to tesseract coordinates.
105 static void WriteTesseractBoxFile(const std::string& name, int height,
106 const std::vector<BoxChar*>& boxes);
107 // Gets the tesseract box file as a string from the vector of boxes.
108 // The image height is needed to convert to tesseract coordinates.
109 static std::string GetTesseractBoxStr(int height,
110 const std::vector<BoxChar*>& boxes);
111
112 private:
113 std::string ch_;
114 Box* box_;
115 int page_;
116 // If the box is an RTL character, contains the original position in the
117 // array of boxes (before reversal), otherwise -1.
118 int rtl_index_;
119};
120
121// Sort predicate to sort a vector of BoxChar*.
123 bool operator()(const BoxChar* box1, const BoxChar* box2) const {
124 if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0)
125 return box2->rtl_index() < box1->rtl_index();
126 return *box1 < *box2;
127 }
128};
129
130} // namespace tesseract
131
132#endif // TESSERACT_TRAINING_BOXCHAR_H_
const Box * box() const
Definition: boxchar.h:44
bool operator<(const BoxChar &other) const
Definition: boxchar.h:59
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:270
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:320
Box * mutable_box()
Definition: boxchar.h:55
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:97
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:177
void set_rtl_index(int index)
Definition: boxchar.h:46
void ReverseUnicodesInBox()
Definition: boxchar.cpp:76
void set_page(int page)
Definition: boxchar.h:52
void GetDirection(int *num_rtl, int *num_ltr) const
Definition: boxchar.cpp:51
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:235
static int TotalByteLength(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:289
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:327
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
const int & page() const
Definition: boxchar.h:45
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:45
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:299
const std::string & ch() const
Definition: boxchar.h:43
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:260
const int & rtl_index() const
Definition: boxchar.h:47
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:112
std::string * mutable_ch()
Definition: boxchar.h:54
bool operator()(const BoxChar *box1, const BoxChar *box2) const
Definition: boxchar.h:123