tesseract 4.1.1
Loading...
Searching...
No Matches
altorenderer.cpp
Go to the documentation of this file.
1// File: altorenderer.cpp
2// Description: ALTO rendering interface
3// Author: Jake Sebright
4
5// (C) Copyright 2018
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9// http://www.apache.org/licenses/LICENSE-2.0
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include <memory>
17#include <sstream> // for std::stringstream
18#include "baseapi.h"
19#ifdef _WIN32
20# include "host.h" // windows.h for MultiByteToWideChar, ...
21#endif
22#include "renderer.h"
23#include "strngs.h" // for STRING
24
25namespace tesseract {
26
30static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
31 std::stringstream& alto_str) {
32 int left, top, right, bottom;
33 it->BoundingBox(level, &left, &top, &right, &bottom);
34
35 int hpos = left;
36 int vpos = top;
37 int height = bottom - top;
38 int width = right - left;
39
40 alto_str << " HPOS=\"" << hpos << "\"";
41 alto_str << " VPOS=\"" << vpos << "\"";
42 alto_str << " WIDTH=\"" << width << "\"";
43 alto_str << " HEIGHT=\"" << height << "\"";
44
45 if (level == RIL_WORD) {
46 int wc = it->Confidence(RIL_WORD);
47 alto_str << " WC=\"0." << wc << "\"";
48 } else {
49 alto_str << ">";
50 }
51}
52
58 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
59 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
60 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
61 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
62 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
63 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
64 "\t<Description>\n"
65 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
66 "\t\t<sourceImageInformation>\n"
67 "\t\t\t<fileName>");
68
70
72 "</fileName>\n"
73 "\t\t</sourceImageInformation>\n"
74 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
75 "\t\t\t<ocrProcessingStep>\n"
76 "\t\t\t\t<processingSoftware>\n"
77 "\t\t\t\t\t<softwareName>tesseract ");
78 AppendString(TessBaseAPI::Version());
80 "</softwareName>\n"
81 "\t\t\t\t</processingSoftware>\n"
82 "\t\t\t</ocrProcessingStep>\n"
83 "\t\t</OCRProcessing>\n"
84 "\t</Description>\n"
85 "\t<Layout>\n");
86
87 return true;
88}
89
94 const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
95 if (text == nullptr) return false;
96
97 AppendString(text.get());
98
99 return true;
100}
101
106 AppendString("\t</Layout>\n</alto>\n");
107
108 return true;
109}
110
112 : TessResultRenderer(outputbase, "xml") {}
113
118char* TessBaseAPI::GetAltoText(int page_number) {
119 return GetAltoText(nullptr, page_number);
120}
121
126char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
127 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
128 return nullptr;
129
130 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
131
132 if (input_file_ == nullptr) SetInputName(nullptr);
133
134#ifdef _WIN32
135 // convert input name from ANSI encoding to utf-8
136 int str16_len =
137 MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
138 wchar_t* uni16_str = new WCHAR[str16_len];
139 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
140 uni16_str, str16_len);
141 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
142 0, nullptr, nullptr);
143 char* utf8_str = new char[utf8_len];
144 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
145 nullptr, nullptr);
146 *input_file_ = utf8_str;
147 delete[] uni16_str;
148 delete[] utf8_str;
149#endif
150
151 std::stringstream alto_str;
152 // Use "C" locale (needed for int values larger than 999).
153 alto_str.imbue(std::locale::classic());
154 alto_str
155 << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
156 << rect_height_
157 << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
158 << " ID=\"page_" << page_number << "\">\n"
159 << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
160 << " WIDTH=\"" << rect_width_ << "\""
161 << " HEIGHT=\"" << rect_height_ << "\">\n";
162
163 ResultIterator* res_it = GetIterator();
164 while (!res_it->Empty(RIL_BLOCK)) {
165 if (res_it->Empty(RIL_WORD)) {
166 res_it->Next(RIL_WORD);
167 continue;
168 }
169
170 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
171 alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
172 AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
173 alto_str << "\n";
174 }
175
176 if (res_it->IsAtBeginningOf(RIL_PARA)) {
177 alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
178 AddBoxToAlto(res_it, RIL_PARA, alto_str);
179 alto_str << "\n";
180 }
181
182 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
183 alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
184 AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
185 alto_str << "\n";
186 }
187
188 alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
189 AddBoxToAlto(res_it, RIL_WORD, alto_str);
190 alto_str << " CONTENT=\"";
191
192 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
193 bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
194 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
195
196
197 int left, top, right, bottom;
198 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
199
200 do {
201 const std::unique_ptr<const char[]> grapheme(
202 res_it->GetUTF8Text(RIL_SYMBOL));
203 if (grapheme && grapheme[0] != 0) {
204 alto_str << HOcrEscape(grapheme.get()).c_str();
205 }
206 res_it->Next(RIL_SYMBOL);
207 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
208
209 alto_str << "\"/>";
210
211 wcnt++;
212
213 if (last_word_in_line) {
214 alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
215 lcnt++;
216 } else {
217 int hpos = right;
218 int vpos = top;
219 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
220 int width = left - hpos;
221 alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
222 << "\" HPOS=\"" << hpos << "\"/>\n";
223 }
224
225 if (last_word_in_tblock) {
226 alto_str << "\t\t\t\t\t</TextBlock>\n";
227 tcnt++;
228 }
229
230 if (last_word_in_cblock) {
231 alto_str << "\t\t\t\t</ComposedBlock>\n";
232 bcnt++;
233 }
234 }
235
236 alto_str << "\t\t\t</PrintSpace>\n"
237 << "\t\t</Page>\n";
238 const std::string& text = alto_str.str();
239
240 char* result = new char[text.length() + 1];
241 strcpy(result, text.c_str());
242 delete res_it;
243 return result;
244}
245
246} // namespace tesseract
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2310
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:830
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
ResultIterator * GetIterator()
Definition: baseapi.cpp:1324
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:271
void AppendString(const char *s)
Definition: renderer.cpp:102
const char * title() const
Definition: renderer.h:88
bool BeginDocumentHandler() override
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool AddImageHandler(TessBaseAPI *api) override
float Confidence(PageIteratorLevel level) const
bool Empty(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override
const char * c_str() const
Definition: strngs.cpp:205
const char * string() const
Definition: strngs.cpp:194