tesseract 4.1.1
Loading...
Searching...
No Matches
strngs.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: strngs.cpp (Formerly strings.c)
3 * Description: STRING class functions.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include "strngs.h"
20#include <cassert> // for assert
21#include <cstdlib> // for malloc, free
22#include <locale> // for std::locale::classic
23#include <sstream> // for std::stringstream
24#include "errcode.h" // for ASSERT_HOST
25#include "genericvector.h" // for GenericVector
26#include "helpers.h" // for ReverseN
27#include "serialis.h" // for TFile
28
30
31// Size of buffer needed to host the decimal representation of the maximum
32// possible length of an int (in 64 bits), being -<20 digits>.
33const int kMaxIntSize = 22;
34
35/**********************************************************************
36 * STRING_HEADER provides metadata about the allocated buffer,
37 * including total capacity and how much used (strlen with '\0').
38 *
39 * The implementation hides this header at the start of the data
40 * buffer and appends the string on the end to keep sizeof(STRING)
41 * unchanged from earlier versions so serialization is not affected.
42 *
43 * The collection of MACROS provide different implementations depending
44 * on whether the string keeps track of its strlen or not so that this
45 * feature can be added in later when consumers don't modify the string
46 **********************************************************************/
47
48// Smallest string to allocate by default
49const int kMinCapacity = 16;
50
51char* STRING::AllocData(int used, int capacity) {
52 data_ = static_cast<STRING_HEADER *>(malloc(capacity + sizeof(STRING_HEADER)));
53
54 // header is the metadata for this memory block
55 STRING_HEADER* header = GetHeader();
56 header->capacity_ = capacity;
57 header->used_ = used;
58 return GetCStr();
59}
60
61void STRING::DiscardData() {
62 free(data_);
63 data_ = nullptr;
64}
65
66// This is a private method; ensure FixHeader is called (or used_ is well defined)
67// beforehand
68char* STRING::ensure_cstr(int32_t min_capacity) {
69 STRING_HEADER* orig_header = GetHeader();
70 if (min_capacity <= orig_header->capacity_)
71 return (reinterpret_cast<char *>(this->data_)) + sizeof(STRING_HEADER);
72
73 // if we are going to grow bigger, than double our existing
74 // size, but if that still is not big enough then keep the
75 // requested capacity
76 if (min_capacity < 2 * orig_header->capacity_)
77 min_capacity = 2 * orig_header->capacity_;
78
79 int alloc = sizeof(STRING_HEADER) + min_capacity;
80 auto* new_header = static_cast<STRING_HEADER*>(malloc(alloc));
81
82 memcpy(&new_header[1], GetCStr(), orig_header->used_);
83 new_header->capacity_ = min_capacity;
84 new_header->used_ = orig_header->used_;
85
86 // free old memory, then rebind to new memory
87 DiscardData();
88 data_ = new_header;
89
90 assert(InvariantOk());
91 return (reinterpret_cast<char *>(data_)) + sizeof(STRING_HEADER);
92}
93
94// This is const, but is modifying a mutable field
95// this way it can be used on const or non-const instances.
96void STRING::FixHeader() const {
97 const STRING_HEADER* header = GetHeader();
98 if (header->used_ < 0)
99 header->used_ = strlen(GetCStr()) + 1;
100}
101
102
104 // Empty STRINGs contain just the "\0".
105 memcpy(AllocData(1, kMinCapacity), "", 1);
106}
107
109 str.FixHeader();
110 const STRING_HEADER* str_header = str.GetHeader();
111 const int str_used = str_header->used_;
112 char *this_cstr = AllocData(str_used, str_used);
113 memcpy(this_cstr, str.GetCStr(), str_used);
114 assert(InvariantOk());
115}
116
117STRING::STRING(const char* cstr) {
118 if (cstr == nullptr) {
119 // Empty STRINGs contain just the "\0".
120 memcpy(AllocData(1, kMinCapacity), "", 1);
121 } else {
122 const int len = strlen(cstr) + 1;
123 char* this_cstr = AllocData(len, len);
124 memcpy(this_cstr, cstr, len);
125 }
126 assert(InvariantOk());
127}
128
129STRING::STRING(const char *data, int length) {
130 if (data == nullptr) {
131 // Empty STRINGs contain just the "\0".
132 memcpy(AllocData(1, kMinCapacity), "", 1);
133 } else {
134 char* this_cstr = AllocData(length + 1, length + 1);
135 memcpy(this_cstr, data, length);
136 this_cstr[length] = '\0';
137 }
138}
139
141 DiscardData();
142}
143
144// TODO(rays) Change all callers to use TFile and remove the old functions.
145// Writes to the given file. Returns false in case of error.
146bool STRING::Serialize(FILE* fp) const {
147 uint32_t len = length();
148 return tesseract::Serialize(fp, &len) &&
149 tesseract::Serialize(fp, GetCStr(), len);
150}
151// Writes to the given file. Returns false in case of error.
152bool STRING::Serialize(TFile* fp) const {
153 uint32_t len = length();
154 return fp->Serialize(&len) &&
155 fp->Serialize(GetCStr(), len);
156}
157// Reads from the given file. Returns false in case of error.
158// If swap is true, assumes a big/little-endian swap is needed.
159bool STRING::DeSerialize(bool swap, FILE* fp) {
160 uint32_t len;
161 if (!tesseract::DeSerialize(fp, &len)) return false;
162 if (swap)
163 ReverseN(&len, sizeof(len));
164 // Arbitrarily limit the number of characters to protect against bad data.
165 if (len > UINT16_MAX) return false;
166 truncate_at(len);
167 return tesseract::DeSerialize(fp, GetCStr(), len);
168}
169// Reads from the given file. Returns false in case of error.
170// If swap is true, assumes a big/little-endian swap is needed.
172 uint32_t len;
173 if (!fp->DeSerialize(&len)) return false;
174 truncate_at(len);
175 return fp->DeSerialize(GetCStr(), len);
176}
177
178// As DeSerialize, but only seeks past the data - hence a static method.
180 uint32_t len;
181 if (!fp->DeSerialize(&len)) return false;
182 return fp->Skip(len);
183}
184
185bool STRING::contains(const char c) const {
186 return (c != '\0') && (strchr (GetCStr(), c) != nullptr);
187}
188
189int32_t STRING::length() const {
190 FixHeader();
191 return GetHeader()->used_ - 1;
192}
193
194const char* STRING::string() const {
195 const STRING_HEADER* header = GetHeader();
196 if (!header || header->used_ == 0)
197 return nullptr;
198
199 // mark header length unreliable because tesseract might
200 // cast away the const and mutate the string directly.
201 header->used_ = -1;
202 return GetCStr();
203}
204
205const char* STRING::c_str() const {
206 return string();
207}
208
209/******
210 * The STRING_IS_PROTECTED interface adds additional support to migrate
211 * code that needs to modify the STRING in ways not otherwise supported
212 * without violating encapsulation.
213 *
214 * Also makes the [] operator return a const so it is immutable
215 */
216#if STRING_IS_PROTECTED
217const char& STRING::operator[](int32_t index) const {
218 return GetCStr()[index];
219}
220
221void STRING::insert_range(int32_t index, const char* str, int len) {
222 // if index is outside current range, then also grow size of string
223 // to accmodate the requested range.
224 STRING_HEADER* this_header = GetHeader();
225 int used = this_header->used_;
226 if (index > used)
227 used = index;
228
229 char* this_cstr = ensure_cstr(used + len + 1);
230 if (index < used) {
231 // move existing string from index to '\0' inclusive.
232 memmove(this_cstr + index + len,
233 this_cstr + index,
234 this_header->used_ - index);
235 } else if (len > 0) {
236 // We are going to overwrite previous null terminator, so write the new one.
237 this_cstr[this_header->used_ + len - 1] = '\0';
238
239 // If the old header did not have the terminator,
240 // then we need to account for it now that we've added it.
241 // Otherwise it was already accounted for; we just moved it.
242 if (this_header->used_ == 0)
243 ++this_header->used_;
244 }
245
246 // Write new string to index.
247 // The string is already terminated from the conditions above.
248 memcpy(this_cstr + index, str, len);
249 this_header->used_ += len;
250
251 assert(InvariantOk());
252}
253
254void STRING::erase_range(int32_t index, int len) {
255 char* this_cstr = GetCStr();
256 STRING_HEADER* this_header = GetHeader();
257
258 memcpy(this_cstr+index, this_cstr+index+len,
259 this_header->used_ - index - len);
260 this_header->used_ -= len;
261 assert(InvariantOk());
262}
263
264#else
265void STRING::truncate_at(int32_t index) {
266 ASSERT_HOST(index >= 0);
267 FixHeader();
268 char* this_cstr = ensure_cstr(index + 1);
269 this_cstr[index] = '\0';
270 GetHeader()->used_ = index + 1;
271 assert(InvariantOk());
272}
273
274char& STRING::operator[](int32_t index) const {
275 // Code is casting away this const and mutating the string,
276 // so mark used_ as -1 to flag it unreliable.
277 GetHeader()->used_ = -1;
278 return (const_cast<char *>(GetCStr()))[index];
279}
280#endif
281
282void STRING::split(const char c, GenericVector<STRING> *splited) {
283 int start_index = 0;
284 const int len = length();
285 for (int i = 0; i < len; i++) {
286 if ((*this)[i] == c) {
287 if (i != start_index) {
288 (*this)[i] = '\0';
289 splited->push_back(STRING(GetCStr() + start_index, i - start_index));
290 (*this)[i] = c;
291 }
292 start_index = i + 1;
293 }
294 }
295
296 if (len != start_index) {
297 splited->push_back(STRING(GetCStr() + start_index, len - start_index));
298 }
299}
300
301bool STRING::operator==(const STRING& str) const {
302 FixHeader();
303 str.FixHeader();
304 const STRING_HEADER* str_header = str.GetHeader();
305 const STRING_HEADER* this_header = GetHeader();
306 const int this_used = this_header->used_;
307 const int str_used = str_header->used_;
308
309 return (this_used == str_used)
310 && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
311}
312
313bool STRING::operator!=(const STRING& str) const {
314 FixHeader();
315 str.FixHeader();
316 const STRING_HEADER* str_header = str.GetHeader();
317 const STRING_HEADER* this_header = GetHeader();
318 const int this_used = this_header->used_;
319 const int str_used = str_header->used_;
320
321 return (this_used != str_used)
322 || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
323}
324
325bool STRING::operator!=(const char* cstr) const {
326 FixHeader();
327 const STRING_HEADER* this_header = GetHeader();
328
329 if (cstr == nullptr)
330 return this_header->used_ > 1; // either '\0' or nullptr
331 else {
332 const int32_t length = strlen(cstr) + 1;
333 return (this_header->used_ != length)
334 || (memcmp(GetCStr(), cstr, length) != 0);
335 }
336}
337
339 str.FixHeader();
340 const STRING_HEADER* str_header = str.GetHeader();
341 const int str_used = str_header->used_;
342
343 GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
344 char* this_cstr = ensure_cstr(str_used);
345 STRING_HEADER* this_header = GetHeader();
346
347 memcpy(this_cstr, str.GetCStr(), str_used);
348 this_header->used_ = str_used;
349
350 assert(InvariantOk());
351 return *this;
352}
353
355 FixHeader();
356 str.FixHeader();
357 const STRING_HEADER* str_header = str.GetHeader();
358 const char* str_cstr = str.GetCStr();
359 const int str_used = str_header->used_;
360 const int this_used = GetHeader()->used_;
361 char* this_cstr = ensure_cstr(this_used + str_used);
362
363 STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
364
365 if (this_used > 1) {
366 memcpy(this_cstr + this_used - 1, str_cstr, str_used);
367 this_header->used_ += str_used - 1; // overwrite '\0'
368 } else {
369 memcpy(this_cstr, str_cstr, str_used);
370 this_header->used_ = str_used;
371 }
372
373 assert(InvariantOk());
374 return *this;
375}
376
377void STRING::add_str_int(const char* str, int number) {
378 if (str != nullptr)
379 *this += str;
380 // Allow space for the maximum possible length of int64_t.
381 char num_buffer[kMaxIntSize];
382 snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
383 num_buffer[kMaxIntSize - 1] = '\0';
384 *this += num_buffer;
385}
386// Appends the given string and double (as a %.8g) to this.
387void STRING::add_str_double(const char* str, double number) {
388 if (str != nullptr)
389 *this += str;
390 std::stringstream stream;
391 // Use "C" locale (needed for double value).
392 stream.imbue(std::locale::classic());
393 // Use 8 digits for double value.
394 stream.precision(8);
395 stream << number;
396 *this += stream.str().c_str();
397}
398
399STRING & STRING::operator=(const char* cstr) {
400 STRING_HEADER* this_header = GetHeader();
401 if (cstr) {
402 const int len = strlen(cstr) + 1;
403
404 this_header->used_ = 0; // don't bother copying data if need to realloc
405 char* this_cstr = ensure_cstr(len);
406 this_header = GetHeader(); // for realloc
407 memcpy(this_cstr, cstr, len);
408 this_header->used_ = len;
409 } else {
410 // Reallocate to same state as default constructor.
411 DiscardData();
412 // Empty STRINGs contain just the "\0".
413 memcpy(AllocData(1, kMinCapacity), "", 1);
414 }
415
416 assert(InvariantOk());
417 return *this;
418}
419
420void STRING::assign(const char *cstr, int len) {
421 STRING_HEADER* this_header = GetHeader();
422 this_header->used_ = 0; // don't bother copying data if need to realloc
423 char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
424
425 this_header = GetHeader(); // for realloc
426 memcpy(this_cstr, cstr, len);
427 this_cstr[len] = '\0';
428 this_header->used_ = len + 1;
429
430 assert(InvariantOk());
431}
432
433STRING STRING::operator+(const STRING& str) const {
434 STRING result(*this);
435 result += str;
436
437 assert(InvariantOk());
438 return result;
439}
440
441
442STRING STRING::operator+(const char ch) const {
443 STRING result;
444 FixHeader();
445 const STRING_HEADER* this_header = GetHeader();
446 const int this_used = this_header->used_;
447 char* result_cstr = result.ensure_cstr(this_used + 1);
448 STRING_HEADER* result_header = result.GetHeader();
449 const int result_used = result_header->used_;
450
451 // copies '\0' but we'll overwrite that
452 memcpy(result_cstr, GetCStr(), this_used);
453 result_cstr[result_used] = ch; // overwrite old '\0'
454 result_cstr[result_used + 1] = '\0'; // append on '\0'
455 ++result_header->used_;
456
457 assert(InvariantOk());
458 return result;
459}
460
461
462STRING& STRING::operator+=(const char *str) {
463 if (!str || !*str) // empty string has no effect
464 return *this;
465
466 FixHeader();
467 const int len = strlen(str) + 1;
468 const int this_used = GetHeader()->used_;
469 char* this_cstr = ensure_cstr(this_used + len);
470 STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
471
472 // if we had non-empty string then append overwriting old '\0'
473 // otherwise replace
474 if (this_used > 0) {
475 memcpy(this_cstr + this_used - 1, str, len);
476 this_header->used_ += len - 1;
477 } else {
478 memcpy(this_cstr, str, len);
479 this_header->used_ = len;
480 }
481
482 assert(InvariantOk());
483 return *this;
484}
485
486
487STRING& STRING::operator+=(const char ch) {
488 if (ch == '\0')
489 return *this;
490
491 FixHeader();
492 int this_used = GetHeader()->used_;
493 char* this_cstr = ensure_cstr(this_used + 1);
494 STRING_HEADER* this_header = GetHeader();
495
496 if (this_used > 0)
497 --this_used; // undo old empty null if there was one
498
499 this_cstr[this_used++] = ch; // append ch to end
500 this_cstr[this_used++] = '\0'; // append '\0' after ch
501 this_header->used_ = this_used;
502
503 assert(InvariantOk());
504 return *this;
505}
#define ASSERT_HOST(x)
Definition: errcode.h:88
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
const int kMinCapacity
Definition: strngs.cpp:49
const int kMaxIntSize
Definition: strngs.cpp:33
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:28
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
int push_back(T object)
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:148
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:104
bool Skip(size_t count)
Definition: serialis.cpp:192
Definition: strngs.h:45
bool Serialize(FILE *fp) const
Definition: strngs.cpp:146
char & operator[](int32_t index) const
Definition: strngs.cpp:274
STRING()
Definition: strngs.cpp:103
void truncate_at(int32_t index)
Definition: strngs.cpp:265
~STRING()
Definition: strngs.cpp:140
bool operator==(const STRING &string) const
Definition: strngs.cpp:301
bool operator!=(const STRING &string) const
Definition: strngs.cpp:313
const char * c_str() const
Definition: strngs.cpp:205
STRING & operator=(const char *string)
Definition: strngs.cpp:399
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
int32_t length() const
Definition: strngs.cpp:189
void add_str_double(const char *str, double number)
Definition: strngs.cpp:387
void assign(const char *cstr, int len)
Definition: strngs.cpp:420
bool contains(char c) const
Definition: strngs.cpp:185
const char * string() const
Definition: strngs.cpp:194
void split(char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:282
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:159
STRING operator+(const STRING &string) const
Definition: strngs.cpp:433
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:179
STRING & operator+=(const char *string)
Definition: strngs.cpp:462