tesseract 4.1.1
Loading...
Searching...
No Matches
scanutils.cpp
Go to the documentation of this file.
1// Copyright 2006 Google Inc.
2// All Rights Reserved.
3// Author: renn
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8// http://www.apache.org/licenses/LICENSE-2.0
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#ifdef HAVE_CONFIG_H
16#include "config_auto.h"
17#endif
18
19#include <cctype>
20#include <climits> // for CHAR_BIT
21#include <cmath>
22#include <cstdarg>
23#include <cstddef>
24#include <cstdint>
25#include <cstdio>
26#include <cstring>
27#include <limits> // for std::numeric_limits
28
29#include "scanutils.h"
30
// Bit flags collected while parsing a single conversion specification.
// Each enumerator occupies its own bit so they can be OR-ed together.
enum Flags {
  FL_SPLAT = 1 << 0,  // Drop the value, do not assign
  FL_INV   = 1 << 1,  // Character-set with inverse
  FL_WIDTH = 1 << 2,  // Field width specified
  FL_MINUS = 1 << 3,  // Negative number
};
37
38enum Ranks {
44 RANK_PTR = std::numeric_limits<int>::max() // Special value used for pointers
45};
46
49
53
// Reason the scanner stopped before consuming the whole format string.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
59
60// Helper functions ------------------------------------------------------------
// Returns the number of bits in one `long`, i.e. the width of a single word
// of the character-set bitmap used by SetBit() / TestBit().
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
64
// Consumes ASCII whitespace from the front of |s|.
// The first character that is not whitespace is handed back to the stream
// with ungetc() so that the next read sees it, and the same value is
// returned to the caller (this is EOF when the stream ran out).
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isascii(next) && isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
72
73static inline void
74SetBit(unsigned long *bitmap, unsigned int bit) {
75 bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
76}
77
78static inline int
79TestBit(unsigned long *bitmap, unsigned int bit) {
80 return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
81}
82
// Returns the numeric value of the digit character |ch| when it is a valid
// digit for |base|, or -1 when it is not.
//
// - '0'..'9' are accepted for bases >= 10; for smaller bases only '0'..'7'
//   are accepted (the callers in this file use bases 8, 10 and 16).
// - Letters are accepted for base 16 only, and only 'A'..'F' / 'a'..'f'.
//   Accepting the whole alphabet would return values >= 16 and let input
//   such as "0x1G" be parsed as if 'G' were a hexadecimal digit.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
    return -1;
  }
  if (base == 16) {
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}
94
95// IO (re-)implementations -----------------------------------------------------
96static uintmax_t streamtoumax(FILE* s, int base) {
97 int minus = 0;
98 uintmax_t v = 0;
99 int d, c = 0;
100
101 for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));
102
103 // Single optional + or -
104 if (c == '-' || c == '+') {
105 minus = (c == '-');
106 c = fgetc(s);
107 }
108
109 // Assign correct base
110 if (base == 0) {
111 if (c == '0') {
112 c = fgetc(s);
113 if (c == 'x' || c == 'X') {
114 base = 16;
115 c = fgetc(s);
116 } else {
117 base = 8;
118 }
119 }
120 } else if (base == 16) {
121 if (c == '0') {
122 c = fgetc(s);
123 if (c == 'x' || c == 'X') c = fgetc(s);
124 }
125 }
126
127 // Actual number parsing
128 for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
129 v = v*base + d;
130
131 ungetc(c, s);
132 return minus ? -v : v;
133}
134
// Reads a decimal floating point number from |s| and returns it as a double.
//
// Accepted syntax: optional ASCII whitespace, an optional '+' or '-', decimal
// digits, an optional '.' followed by decimal digits, and an optional
// exponent ('e' or 'E', optional sign, decimal digits). The first character
// that does not belong to the number is pushed back onto the stream.
//
// The integer and fractional parts are accumulated in 64-bit integers so
// that ordinary inputs are converted with a single rounding step. Inputs too
// long for those accumulators are handled without wrapping:
//   - an integer part that would overflow continues in a double;
//   - fractional digits beyond the 19th are consumed but ignored (10^19 is
//     the largest power of ten that fits in 64 bits, and the extra digits
//     are below double precision anyway);
//   - the exponent magnitude is clamped, since such values already saturate
//     to infinity or zero.
static double streamtofloat(FILE* s) {
  const int kMaxFractionDigits = 19;
  const int kExponentLimit = 100000;
  const uint64_t kMaxU64 = std::numeric_limits<uint64_t>::max();

  bool minus = false;
  int c = fgetc(s);
  while (isascii(c) && isspace(c))
    c = fgetc(s);

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part. EOF is negative, so the digit range test also stops there.
  uint64_t v = 0;
  double v_overflow = 0.0;  // Used only once v can no longer hold the value
  bool v_overflowed = false;
  for (; c >= '0' && c <= '9'; c = fgetc(s)) {
    const int d = c - '0';
    if (!v_overflowed && v > (kMaxU64 - d) / 10) {
      v_overflowed = true;
      v_overflow = static_cast<double>(v);
    }
    if (v_overflowed)
      v_overflow = v_overflow * 10 + d;
    else
      v = v * 10 + d;
  }

  // Fractional part: value is w / k.
  uint64_t w = 0;
  uint64_t k = 1;
  if (c == '.') {
    int fraction_digits = 0;
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      if (fraction_digits < kMaxFractionDigits) {
        w = w * 10 + (c - '0');
        k *= 10;
        ++fraction_digits;
      }
    }
  }

  const double int_part = v_overflowed ? v_overflow : static_cast<double>(v);
  double f = int_part + static_cast<double>(w) / static_cast<double>(k);

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      if (exponent < kExponentLimit)  // Avoid signed overflow
        exponent = exponent * 10 + (c - '0');
    }
    exponent *= expsign;
    f *= std::pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);  // Leave the terminating character for the next reader

  return minus ? -f : f;
}
178
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// fscanf()-style entry point: packages the variadic arguments into a va_list
// and hands them to tvfscanf(), whose result is returned unchanged.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
191
192static int tvfscanf(FILE* stream, const char *format, va_list ap) {
193 const char *p = format;
194 char ch;
195 int q = 0;
196 uintmax_t val = 0;
197 int rank = RANK_INT; // Default rank
198 unsigned int width = UINT_MAX;
199 int base;
200 int flags = 0;
201 enum {
202 ST_NORMAL, // Ground state
203 ST_FLAGS, // Special flags
204 ST_WIDTH, // Field width
205 ST_MODIFIERS, // Length or conversion modifiers
206 ST_MATCH_INIT, // Initial state of %[ sequence
207 ST_MATCH, // Main state of %[ sequence
208 ST_MATCH_RANGE, // After - in a %[ sequence
209 } state = ST_NORMAL;
210 char *sarg = nullptr; // %s %c or %[ string argument
211 enum Bail bail = BAIL_NONE;
212 int converted = 0; // Successful conversions
213 unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
214 (CHAR_BIT * sizeof(long))];
215 int matchinv = 0; // Is match map inverted?
216 unsigned char range_start = 0;
217 auto start_off = std::ftell(stream);
218
219 // Skip leading spaces
220 SkipSpace(stream);
221
222 while ((ch = *p++) && !bail) {
223 switch (state) {
224 case ST_NORMAL:
225 if (ch == '%') {
226 state = ST_FLAGS;
227 flags = 0; rank = RANK_INT; width = UINT_MAX;
228 } else if (isascii(ch) && isspace(ch)) {
229 SkipSpace(stream);
230 } else {
231 if (fgetc(stream) != ch)
232 bail = BAIL_ERR; // Match failure
233 }
234 break;
235
236 case ST_FLAGS:
237 if (ch == '*') {
238 flags |= FL_SPLAT;
239 } else if ('0' <= ch && ch <= '9') {
240 width = (ch-'0');
241 state = ST_WIDTH;
242 flags |= FL_WIDTH;
243 } else {
244 state = ST_MODIFIERS;
245 p--; // Process this character again
246 }
247 break;
248
249 case ST_WIDTH:
250 if (ch >= '0' && ch <= '9') {
251 width = width*10+(ch-'0');
252 } else {
253 state = ST_MODIFIERS;
254 p--; // Process this character again
255 }
256 break;
257
258 case ST_MODIFIERS:
259 switch (ch) {
260 // Length modifiers - nonterminal sequences
261 case 'h':
262 rank--; // Shorter rank
263 break;
264 case 'l':
265 rank++; // Longer rank
266 break;
267 case 'j':
268 rank = kIntMaxRank;
269 break;
270 case 'z':
271 rank = kSizeTRank;
272 break;
273 case 't':
274 rank = kPtrDiffRank;
275 break;
276 case 'L':
277 case 'q':
278 rank = RANK_LONGLONG; // long double/long long
279 break;
280
281 default:
282 // Output modifiers - terminal sequences
283 state = ST_NORMAL; // Next state will be normal
284 if (rank < kMinRank) // Canonicalize rank
285 rank = kMinRank;
286 else if (rank > kMaxRank)
287 rank = kMaxRank;
288
289 switch (ch) {
290 case 'P': // Upper case pointer
291 case 'p': // Pointer
292 rank = RANK_PTR;
293 base = 0;
294 goto scan_int;
295
296 case 'i': // Base-independent integer
297 base = 0;
298 goto scan_int;
299
300 case 'd': // Decimal integer
301 base = 10;
302 goto scan_int;
303
304 case 'o': // Octal integer
305 base = 8;
306 goto scan_int;
307
308 case 'u': // Unsigned decimal integer
309 base = 10;
310 goto scan_int;
311
312 case 'x': // Hexadecimal integer
313 case 'X':
314 base = 16;
315 goto scan_int;
316
317 case 'n': // Number of characters consumed
318 val = std::ftell(stream) - start_off;
319 goto set_integer;
320
321 scan_int:
322 q = SkipSpace(stream);
323 if (q <= 0) {
324 bail = BAIL_EOF;
325 break;
326 }
327 val = streamtoumax(stream, base);
328 // fall through
329
330 set_integer:
331 if (!(flags & FL_SPLAT)) {
332 converted++;
333 switch(rank) {
334 case RANK_CHAR:
335 *va_arg(ap, unsigned char *)
336 = static_cast<unsigned char>(val);
337 break;
338 case RANK_SHORT:
339 *va_arg(ap, unsigned short *)
340 = static_cast<unsigned short>(val);
341 break;
342 case RANK_INT:
343 *va_arg(ap, unsigned int *)
344 = static_cast<unsigned int>(val);
345 break;
346 case RANK_LONG:
347 *va_arg(ap, unsigned long *)
348 = static_cast<unsigned long>(val);
349 break;
350 case RANK_LONGLONG:
351 *va_arg(ap, unsigned long long *)
352 = static_cast<unsigned long long>(val);
353 break;
354 case RANK_PTR:
355 *va_arg(ap, void **)
356 = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
357 break;
358 }
359 }
360 break;
361
362 case 'f': // Preliminary float value parsing
363 case 'g':
364 case 'G':
365 case 'e':
366 case 'E':
367 q = SkipSpace(stream);
368 if (q <= 0) {
369 bail = BAIL_EOF;
370 break;
371 }
372
373 {
374 double fval = streamtofloat(stream);
375 if (!(flags & FL_SPLAT)) {
376 if (rank == RANK_INT)
377 *va_arg(ap, float *) = static_cast<float>(fval);
378 else if (rank == RANK_LONG)
379 *va_arg(ap, double *) = static_cast<double>(fval);
380 converted++;
381 }
382 }
383 break;
384
385 case 'c': // Character
386 width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
387 sarg = va_arg(ap, char *);
388 while (width--) {
389 if ((q = fgetc(stream)) <= 0) {
390 bail = BAIL_EOF;
391 break;
392 }
393 if (!(flags & FL_SPLAT)) {
394 *sarg++ = q;
395 converted++;
396 }
397 }
398 break;
399
400 case 's': // String
401 {
402 if (!(flags & FL_SPLAT)) {
403 sarg = va_arg(ap, char *);
404 }
405 unsigned length = 0;
406 while (width--) {
407 q = fgetc(stream);
408 if ((isascii(q) && isspace(q)) || (q <= 0)) {
409 ungetc(q, stream);
410 break;
411 }
412 if (!(flags & FL_SPLAT)) {
413 sarg[length] = q;
414 }
415 length++;
416 }
417 if (length == 0) {
418 bail = BAIL_EOF;
419 } else if (!(flags & FL_SPLAT)) {
420 sarg[length] = '\0'; // Terminate output
421 converted++;
422 }
423 }
424 break;
425
426 case '[': // Character range
427 sarg = va_arg(ap, char *);
428 state = ST_MATCH_INIT;
429 matchinv = 0;
430 memset(matchmap, 0, sizeof matchmap);
431 break;
432
433 case '%': // %% sequence
434 if (fgetc(stream) != '%')
435 bail = BAIL_ERR;
436 break;
437
438 default: // Anything else
439 bail = BAIL_ERR; // Unknown sequence
440 break;
441 }
442 }
443 break;
444
445 case ST_MATCH_INIT: // Initial state for %[ match
446 if (ch == '^' && !(flags & FL_INV)) {
447 matchinv = 1;
448 } else {
449 SetBit(matchmap, static_cast<unsigned char>(ch));
450 state = ST_MATCH;
451 }
452 break;
453
454 case ST_MATCH: // Main state for %[ match
455 if (ch == ']') {
456 goto match_run;
457 } else if (ch == '-') {
458 range_start = static_cast<unsigned char>(ch);
459 state = ST_MATCH_RANGE;
460 } else {
461 SetBit(matchmap, static_cast<unsigned char>(ch));
462 }
463 break;
464
465 case ST_MATCH_RANGE: // %[ match after -
466 if (ch == ']') {
467 SetBit(matchmap, static_cast<unsigned char>('-'));
468 goto match_run;
469 } else {
470 int i;
471 for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
472 SetBit(matchmap, i);
473 state = ST_MATCH;
474 }
475 break;
476
477 match_run: // Match expression finished
478 char* oarg = sarg;
479 while (width) {
480 q = fgetc(stream);
481 auto qc = static_cast<unsigned char>(q);
482 if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
483 ungetc(q, stream);
484 break;
485 }
486 if (!(flags & FL_SPLAT)) *sarg = q;
487 sarg++;
488 }
489 if (oarg == sarg) {
490 bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
491 } else if (!(flags & FL_SPLAT)) {
492 *sarg = '\0';
493 converted++;
494 }
495 break;
496 }
497 }
498
499 if (bail == BAIL_EOF && !converted)
500 converted = -1; // Return EOF (-1)
501
502 return converted;
503}
Ranks
Definition: scanutils.cpp:38
@ RANK_LONGLONG
Definition: scanutils.cpp:43
@ RANK_SHORT
Definition: scanutils.cpp:40
@ RANK_CHAR
Definition: scanutils.cpp:39
@ RANK_LONG
Definition: scanutils.cpp:42
@ RANK_INT
Definition: scanutils.cpp:41
@ RANK_PTR
Definition: scanutils.cpp:44
enum Ranks kIntMaxRank
Definition: scanutils.cpp:50
enum Ranks kSizeTRank
Definition: scanutils.cpp:51
enum Ranks kMinRank
Definition: scanutils.cpp:47
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:52
Bail
Definition: scanutils.cpp:54
@ BAIL_NONE
Definition: scanutils.cpp:55
@ BAIL_ERR
Definition: scanutils.cpp:57
@ BAIL_EOF
Definition: scanutils.cpp:56
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:181
Flags
Definition: scanutils.cpp:31
@ FL_SPLAT
Definition: scanutils.cpp:32
@ FL_MINUS
Definition: scanutils.cpp:35
@ FL_INV
Definition: scanutils.cpp:33
@ FL_WIDTH
Definition: scanutils.cpp:34
enum Ranks kMaxRank
Definition: scanutils.cpp:48
size_t LongBit()
Definition: scanutils.cpp:61