tesseract 4.1.1
tesstrain_utils Namespace Reference

Classes

class  TrainingArgs
 

Functions

def err_exit (msg)
 
def run_command (cmd, *args, env=None)
 
def check_file_readable (*filenames)
 
def parse_flags (argv=None)
 
def cleanup (ctx)
 
def initialize_fontconfig (ctx)
 
def make_fontname (font)
 
def make_outbase (ctx, fontname, exposure)
 
def generate_font_image (ctx, font, exposure, char_spacing)
 
def phase_I_generate_image (ctx, par_factor=None)
 
def phase_UP_generate_unicharset (ctx)
 
def phase_E_extract_features (ctx, box_config, ext)
 
def make_lstmdata (ctx)
 

Variables

 log = logging.getLogger(__name__)
 
 parser
 
 dest
 
 nargs
 
 type
 
 help
 
 metavar
 
 int
 
 action
 
 inputdata_group
 
 tessdata_group
 

Function Documentation

◆ check_file_readable()

def tesstrain_utils.check_file_readable(*filenames)

Definition at line 113 of file tesstrain_utils.py.

def check_file_readable(*filenames):
    if isinstance(filenames, (str, pathlib.Path)):
        filenames = [filenames]
    for filename in filenames:
        try:
            with pathlib.Path(filename).open():
                pass
        except FileNotFoundError:
            err_exit(f"Required/expected file '{filename}' does not exist")
        except PermissionError:
            err_exit(f"{filename} is not readable")
        except IOError as e:
            err_exit(f"{filename} IO Error: {str(e)}")
    return True
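Usage sketch (the file names are placeholders; any mix of str and pathlib.Path arguments is accepted):

    check_file_readable("eng.Arial.exp0.box", "eng.Arial.exp0.tif")
    check_file_readable(pathlib.Path("langdata/eng/eng.training_text"))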

◆ cleanup()

def tesstrain_utils.cleanup(ctx)

Definition at line 288 of file tesstrain_utils.py.

def cleanup(ctx):
    shutil.copy(ctx.log_file, ctx.output_dir)
    shutil.rmtree(ctx.training_dir)
    return


# Function initializes font config with a unique font cache dir.

◆ err_exit()

def tesstrain_utils.err_exit(msg)

Definition at line 70 of file tesstrain_utils.py.

def err_exit(msg):
    log.critical(msg)
    sys.exit(1)


# Helper function to run a command and append its output to a log. Aborts early
# if the program file is not found.
# Usage: run_command CMD ARG1 ARG2...

◆ generate_font_image()

def tesstrain_utils.generate_font_image(ctx, font, exposure, char_spacing)

Definition at line 319 of file tesstrain_utils.py.

def generate_font_image(ctx, font, exposure, char_spacing):
    log.info(f"Rendering using {font}")
    fontname = make_fontname(font)
    outbase = make_outbase(ctx, fontname, exposure)

    common_args = [
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--strip_unrenderable_words",
        f"--leading={ctx.leading}",
        f"--char_spacing={char_spacing}",
        f"--exposure={exposure}",
        f"--outputbase={outbase}",
        f"--max_pages={ctx.max_pages}",
    ]

    if ctx.distort_image:
        common_args.append("--distort_image")

    # add --writing_mode=vertical-upright to common_args if the font is
    # specified to be rendered vertically.
    if font in VERTICAL_FONTS:
        common_args.append("--writing_mode=vertical-upright")

    run_command(
        "text2image",
        *common_args,
        f"--font={font}",
        f"--text={ctx.training_text}",
        *ctx.text2image_extra_args,
    )

    check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")

    if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
        log.info(f"Extracting font properties of {font}")
        run_command(
            "text2image",
            *common_args,
            f"--font={font}",
            f"--ligatures=false",
            f"--text={ctx.train_ngrams_file}",
            f"--only_extract_font_properties",
            f"--ptsize=32",
        )
        check_file_readable(str(outbase) + ".fontinfo")
    return f"{font}-{exposure}"


# Phase I : Generate (I)mages from training text for each font.
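Illustrative direct call (the font name and parameter values are assumptions; in the normal flow phase_I_generate_image submits this function to a thread pool):

    # Renders ctx.training_text with text2image for one font/exposure pair
    # and returns an identifier such as "Arial-0".
    generate_font_image(ctx, font="Arial", exposure=0, char_spacing=0.0)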

◆ initialize_fontconfig()

def tesstrain_utils.initialize_fontconfig(ctx)

Definition at line 295 of file tesstrain_utils.py.

def initialize_fontconfig(ctx):
    sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
    pathlib.Path(sample_path).write_text("Text\n")
    log.info(f"Testing font: {ctx.fonts[0]}")
    run_command(
        "text2image",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--font={ctx.fonts[0]}",
        f"--outputbase={sample_path}",
        f"--text={sample_path}",
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
    )

◆ make_fontname()

def tesstrain_utils.make_fontname(font)

Definition at line 309 of file tesstrain_utils.py.

def make_fontname(font):
    return font.replace(" ", "_").replace(",", "")
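For example, spaces become underscores and commas are dropped:

    make_fontname("Arial Bold Italic")        # -> "Arial_Bold_Italic"
    make_fontname("Noto Sans CJK JP, Bold")   # -> "Noto_Sans_CJK_JP_Bold"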

◆ make_lstmdata()

def tesstrain_utils.make_lstmdata(ctx)

Definition at line 653 of file tesstrain_utils.py.

def make_lstmdata(ctx):
    log.info("=== Constructing LSTM training data ===")
    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
    path_output = pathlib.Path(ctx.output_dir)
    if not path_output.is_dir():
        log.info(f"Creating new directory {ctx.output_dir}")
        path_output.mkdir(exist_ok=True, parents=True)

    args = []
    if ctx.lang_is_rtl:
        args.append("--lang_is_rtl")
    if ctx.norm_mode >= 2:
        args.append("--pass_through_recoder")

    # Build the starter traineddata from the inputs.
    run_command(
        "combine_lang_model",
        "--input_unicharset",
        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
        "--script_dir",
        f"{ctx.langdata_dir}",
        "--words",
        f"{lang_prefix}.wordlist",
        "--numbers",
        f"{lang_prefix}.numbers",
        "--puncs",
        f"{lang_prefix}.punc",
        "--output_dir",
        f"{ctx.output_dir}",
        "--lang",
        f"{ctx.lang_code}",
        *args,
    )

    def get_file_list():
        training_path = pathlib.Path(ctx.training_dir)
        if ctx.save_box_tiff:
            log.info("=== Saving box/tiff pairs for training data ===")
            yield from training_path.glob(f"{ctx.lang_code}*.box")
            yield from training_path.glob(f"{ctx.lang_code}*.tif")
        log.info("=== Moving lstmf files for training data ===")
        yield from training_path.glob(f"{ctx.lang_code}.*.lstmf")

    for f in get_file_list():
        log.debug(f"Moving {f} to {path_output / f.name}")
        shutil.move(str(f), path_output / f.name)

    lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
    dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
    pathlib.Path(lstm_list).write_text("\n".join(dir_listing))


# make__traineddata() {
#     tlog "\n=== Making final traineddata file ==="
#     local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}
#
#     # Combine available files for this language from the langdata dir.
#     if [[ -r ${lang_prefix}.config ]]; then
#         tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
#         cp ${lang_prefix}.config ${TRAINING_DIR}
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
#     fi
#     if [[ -r ${lang_prefix}.params-model ]]; then
#         tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
#         cp ${lang_prefix}.params-model ${TRAINING_DIR}
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
#     fi
#
#     # Compose the traineddata file.
#     run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
#
#     # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
#     if [[ ! -d ${OUTPUT_DIR} ]]; then
#         tlog "Creating new directory ${OUTPUT_DIR}"
#         mkdir -p ${OUTPUT_DIR}
#     fi
#     local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
#     if [[ -f ${destfile} ]] && ((! OVERWRITE)); then
#         err_exit "File ${destfile} exists and no --overwrite specified";
#     fi
#     tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
#     cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
# }
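Hedged sketch of the final step of a run, assuming phase_E_extract_features has already produced the .lstmf files in ctx.training_dir:

    make_lstmdata(ctx)
    # The starter traineddata and the moved .lstmf files now live in ctx.output_dir.
    check_file_readable(f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt")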

◆ make_outbase()

def tesstrain_utils.make_outbase(ctx, fontname, exposure)

Definition at line 313 of file tesstrain_utils.py.

def make_outbase(ctx, fontname, exposure):
    return pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"


# Helper function for phaseI_generate_image. Generates the image for a single
# language/font combination in a way that can be run in parallel.
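Example (the language code and font name are placeholders):

    # -> <ctx.training_dir>/eng.Arial_Bold.exp0
    make_outbase(ctx, "Arial_Bold", 0)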

◆ parse_flags()

def tesstrain_utils.parse_flags(argv=None)

Definition at line 216 of file tesstrain_utils.py.

def parse_flags(argv=None):
    ctx = TrainingArgs()
    log.debug(ctx)
    parser.parse_args(args=argv, namespace=ctx)
    log.debug(ctx)

    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")
    if not ctx.tessdata_dir:
        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
        if not tessdata_prefix:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )
        else:
            ctx.tessdata_dir = tessdata_prefix
    if not ctx.output_dir:
        ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Location where intermediate files will be created.
    if not ctx.tmp_dir:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}")
    else:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir)
    # Location of log file for the whole run.
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # On successful exit we will delete this first; on failure we want to let the user
        # know where the log is
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    atexit.register(show_tmpdir_location, ctx.training_dir)

    # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
    if not ctx.training_text:
        ctx.training_text = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
        )
    if not ctx.wordlist_file:
        ctx.wordlist_file = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
        )

    ctx.word_bigrams_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
    )
    ctx.numbers_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
    )
    ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
    ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.bigram_freqs"
    )
    ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.unigram_freqs"
    )
    ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.train_ngrams"
    )
    ctx.generate_dawgs = 1
    log.debug(ctx)
    return ctx
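A hedged usage sketch; the flag values are placeholders, and the full flag set is defined by the module-level parser documented under Variables below:

    ctx = parse_flags([
        "--lang", "eng",
        "--langdata_dir", "/path/to/langdata",
        "--tessdata_dir", "/path/to/tessdata",
    ])
    # ctx now carries the derived paths used by the later phases, e.g.
    # ctx.training_dir, ctx.log_file, ctx.training_text and ctx.wordlist_file.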

◆ phase_E_extract_features()

def tesstrain_utils.phase_E_extract_features(ctx, box_config, ext)

Definition at line 525 of file tesstrain_utils.py.

def phase_E_extract_features(ctx, box_config, ext):
    log.info(f"=== Phase E: Generating {ext} files ===")

    img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
    log.debug(img_files)

    # Use any available language-specific configs.
    config = ""
    testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
    if testconfig.exists():
        config = testconfig
        log.info(f"Using {ctx.lang_code}.config")

    tessdata_environ = os.environ.copy()
    tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)

    log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")

    with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
        max_workers=2
    ) as executor:
        futures = []
        for img_file in img_files:
            future = executor.submit(
                run_command,
                "tesseract",
                img_file,
                pathlib.Path(img_file).with_suffix(""),
                *box_config,
                config,
                env=tessdata_environ,
            )
            futures.append(future)

        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                err_exit("Failed while extracting features: " + str(exc))
            else:
                pbar.update(1)
    # Check that all the output files were produced.
    for img_file in img_files:
        check_file_readable(pathlib.Path(img_file.with_suffix("." + ext)))

    return


# # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
# # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
# phase_C_cluster_prototypes() {
#     tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
#     local out_normproto=$1
#
#     run_command cntraining -D "${TRAINING_DIR}/" \
#         $(ls ${TRAINING_DIR}/*.tr)
#
#     check_file_readable ${TRAINING_DIR}/normproto
#     mv ${TRAINING_DIR}/normproto ${out_normproto}
# }

# # Phase S : (S)hape clustering
# phase_S_cluster_shapes() {
#     if ((! RUN_SHAPE_CLUSTERING)); then
#         tlog "\n=== Shape Clustering disabled ==="
#         return
#     fi
#     check_file_readable {ctx.langdata_dir}/font_properties
#     local font_props="-F {ctx.langdata_dir}/font_properties"
#     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
#        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
#         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
#     fi
#
#     run_command shapeclustering \
#         -D "${TRAINING_DIR}/" \
#         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
#         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
#         ${font_props} \
#         $(ls ${TRAINING_DIR}/*.tr)
#     check_file_readable ${TRAINING_DIR}/shapetable \
#         ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
# }

# # Phase M : Clustering microfeatures (mfTraining)
# phase_M_cluster_microfeatures() {
#     tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
#
#     check_file_readable {ctx.langdata_dir}/font_properties
#     font_props="-F {ctx.langdata_dir}/font_properties"
#     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
#        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
#         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
#     fi
#
#     run_command mftraining \
#         -D "${TRAINING_DIR}/" \
#         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
#         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
#         ${font_props} \
#         $(ls ${TRAINING_DIR}/*.tr)
#     check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
#         ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
#     mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
#     mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
#     mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
#     mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
# }

# phase_B_generate_ambiguities() {
#     tlog "\n=== Phase B : ambiguities training ==="
#
#     # Check for manually created ambiguities data.
#     if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
#         tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
#         cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
#             ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
#         # Make it writable, as it may be read-only in the client.
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
#         return
#     else
#         tlog "No unicharambigs file found!"
#     fi
#
#     # TODO: Add support for generating ambiguities automatically.
# }
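Hedged usage sketch; the box_config and ext values below mirror what an LSTM training driver would typically pass and are assumptions, not defined by this module:

    # Runs tesseract over every rendered *.exp*.tif to produce one .lstmf file per page.
    phase_E_extract_features(ctx, box_config=["lstm.train"], ext="lstmf")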

◆ phase_I_generate_image()

def tesstrain_utils.phase_I_generate_image(ctx, par_factor=None)

Definition at line 369 of file tesstrain_utils.py.

def phase_I_generate_image(ctx, par_factor=None):
    if not par_factor or par_factor <= 0:
        par_factor = 1

    log.info("=== Phase I: Generating training images ===")
    check_file_readable(ctx.training_text)
    char_spacing = 0.0

    for exposure in ctx.exposures:
        if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
            # Parse .bigram_freqs file and compose a .train_ngrams file with text
            # for tesseract to recognize during training. Take only the ngrams whose
            # combined weight accounts for 95% of all the bigrams in the language.
            lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
            records = (line.split() for line in lines)
            p = 0.99
            ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)

            with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                cumsum = 0
                for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                    if cumsum > ngram_frac:
                        break
                    f.write(bigram + " ")
                    cumsum += count

            check_file_readable(ctx.train_ngrams_file)

        with tqdm(
            total=len(ctx.fonts)
        ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor:
            futures = [
                executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                for font in ctx.fonts
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    err_exit("Failed while generating images " + str(exc))
                else:
                    pbar.update(1)

        # Check that each process was successful.
        for font in ctx.fonts:
            fontname = make_fontname(font)
            outbase = make_outbase(ctx, fontname, exposure)
            check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
    return


# Phase UP : Generate (U)nicharset and (P)roperties file.
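Typical invocation (the parallelism factor is illustrative; None or a value <= 0 falls back to a single worker):

    phase_I_generate_image(ctx, par_factor=8)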

◆ phase_UP_generate_unicharset()

def tesstrain_utils.phase_UP_generate_unicharset(ctx)

Definition at line 421 of file tesstrain_utils.py.

def phase_UP_generate_unicharset(ctx):
    log.info("=== Phase UP: Generating unicharset and unichar properties files ===")

    box_files = pathlib.Path(ctx.training_dir).glob("*.box")

    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"

    run_command(
        "unicharset_extractor",
        "--output_unicharset",
        f"{ctx.unicharset_file}",
        "--norm_mode",
        f"{ctx.norm_mode}",
        *box_files,
    )
    check_file_readable(ctx.unicharset_file)

    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
    run_command(
        "set_unicharset_properties",
        "-U",
        f"{ctx.unicharset_file}",
        "-O",
        f"{ctx.unicharset_file}",
        "-X",
        f"{ctx.xheights_file}",
        f"--script_dir={ctx.langdata_dir}",
    )
    check_file_readable(ctx.xheights_file)


# # Phase D : Generate (D)awg files from unicharset file and wordlist files
# phase_D_generate_dawg() {
#     tlog "\n=== Phase D: Generating Dawg files ==="
#
#     # Skip if requested
#     if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
#         tlog "Skipping ${phase_name}"
#         return
#     fi
#
#     # Output files
#     WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
#     FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
#     PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
#     NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
#     BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
#
#     # Word DAWG
#     local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
#     if [[ -s ${WORDLIST_FILE} ]]; then
#         tlog "Generating word Dawg"
#         check_file_readable ${unicharset_file}
#         run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
#             ${UNICHARSET_FILE}
#         check_file_readable ${WORD_DAWG}
#
#         FREQ_DAWG_SIZE=100
#         head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
#     fi
#
#     # Freq-word DAWG
#     if [[ -s ${freq_wordlist_file} ]]; then
#         check_file_readable ${UNICHARSET_FILE}
#         tlog "Generating frequent-word Dawg"
#         run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
#             ${FREQ_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${FREQ_DAWG}
#     fi
#
#     # Punctuation DAWG
#     # -r arguments to wordlist2dawg denote RTL reverse policy
#     # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
#     # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
#     # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
#     # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
#     local punc_reverse_policy=0;
#     if [[ "${LANG_IS_RTL}" == "1" ]]; then
#         punc_reverse_policy=2
#     fi
#     if [[ ! -s ${PUNC_FILE} ]]; then
#         PUNC_FILE="{ctx.langdata_dir}/common.punc"
#     fi
#     check_file_readable ${PUNC_FILE}
#     run_command wordlist2dawg -r ${punc_reverse_policy} \
#         ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
#     check_file_readable ${PUNC_DAWG}
#
#     # Numbers DAWG
#     if [[ -s ${NUMBERS_FILE} ]]; then
#         run_command wordlist2dawg -r 0 \
#             ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${NUMBER_DAWG}
#     fi
#
#     # Bigram dawg
#     if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
#         run_command wordlist2dawg -r 1 \
#             ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${BIGRAM_DAWG}
#     fi
# }

# Phase E : (E)xtract .tr feature files from .tif/.box files
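Usage sketch, assuming Phase I has already written the .box files into ctx.training_dir:

    phase_UP_generate_unicharset(ctx)
    check_file_readable(ctx.unicharset_file, ctx.xheights_file)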

◆ run_command()

def tesstrain_utils.run_command(cmd, *args, env=None)

Definition at line 78 of file tesstrain_utils.py.

def run_command(cmd, *args, env=None):
    for d in ("", "api/", "training/"):
        testcmd = shutil.which(f"{d}{cmd}")
        if shutil.which(testcmd):
            cmd = testcmd
            break
    if not shutil.which(cmd):
        err_exit(f"{cmd} not found")

    log.debug(f"Running {cmd}")
    args = list(args)
    for idx, arg in enumerate(args):
        log.debug(arg)
        # Workaround for https://bugs.python.org/issue33617
        # TypeError: argument of type 'WindowsPath' is not iterable
        if isinstance(arg, pathlib.WindowsPath):
            args[idx] = str(arg)

    proc = subprocess.run(
        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    proclog = logging.getLogger(cmd)
    if proc.returncode == 0:
        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
    else:
        try:
            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
        except Exception as e:
            proclog.error(e)
        err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")


# Check if all the given files exist, or exit otherwise.
# Used to check required input files and produced output files in each phase.
# Usage: check_file_readable FILE1 FILE2...
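A usage sketch mirroring the "Usage: run_command CMD ARG1 ARG2..." comment above; the paths shown are placeholders:

    # Locates text2image (also trying the api/ and training/ prefixes), runs it,
    # logs its combined stdout/stderr, and aborts via err_exit on a non-zero exit.
    run_command(
        "text2image",
        "--text=/path/to/eng.training_text",
        "--outputbase=/tmp/eng.Arial.exp0",
    )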

Variable Documentation

◆ action

tesstrain_utils.action

Definition at line 159 of file tesstrain_utils.py.

◆ dest

tesstrain_utils.dest

Definition at line 139 of file tesstrain_utils.py.

◆ help

tesstrain_utils.help

Definition at line 142 of file tesstrain_utils.py.

◆ inputdata_group

tesstrain_utils.inputdata_group
Initial value:
= parser.add_argument_group(
    "inputdata",
    "OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.",
)

Definition at line 173 of file tesstrain_utils.py.

◆ int

tesstrain_utils.int

Definition at line 154 of file tesstrain_utils.py.

◆ log

tesstrain_utils.log = logging.getLogger(__name__)

Definition at line 35 of file tesstrain_utils.py.

◆ metavar

tesstrain_utils.metavar

Definition at line 147 of file tesstrain_utils.py.

◆ nargs

tesstrain_utils.nargs

Definition at line 140 of file tesstrain_utils.py.

◆ parser

tesstrain_utils.parser
Initial value:
= argparse.ArgumentParser(
    epilog=
)

Definition at line 129 of file tesstrain_utils.py.

◆ tessdata_group

tesstrain_utils.tessdata_group
Initial value:
= parser.add_argument_group(
    "tessdata",
    "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.",
)

Definition at line 196 of file tesstrain_utils.py.

◆ type

tesstrain_utils.type

Definition at line 141 of file tesstrain_utils.py.