tesseract 4.1.1
tesstrain_utils.py
# (C) Copyright 2014, Google Inc.
# (C) Copyright 2018, James R Barlow
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# For a detailed description of the phases, see
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
#

import argparse
import atexit
import concurrent.futures
import logging
import os
import pathlib
import platform
import shutil
import subprocess
import sys
from datetime import date
from operator import itemgetter
from tempfile import TemporaryDirectory, mkdtemp

from tqdm import tqdm

from language_specific import VERTICAL_FONTS

log = logging.getLogger(__name__)

class TrainingArgs(argparse.Namespace):
    def __init__(self):
        super(TrainingArgs, self).__init__()
        self.uname = platform.uname().system.lower()
        self.lang_code = "eng"
        self.timestamp = str(date.today())

        self._font_config_cache = TemporaryDirectory(prefix="font_tmp")
        self.font_config_cache = self._font_config_cache.name
        self.fonts_dir = (
            "/Library/Fonts/" if "darwin" in self.uname else "/usr/share/fonts/"
        )

        self.max_pages = 0
        self.save_box_tiff = False
        self.overwrite = False
        self.linedata = False
        self.run_shape_clustering = False
        self.extract_font_properties = True
        self.distort_image = False

    def __eq__(self, other):
        return (
            argparse.Namespace.__eq__(self, other)
            and self.uname == other.uname
            and self.lang_code == other.lang_code
            and self.timestamp == other.timestamp
            and self.font_config_cache == other.font_config_cache
            and self.fonts_dir == other.fonts_dir
            and self.max_pages == other.max_pages
            and self.save_box_tiff == other.save_box_tiff
            and self.overwrite == other.overwrite
            and self.linedata == other.linedata
            and self.run_shape_clustering == other.run_shape_clustering
            and self.extract_font_properties == other.extract_font_properties
            and self.distort_image == other.distort_image
        )


def err_exit(msg):
    log.critical(msg)
    sys.exit(1)

# Helper function to run a command and append its output to a log. Aborts early
# if the program file is not found.
# Usage: run_command(cmd, arg1, arg2, ...)
def run_command(cmd, *args, env=None):
    for d in ("", "api/", "training/"):
        testcmd = shutil.which(f"{d}{cmd}")
        if testcmd is not None:
            cmd = testcmd
            break
    if not shutil.which(cmd):
        err_exit(f"{cmd} not found")

    log.debug(f"Running {cmd}")
    args = list(args)
    for idx, arg in enumerate(args):
        log.debug(arg)
        # Workaround for https://bugs.python.org/issue33617
        # TypeError: argument of type 'WindowsPath' is not iterable
        if isinstance(arg, pathlib.WindowsPath):
            args[idx] = str(arg)

    proc = subprocess.run(
        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    proclog = logging.getLogger(cmd)
    if proc.returncode == 0:
        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
    else:
        try:
            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
        except Exception as e:
            proclog.error(e)
        err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")

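
# For illustration, a hypothetical invocation: the executable is located via
# shutil.which() (also trying the "api/" and "training/" build subdirectories),
# its output is captured into a logger named after the command, and a nonzero
# exit status aborts the whole run.
#
#   run_command("text2image", "--list_available_fonts",
#               "--fonts_dir=/usr/share/fonts/")
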
# Check if all the given files exist, or exit otherwise.
# Used to check required input files and produced output files in each phase.
# Usage: check_file_readable(file1, file2, ...)
def check_file_readable(*filenames):
    if isinstance(filenames, (str, pathlib.Path)):
        filenames = [filenames]
    for filename in filenames:
        try:
            with pathlib.Path(filename).open():
                pass
        except FileNotFoundError:
            err_exit(f"Required/expected file '{filename}' does not exist")
        except PermissionError:
            err_exit(f"{filename} is not readable")
        except IOError as e:
            err_exit(f"{filename} IO Error: {str(e)}")
    return True

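
# For illustration (hypothetical paths): each argument is opened once to prove
# it exists and is readable; on success the call simply returns True.
#
#   check_file_readable("/tmp/eng-2020-01-01/eng.Arial.exp0.box",
#                       "/tmp/eng-2020-01-01/eng.Arial.exp0.tif")
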
parser = argparse.ArgumentParser(
    epilog="""
    The font names specified in --fontlist need to be recognizable by Pango using
    fontconfig. An easy way to list the canonical names of all fonts available on
    your system is to run text2image with --list_available_fonts and the
    appropriate --fonts_dir path.
    """
)
parser.add_argument(
    "--fontlist",
    dest="fonts",
    nargs="+",
    type=str,
    help="A list of font names to train on.",
)
parser.add_argument("--fonts_dir", help="Path to font files.")
parser.add_argument("--tmp_dir", help="Path to temporary training directory.")
parser.add_argument(
    "--lang", metavar="LANG_CODE", dest="lang_code", help="ISO 639 code."
)
parser.add_argument(
    "--langdata_dir",
    metavar="DATADIR",
    help="Path to tesseract/training/langdata directory.",
)
parser.add_argument("--maxpages", type=int, dest="max_pages")
parser.add_argument(
    "--output_dir", metavar="OUTPUTDIR", help="Location of output traineddata file."
)
parser.add_argument(
    "--overwrite", action="store_true", help="Safe to overwrite files in output_dir."
)
parser.add_argument(
    "--save_box_tiff",
    action="store_true",
    help="Save box/tiff pairs along with lstmf files.",
)
parser.add_argument(
    "--linedata_only",
    dest="linedata",
    action="store_true",
    help="Only generate training data for lstmtraining.",
)

inputdata_group = parser.add_argument_group(
    "inputdata",
    "OPTIONAL flags for input data. If unspecified we will look for them in "
    "the langdata_dir directory.",
)
inputdata_group.add_argument(
    "--training_text", metavar="TEXTFILE", help="Text to render and use for training."
)
inputdata_group.add_argument(
    "--wordlist",
    dest="wordlist_file",
    metavar="WORDFILE",
    help="Word list for the language ordered by decreasing frequency.",
)

parser.add_argument("--extract_font_properties", action="store_true")
parser.add_argument(
    "--noextract_font_properties", dest="extract_font_properties", action="store_false"
)

parser.add_argument("--distort_image", dest="distort_image", action="store_true")

tessdata_group = parser.add_argument_group(
    "tessdata",
    "OPTIONAL flag to specify location of existing traineddata files, required "
    "during feature extraction. If unspecified will use TESSDATA_PREFIX defined "
    "in the current environment.",
)
tessdata_group.add_argument(
    "--tessdata_dir",
    metavar="TESSDATADIR",
    help="Path to tesseract/tessdata directory.",
)

parser.add_argument(
    "--exposures",
    metavar="EXPOSURES",
    action="append",
    nargs="+",
    help="A list of exposure levels to use (e.g. -1 0 1).",
)

# Does simple command-line parsing and initialization.
def parse_flags(argv=None):
    ctx = TrainingArgs()
    log.debug(ctx)
    parser.parse_args(args=argv, namespace=ctx)
    log.debug(ctx)

    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")
    if not ctx.tessdata_dir:
        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
        if not tessdata_prefix:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )
        else:
            ctx.tessdata_dir = tessdata_prefix
    if not ctx.output_dir:
        ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Location where intermediate files will be created.
    if not ctx.tmp_dir:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}")
    else:
        ctx.training_dir = mkdtemp(
            prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir
        )
    # Location of log file for the whole run.
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # On successful exit we will delete this first; on failure we want to
        # let the user know where the log is.
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    atexit.register(show_tmpdir_location, ctx.training_dir)

    # Take training text and wordlist from the langdata directory if not
    # specified on the command line.
    if not ctx.training_text:
        ctx.training_text = (
            pathlib.Path(ctx.langdata_dir)
            / ctx.lang_code
            / f"{ctx.lang_code}.training_text"
        )
    if not ctx.wordlist_file:
        ctx.wordlist_file = (
            pathlib.Path(ctx.langdata_dir)
            / ctx.lang_code
            / f"{ctx.lang_code}.wordlist"
        )

    ctx.word_bigrams_file = (
        pathlib.Path(ctx.langdata_dir)
        / ctx.lang_code
        / f"{ctx.lang_code}.word.bigrams"
    )
    ctx.numbers_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
    )
    ctx.punc_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
    )
    ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.bigram_freqs"
    )
    ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.unigram_freqs"
    )
    ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.train_ngrams"
    )
    ctx.generate_dawgs = 1
    log.debug(ctx)
    return ctx

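
# For illustration, a minimal invocation (hypothetical paths). --lang,
# --langdata_dir, and --tessdata_dir (or a TESSDATA_PREFIX environment
# variable) are the required inputs; everything else is derived from the
# langdata naming conventions above.
#
#   ctx = parse_flags([
#       "--lang", "eng",
#       "--langdata_dir", "/path/to/langdata",
#       "--tessdata_dir", "/path/to/tessdata",
#       "--fontlist", "Arial",
#   ])
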
def cleanup(ctx):
    shutil.copy(ctx.log_file, ctx.output_dir)
    shutil.rmtree(ctx.training_dir)
    return

# Initializes fontconfig with a unique font cache dir.
def initialize_fontconfig(ctx):
    sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
    pathlib.Path(sample_path).write_text("Text\n")
    log.info(f"Testing font: {ctx.fonts[0]}")
    run_command(
        "text2image",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--font={ctx.fonts[0]}",
        f"--outputbase={sample_path}",
        f"--text={sample_path}",
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
    )

def make_fontname(font):
    # Sanitize the font name for use in file names.
    return font.replace(" ", "_").replace(",", "")

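
# For illustration: make_fontname("Arial Bold Italic") returns
# "Arial_Bold_Italic", so the rendered files for that font/exposure pair end
# up as e.g. eng.Arial_Bold_Italic.exp0.tif under the training directory.
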
def make_outbase(ctx, fontname, exposure):
    return pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"

# Helper function for phase_I_generate_image. Generates the image for a single
# language/font combination in a way that can be run in parallel.
def generate_font_image(ctx, font, exposure, char_spacing):
    log.info(f"Rendering using {font}")
    fontname = make_fontname(font)
    outbase = make_outbase(ctx, fontname, exposure)

    common_args = [
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
        f"--fonts_dir={ctx.fonts_dir}",
        "--strip_unrenderable_words",
        f"--leading={ctx.leading}",
        f"--char_spacing={char_spacing}",
        f"--exposure={exposure}",
        f"--outputbase={outbase}",
        f"--max_pages={ctx.max_pages}",
    ]

    if ctx.distort_image:
        common_args.append("--distort_image")

    # Add --writing_mode=vertical-upright to common_args if the font is
    # specified to be rendered vertically.
    if font in VERTICAL_FONTS:
        common_args.append("--writing_mode=vertical-upright")

    run_command(
        "text2image",
        *common_args,
        f"--font={font}",
        f"--text={ctx.training_text}",
        *ctx.text2image_extra_args,
    )

    check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")

    if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
        log.info(f"Extracting font properties of {font}")
        run_command(
            "text2image",
            *common_args,
            f"--font={font}",
            "--ligatures=false",
            f"--text={ctx.train_ngrams_file}",
            "--only_extract_font_properties",
            "--ptsize=32",
        )
        check_file_readable(str(outbase) + ".fontinfo")
    return f"{font}-{exposure}"

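
# For illustration: with font "Arial", exposure 0, and lang_code "eng", the
# text2image call above renders the training text into eng.Arial.exp0.tif with
# a matching eng.Arial.exp0.box file of character bounding boxes, which the
# check_file_readable() call then verifies.
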
# Phase I : Generate (I)mages from training text for each font.
def phase_I_generate_image(ctx, par_factor=None):
    if not par_factor or par_factor <= 0:
        par_factor = 1

    log.info("=== Phase I: Generating training images ===")
    check_file_readable(ctx.training_text)
    char_spacing = 0.0

    for exposure in ctx.exposures:
        if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
            # Parse the .bigram_freqs file and compose a .train_ngrams file
            # with text for tesseract to recognize during training. Take only
            # the bigrams whose combined weight accounts for 99% of all the
            # bigrams in the language.
            lines = (
                pathlib.Path(ctx.bigram_freqs_file)
                .read_text(encoding="utf-8")
                .split("\n")
            )
            # Materialize the records as (bigram, count) pairs: a generator
            # would be exhausted by the sum() below, and the counts must be
            # ints so that sorting and accumulation are numeric.
            records = [
                (fields[0], int(fields[1]))
                for fields in (line.split() for line in lines)
                if len(fields) >= 2
            ]
            p = 0.99
            ngram_frac = p * sum(count for _, count in records)

            with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                cumsum = 0
                for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                    if cumsum > ngram_frac:
                        break
                    f.write(bigram + " ")
                    cumsum += count

            check_file_readable(ctx.train_ngrams_file)

        with tqdm(total=len(ctx.fonts)) as pbar, concurrent.futures.ThreadPoolExecutor(
            max_workers=par_factor
        ) as executor:
            futures = [
                executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                for font in ctx.fonts
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    err_exit("Failed while generating images: " + str(exc))
                else:
                    pbar.update(1)

        # Check that each process was successful.
        for font in ctx.fonts:
            fontname = make_fontname(font)
            outbase = make_outbase(ctx, fontname, exposure)
            check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
    return

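
# Worked example of the 99% cutoff above (made-up counts): for bigram counts
# {"th": 50, "he": 30, "in": 20}, ngram_frac = 0.99 * 100 = 99.0. The loop
# writes "th" (cumsum 50), "he" (cumsum 80), and "in" (cumsum 100), and would
# skip any further bigram because cumsum now exceeds ngram_frac.
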
# Phase UP : Generate (U)nicharset and (P)roperties file.
def phase_UP_generate_unicharset(ctx):
    log.info("=== Phase UP: Generating unicharset and unichar properties files ===")

    box_files = pathlib.Path(ctx.training_dir).glob("*.box")

    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"

    run_command(
        "unicharset_extractor",
        "--output_unicharset",
        f"{ctx.unicharset_file}",
        "--norm_mode",
        f"{ctx.norm_mode}",
        *box_files,
    )
    check_file_readable(ctx.unicharset_file)

    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
    run_command(
        "set_unicharset_properties",
        "-U",
        f"{ctx.unicharset_file}",
        "-O",
        f"{ctx.unicharset_file}",
        "-X",
        f"{ctx.xheights_file}",
        f"--script_dir={ctx.langdata_dir}",
    )
    check_file_readable(ctx.xheights_file)

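
# Note: ctx.norm_mode is set by the calling script, not in this module; per
# the unicharset_extractor --norm_mode help text it selects the Unicode
# normalization mode (1 = combine graphemes, 2 = split graphemes,
# 3 = pure unicode).
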
# # Phase D : Generate (D)awg files from unicharset file and wordlist files
# phase_D_generate_dawg() {
#     tlog "\n=== Phase D: Generating Dawg files ==="
#
#     # Skip if requested
#     if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
#         tlog "Skipping ${phase_name}"
#         return
#     fi
#
#     # Output files
#     WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
#     FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
#     PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
#     NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
#     BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
#
#     # Word DAWG
#     local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
#     if [[ -s ${WORDLIST_FILE} ]]; then
#         tlog "Generating word Dawg"
#         check_file_readable ${UNICHARSET_FILE}
#         run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
#             ${UNICHARSET_FILE}
#         check_file_readable ${WORD_DAWG}
#
#         FREQ_DAWG_SIZE=100
#         head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
#     fi
#
#     # Freq-word DAWG
#     if [[ -s ${freq_wordlist_file} ]]; then
#         check_file_readable ${UNICHARSET_FILE}
#         tlog "Generating frequent-word Dawg"
#         run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
#             ${FREQ_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${FREQ_DAWG}
#     fi
#
#     # Punctuation DAWG
#     # -r arguments to wordlist2dawg denote RTL reverse policy
#     # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
#     # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
#     # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
#     # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
#     local punc_reverse_policy=0;
#     if [[ "${LANG_IS_RTL}" == "1" ]]; then
#         punc_reverse_policy=2
#     fi
#     if [[ ! -s ${PUNC_FILE} ]]; then
#         PUNC_FILE="{ctx.langdata_dir}/common.punc"
#     fi
#     check_file_readable ${PUNC_FILE}
#     run_command wordlist2dawg -r ${punc_reverse_policy} \
#         ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
#     check_file_readable ${PUNC_DAWG}
#
#     # Numbers DAWG
#     if [[ -s ${NUMBERS_FILE} ]]; then
#         run_command wordlist2dawg -r 0 \
#             ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${NUMBER_DAWG}
#     fi
#
#     # Bigram dawg
#     if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
#         run_command wordlist2dawg -r 1 \
#             ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
#         check_file_readable ${BIGRAM_DAWG}
#     fi
# }

# Phase E : (E)xtract .tr feature files from .tif/.box files
def phase_E_extract_features(ctx, box_config, ext):
    log.info(f"=== Phase E: Generating {ext} files ===")

    img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
    log.debug(img_files)

    # Use any available language-specific configs.
    config = ""
    testconfig = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
    )
    if testconfig.exists():
        config = testconfig
        log.info(f"Using {ctx.lang_code}.config")

    tessdata_environ = os.environ.copy()
    tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)

    log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")

    with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
        max_workers=2
    ) as executor:
        futures = []
        for img_file in img_files:
            future = executor.submit(
                run_command,
                "tesseract",
                img_file,
                pathlib.Path(img_file).with_suffix(""),
                *box_config,
                config,
                env=tessdata_environ,
            )
            futures.append(future)

        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                err_exit("Failed while extracting features: " + str(exc))
            else:
                pbar.update(1)

    # Check that all the output files were produced.
    for img_file in img_files:
        check_file_readable(pathlib.Path(img_file).with_suffix("." + ext))

    return

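
# For illustration: for LSTM training data the expected call is along the
# lines of phase_E_extract_features(ctx, ["lstm.train"], "lstmf"), i.e.
# tesseract is run with the lstm.train config so each eng.<font>.exp<n>.tif
# produces a matching .lstmf feature file (an assumption based on how
# tesstrain.py drives this module).
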
# # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
# # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
# phase_C_cluster_prototypes() {
#     tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
#     local out_normproto=$1
#
#     run_command cntraining -D "${TRAINING_DIR}/" \
#         $(ls ${TRAINING_DIR}/*.tr)
#
#     check_file_readable ${TRAINING_DIR}/normproto
#     mv ${TRAINING_DIR}/normproto ${out_normproto}
# }

# # Phase S : (S)hape clustering
# phase_S_cluster_shapes() {
#     if ((! RUN_SHAPE_CLUSTERING)); then
#         tlog "\n=== Shape Clustering disabled ==="
#         return
#     fi
#     check_file_readable {ctx.langdata_dir}/font_properties
#     local font_props="-F {ctx.langdata_dir}/font_properties"
#     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
#        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
#         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
#     fi
#
#     run_command shapeclustering \
#         -D "${TRAINING_DIR}/" \
#         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
#         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
#         ${font_props} \
#         $(ls ${TRAINING_DIR}/*.tr)
#     check_file_readable ${TRAINING_DIR}/shapetable \
#         ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
# }

# # Phase M : Clustering microfeatures (mfTraining)
# phase_M_cluster_microfeatures() {
#     tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
#
#     check_file_readable {ctx.langdata_dir}/font_properties
#     font_props="-F {ctx.langdata_dir}/font_properties"
#     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
#        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
#         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
#     fi
#
#     run_command mftraining \
#         -D "${TRAINING_DIR}/" \
#         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
#         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
#         ${font_props} \
#         $(ls ${TRAINING_DIR}/*.tr)
#     check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
#         ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
#     mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
#     mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
#     mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
#     mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
# }

# phase_B_generate_ambiguities() {
#     tlog "\n=== Phase B : ambiguities training ==="
#
#     # Check for manually created ambiguities data.
#     if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
#         tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
#         cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
#             ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
#         # Make it writable, as it may be read-only in the client.
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
#         return
#     else
#         tlog "No unicharambigs file found!"
#     fi
#
#     # TODO: Add support for generating ambiguities automatically.
# }

def make_lstmdata(ctx):
    log.info("=== Constructing LSTM training data ===")
    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
    path_output = pathlib.Path(ctx.output_dir)
    if not path_output.is_dir():
        log.info(f"Creating new directory {ctx.output_dir}")
        path_output.mkdir(exist_ok=True, parents=True)

    args = []
    if ctx.lang_is_rtl:
        args.append("--lang_is_rtl")
    if ctx.norm_mode >= 2:
        args.append("--pass_through_recoder")

    # Build the starter traineddata from the inputs.
    run_command(
        "combine_lang_model",
        "--input_unicharset",
        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
        "--script_dir",
        f"{ctx.langdata_dir}",
        "--words",
        f"{lang_prefix}.wordlist",
        "--numbers",
        f"{lang_prefix}.numbers",
        "--puncs",
        f"{lang_prefix}.punc",
        "--output_dir",
        f"{ctx.output_dir}",
        "--lang",
        f"{ctx.lang_code}",
        *args,
    )

    def get_file_list():
        training_path = pathlib.Path(ctx.training_dir)
        if ctx.save_box_tiff:
            log.info("=== Saving box/tiff pairs for training data ===")
            yield from training_path.glob(f"{ctx.lang_code}*.box")
            yield from training_path.glob(f"{ctx.lang_code}*.tif")
        log.info("=== Moving lstmf files for training data ===")
        yield from training_path.glob(f"{ctx.lang_code}.*.lstmf")

    for f in get_file_list():
        log.debug(f"Moving {f} to {path_output / f.name}")
        shutil.move(str(f), path_output / f.name)

    lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
    dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
    pathlib.Path(lstm_list).write_text("\n".join(dir_listing))

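
# For illustration: after make_lstmdata() the output directory holds the
# starter traineddata built by combine_lang_model, the moved .lstmf files, and
# <lang>.training_files.txt listing one .lstmf path per line, which is the
# file normally passed to lstmtraining via --train_listfile.
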
# make__traineddata() {
#     tlog "\n=== Making final traineddata file ==="
#     local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}
#
#     # Combine available files for this language from the langdata dir.
#     if [[ -r ${lang_prefix}.config ]]; then
#         tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
#         cp ${lang_prefix}.config ${TRAINING_DIR}
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
#     fi
#     if [[ -r ${lang_prefix}.params-model ]]; then
#         tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
#         cp ${lang_prefix}.params-model ${TRAINING_DIR}
#         chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
#     fi
#
#     # Compose the traineddata file.
#     run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
#
#     # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
#     if [[ ! -d ${OUTPUT_DIR} ]]; then
#         tlog "Creating new directory ${OUTPUT_DIR}"
#         mkdir -p ${OUTPUT_DIR}
#     fi
#     local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
#     if [[ -f ${destfile} ]] && ((! OVERWRITE)); then
#         err_exit "File ${destfile} exists and no --overwrite specified";
#     fi
#     tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
#     cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
# }