def parse_flags(argv=None):
    """Parse command-line flags and fill in the settings derived from them.

    A fresh ``TrainingArgs`` object is used as the argparse namespace, then
    required options are validated and every unset path option is given a
    default.

    Args:
        argv: Argument list handed to ``parser.parse_args``; ``None`` is
            passed through unchanged.

    Returns:
        The populated ``TrainingArgs`` instance.

    Side effects:
        Creates a training directory (and an output directory when none was
        given) with ``mkdtemp`` and registers an ``atexit`` hook that reports
        the training directory if it still exists at interpreter exit.
    """
    ctx = TrainingArgs()
    log.debug(ctx)
    parser.parse_args(args=argv, namespace=ctx)
    log.debug(ctx)

    # Required options. NOTE(review): err_exit presumably terminates the
    # process; confirm against its definition.
    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")

    # The tessdata location may come from the environment instead of a flag.
    if not ctx.tessdata_dir:
        env_tessdata = os.environ.get("TESSDATA_PREFIX", "")
        if env_tessdata:
            ctx.tessdata_dir = env_tessdata
        else:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )

    # Shared prefix for the temporary directories created below.
    run_tag = f"{ctx.lang_code}-{ctx.timestamp}"

    if not ctx.output_dir:
        ctx.output_dir = mkdtemp(prefix=f"trained-{run_tag}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Training directory: placed under --tmp_dir when given, otherwise in the
    # default temporary location (dir=None is mkdtemp's default).
    ctx.training_dir = mkdtemp(prefix=run_tag, dir=ctx.tmp_dir or None)

    # The log file for the run lives inside the training directory.
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # Only report the directory if it is still present at exit.
        # NOTE(review): presumably it is removed elsewhere on success, so
        # this message points at the leftovers of a failed run; confirm.
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    atexit.register(show_tmpdir_location, ctx.training_dir)

    # Per-language input files all live in <langdata_dir>/<lang_code>/.
    lang = ctx.lang_code
    lang_dir = pathlib.Path(ctx.langdata_dir) / lang

    # Training text and wordlist default to the langdata copies when they
    # were not supplied on the command line.
    if not ctx.training_text:
        ctx.training_text = lang_dir / f"{lang}.training_text"
    if not ctx.wordlist_file:
        ctx.wordlist_file = lang_dir / f"{lang}.wordlist"

    ctx.word_bigrams_file = lang_dir / f"{lang}.word.bigrams"
    ctx.numbers_file = lang_dir / f"{lang}.numbers"
    ctx.punc_file = lang_dir / f"{lang}.punc"

    # Frequency/ngram files sit next to the training text. with_suffix()
    # swaps the final extension, so "<x>.training_text" becomes
    # "<x>.training_text.<kind>".
    # NOTE(review): a custom training text with a different extension (e.g.
    # "foo.txt") has that extension replaced rather than appended to; confirm
    # this is intended.
    text_path = pathlib.Path(ctx.training_text)
    ctx.bigram_freqs_file = text_path.with_suffix(".training_text.bigram_freqs")
    ctx.unigram_freqs_file = text_path.with_suffix(".training_text.unigram_freqs")
    ctx.train_ngrams_file = text_path.with_suffix(".training_text.train_ngrams")

    ctx.generate_dawgs = 1
    log.debug(ctx)
    return ctx
286
287